Примеры использования Python-класса Registry

Язык программирования: Python

Пространство имен/Пакет: propnet.core.registry

Класс/Тип: Registry

Примеров на hotexamples.com: 16

Найдено 16 примеров использования Python Registry. Это лучшие примеры кода на Python для propnet.core.registry.Registry, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Registry(30)

clear_all_registries(8)

evaluate(1)

title(1)

Пример #1

Показать файл

Файл: layouts_correlate.py Проект: shyshy903/propnet

    def populate_point_information(prop_x, prop_y, current_func):
        if not (prop_x and prop_y):
            raise PreventUpdate

        prop_x_name = Registry("symbols")[prop_x].display_names[0]
        prop_y_name = Registry("symbols")[prop_y].display_names[0]

        data = list(
            store.query(criteria={
                'property_x': prop_x,
                'property_y': prop_y
            }))

        path_length = data[0]['shortest_path_length']
        if path_length is None:
            path_text = "not connected"
        elif path_length == 0:
            path_text = "properties are the same"
        else:
            path_text = f"separated by {path_length} model"
            if path_length > 1:
                path_text += "s"
        point_text = dcc.Markdown(f"""
##### Point information
**x-axis property:** {prop_x_name}

**y-axis property:** {prop_y_name}

**distance apart on graph:** {path_text}

**number of data points:** {data[0]['n_points']}
""")

        # This ensures we know the ordering of the rows
        correlation_data = {
            d['correlation_func']: {
                'Correlation Function':
                correlation_func_info[d['correlation_func']]["name"],
                'Correlation Value':
                f"{d['correlation']:0.5f}"
            }
            for d in data
        }
        correlation_data = [
            correlation_data[func] for func in correlation_funcs
        ]

        correlation_table = dt.DataTable(
            id='corr-table',
            data=correlation_data,
            columns=[{
                'id': val,
                'name': val
            } for val in ('Correlation Function', 'Correlation Value')],
            editable=False,
            style_data_conditional=[{
                'if': {
                    'row_index': correlation_funcs.index(current_func)
                },
                "backgroundColor": "#3D9970",
                'color': 'white'
            }],
            style_cell={
                'font-family': 'HelveticaNeue',
                'text-align': 'left'
            },
            style_header={
                'fontWeight': 'bold',
                'font-family': 'HelveticaNeue',
                'text-align': 'left'
            })
        link_to_plot = dcc.Link("View the data plot",
                                href=f'/plot?x={prop_x}&y={prop_y}')
        return [point_text, correlation_table, link_to_plot], True

Пример #2

Показать файл

Файл: test_storage.py Проект: shiva1387/propnet

 def tearDownClass(cls):
     """Remove every symbol that is not flagged as built-in from the "symbols" registry."""
     # Snapshot the names first: the registry is mutated while removing.
     custom_names = tuple(
         name for name, symbol in Registry("symbols").items()
         if not symbol.is_builtin)
     for name in custom_names:
         Registry("symbols").pop(name)

Пример #3

Показать файл

Файл: test_storage.py Проект: shiva1387/propnet

    def setUpClass(cls):
        """Populate the class-level fixtures: quantities plus their expected dict forms.

        Calls add_builtin_models_to_registry() first, then stores on ``cls``:
        descriptions of the custom symbols 'A', 'B' and 'C', quantities built
        from them, quantities for the registry symbols 'band_gap' and
        'refractive_index', a weighted-mean quantity, an object-valued
        quantity, and for each of these an "as_dicts" and a "json" variant of
        the expected dictionary.
        """
        add_builtin_models_to_registry()
        # Inspiration was taken from the GraphTest class
        # I tried to construct the dictionaries for comparison
        # without writing out every one explicitly by reusing
        # information where it was applicable.
        # If this is too unreadable, can change to writing it
        # out explicitly in a JSON file and importing it. Would
        # still need to replace some fields dynamically.
        symbols = StorageTest.generate_symbols()

        # Dict descriptions of the custom symbols 'A', 'B' and 'C'. 'C' is then
        # overridden to be an object-category symbol without units or shape.
        cls.custom_syms_as_dicts = {
            k: {'@module': 'propnet.core.symbols',
                '@class': 'Symbol',
                'name': k,
                'display_names': [k],
                'display_symbols': [k],
                'units': (1, ()),
                'shape': 1,
                'object_type': None,
                'comment': None,
                'category': 'property',
                'constraint': None,
                'default_value': None,
                'is_builtin': False} for k in ['A', 'B', 'C']
        }
        cls.custom_syms_as_dicts['C'].update(
            {"units": None,
             "shape": None,
             "object_type": "str",
             "category": "object"})

        # Copy in which the units tuple (1, ()) of 'A' and 'B' is replaced by
        # the list form [1, []] -- presumably its shape after a JSON round trip.
        cls.custom_symbols_json = copy.deepcopy(cls.custom_syms_as_dicts)
        for k in ['A', 'B']:
            cls.custom_symbols_json[k]['units'] = [1, []]

        # Two 'A' quantities, and two 'B' quantities whose provenance names
        # 'model1' with the corresponding 'A' quantity as the only input.
        a = [QuantityFactory.create_quantity(symbols['A'], 19),
             QuantityFactory.create_quantity(symbols['A'], 23)]
        b = [QuantityFactory.create_quantity(symbols['B'], 38,
                                             provenance=ProvenanceElement(model='model1',
                                                                          inputs=[a[0]])),
             QuantityFactory.create_quantity(symbols['B'], 46,
                                             provenance=ProvenanceElement(model='model1',
                                                                          inputs=[a[1]]))]
        cls.quantities_custom_symbol = {"A": a,
                                         "B": b}

        # One StorageQuantity-style dict per custom quantity, keyed by symbol name.
        cls.sq_custom_sym_as_dicts = {
            k: [{'@module': 'propnet.dbtools.storage',
                 '@class': 'StorageQuantity',
                 'internal_id': vv._internal_id,
                 'data_type': 'NumQuantity',
                 'symbol_type': symbols[k],
                 'value': vv.magnitude,
                 'units': 'dimensionless',
                 'provenance': ProvenanceStore.from_provenance_element(vv.provenance),
                 'tags': [],
                 'uncertainty': None} for vv in v] for k, v in cls.quantities_custom_symbol.items()
        }

        # Provenance dicts: 'A' entries have no model and no inputs; each 'B'
        # entry nests the paired 'A' provenance dict inside its single input.
        provenances_json = {
            "A": [{'@module': 'propnet.dbtools.storage',
                   '@class': 'ProvenanceStore',
                   'model': None,
                   'inputs': None,
                   'source': aa.provenance.source} for aa in a]}
        provenances_json['B'] = [
            {'@module': 'propnet.dbtools.storage',
             '@class': 'ProvenanceStore',
             'model': 'model1',
             'inputs': [{'@module': 'propnet.dbtools.storage',
                         '@class': 'ProvenanceStoreQuantity',
                         'data_type': 'NumQuantity',
                         'symbol_type': cls.custom_symbols_json['A'],
                         'internal_id': q.provenance.inputs[0]._internal_id,
                         'tags': [],
                         'provenance': p}],
             'source': q.provenance.source} for q, p in zip(b, provenances_json['A'])]

        # "json" variant: the symbol and provenance objects are swapped for
        # their dict counterparts built above.
        cls.sq_custom_sym_json = copy.deepcopy(cls.sq_custom_sym_as_dicts)
        for sym in ['A', 'B']:
            for q, p in zip(cls.sq_custom_sym_json[sym], provenances_json[sym]):
                q['symbol_type'] = cls.custom_symbols_json[sym]
                q['provenance'] = p

        # Quantities for symbols referenced by name: the refractive indices are
        # obtained by evaluating the registered model on each band gap.
        band_gaps = [QuantityFactory.create_quantity('band_gap', 3.3, 'eV'),
                     QuantityFactory.create_quantity('band_gap', 2.1, 'eV')]

        bg_ri_model = Registry("models")['band_gap_refractive_index_moss']
        refractive_indices = [bg_ri_model.evaluate({"Eg": bg}).pop('refractive_index') for bg in band_gaps]

        cls.quantities_canonical_symbol = {"band_gaps": band_gaps,
                                            "refractive_indices": refractive_indices}

        # Reuse the custom-symbol dicts as templates: rename the keys, overwrite
        # the fields that differ and drop 'value' (the expected values are kept
        # separately in sq_canonical_sym_values).
        cls.sq_canonical_sym_as_dicts_no_value = copy.deepcopy(cls.sq_custom_sym_as_dicts)
        cls.sq_canonical_sym_as_dicts_no_value['band_gaps'] = cls.sq_canonical_sym_as_dicts_no_value.pop('A')
        cls.sq_canonical_sym_as_dicts_no_value['refractive_indices'] = cls.sq_canonical_sym_as_dicts_no_value.pop('B')

        for d, sq in zip(cls.sq_canonical_sym_as_dicts_no_value['band_gaps'], band_gaps):
            d.update({
                "internal_id": sq._internal_id,
                "symbol_type": "band_gap",
                "units": "electron_volt",
                "provenance": ProvenanceStore.from_provenance_element(sq.provenance)
            })
            d.pop('value')

        for d, sq in zip(cls.sq_canonical_sym_as_dicts_no_value['refractive_indices'], refractive_indices):
            d.update({
                "internal_id": sq._internal_id,
                "symbol_type": "refractive_index",
                "units": "dimensionless",
                "provenance": ProvenanceStore.from_provenance_element(sq.provenance)
            })
            d.pop('value')

        cls.sq_canonical_sym_values = {"band_gaps": [3.3, 2.1],
                                        "refractive_indices": [2.316340583741216, 2.593439239956374]}

        provenances_json['band_gaps'] = [
            {'@module': 'propnet.dbtools.storage',
             '@class': 'ProvenanceStore',
             'model': None,
             'inputs': None,
             'source': bg.provenance.source}
            for bg in band_gaps
        ]

        provenances_json['refractive_indices'] = [{
            '@module': 'propnet.dbtools.storage',
            '@class': 'ProvenanceStore',
            'model': 'band_gap_refractive_index_moss',
            'inputs': [{'@module': 'propnet.dbtools.storage',
                        '@class': 'ProvenanceStoreQuantity',
                        'data_type': 'NumQuantity',
                        'symbol_type': 'band_gap',
                        'internal_id': bg._internal_id,
                        'tags': [],
                        'provenance': pj}],
            'source': ri.provenance.source}
            for bg, pj, ri in zip(band_gaps,
                                  provenances_json['band_gaps'],
                                  refractive_indices)
        ]

        cls.sq_canonical_sym_json_no_value = copy.deepcopy(cls.sq_canonical_sym_as_dicts_no_value)

        for sym in ["band_gaps", "refractive_indices"]:
            for q, p in zip(cls.sq_canonical_sym_json_no_value[sym], provenances_json[sym]):
                q['provenance'] = p

        # Weighted mean of the two 'B' quantities. Its expected dict omits
        # 'value' and 'uncertainty'; those are kept in
        # sq_with_uncertainty_numbers below.
        cls.quantity_with_uncertainty = NumQuantity.from_weighted_mean(b)
        cls.sq_with_uncertainty_as_dict_no_numbers = {
            '@module': 'propnet.dbtools.storage',
            '@class': 'StorageQuantity',
            'internal_id': cls.quantity_with_uncertainty._internal_id,
            'data_type': 'NumQuantity',
            'symbol_type': symbols['B'],
            'units': 'dimensionless',
            'provenance': ProvenanceStore.from_provenance_element(
                cls.quantity_with_uncertainty.provenance),
            'tags': []}

        # NOTE: provenances_json is rebound here, from the dict of lists used
        # above to a single provenance dict for the aggregated quantity. The
        # comprehension variable `b` is local to the comprehension and does not
        # rebind the list `b` defined earlier.
        provenances_json = {
            '@module': 'propnet.dbtools.storage',
            '@class': 'ProvenanceStore',
            'model': 'aggregation',
            'inputs': [
                {'@module': 'propnet.dbtools.storage',
                 '@class': 'ProvenanceStoreQuantity',
                 'data_type': 'NumQuantity',
                 'symbol_type': cls.custom_symbols_json['B'],
                 'internal_id': b['internal_id'],
                 'tags': [],
                 'provenance': b['provenance']}
                for b in cls.sq_custom_sym_json['B']],
            'source': cls.quantity_with_uncertainty.provenance.source
        }

        cls.sq_with_uncertainty_json_no_numbers = copy.deepcopy(cls.sq_with_uncertainty_as_dict_no_numbers)
        cls.sq_with_uncertainty_json_no_numbers.update({"symbol_type": cls.custom_symbols_json['B'],
                                                         "provenance": provenances_json})
        cls.sq_with_uncertainty_numbers = {"value": 42.0,
                                            "uncertainty": 4.0}

        # Object-valued quantity for symbol 'C'; its expected dict starts from
        # the first 'A' template and overrides the fields that differ.
        obj_symbol = symbols['C']
        cls.object_quantity = QuantityFactory.create_quantity(obj_symbol, "Test string")
        cls.sq_object_as_dict = copy.deepcopy(cls.sq_custom_sym_as_dicts['A'][0])
        cls.sq_object_as_dict.update({
            "data_type": "ObjQuantity",
            "symbol_type": symbols['C'],
            "internal_id": cls.object_quantity._internal_id,
            "value": "Test string",
            "units": None,
            "provenance": ProvenanceStore.from_provenance_element(cls.object_quantity.provenance)
        })
        cls.sq_object_json = copy.deepcopy(cls.sq_object_as_dict)
        cls.sq_object_json.update(
            {"symbol_type": cls.custom_syms_as_dicts['C'],
             "provenance": {'@module': 'propnet.dbtools.storage',
                            '@class': 'ProvenanceStore',
                            'model': None,
                            'inputs': None,
                            'source': cls.object_quantity.provenance.source}}
        )

        # This setting allows dict differences to be shown in full
        cls.maxDiff = None

Пример #4

Показать файл

import logging

logger = logging.getLogger(__name__)

mpr = MPRester()

try:
    store = loadfn(environ["PROPNET_STORE_FILE"])
    store.connect()
except (ServerSelectionTimeoutError, KeyError):
    from maggma.stores import MemoryStore
    store = MemoryStore()
    store.connect()
    # layout won't work if database is down, but at least web app will stay up
    scalar_symbols = {k: v for k, v in Registry("symbols").items()
                      if (v.category == 'property' and v.shape == 1)}
    warning_layout = html.Div('No database connection could be established.',
                              style={'font-family': 'monospace',
                                     'color': 'rgb(211, 84, 0)',
                                     'text-align': 'left',
                                     'font-size': '1.2em'})
else:
    cut_off = 100  # need at least this many available quantities for plot
    """
    scalar_symbols = {k: v for k, v in Registry("symbols").items()
                      if (v.category == 'property' and v.shape == 1
                          and store.query(
                              criteria={f'{k}.mean': {'$exists': True}}).count() > cut_off)}
    """
    scalar_symbols = {

Пример #5

Показать файл

Файл: __init__.py Проект: shyshy903/propnet

def _update_globals():
    """Bind each built-in model in the "models" registry to a module global.

    The global is named after the model's registry key; models whose
    ``is_builtin`` flag is falsy are skipped.
    """
    module_namespace = globals()
    for model_name, model in Registry("models").items():
        if not model.is_builtin:
            continue
        module_namespace[model_name] = model

Пример #6

Показать файл

 def tearDownClass(cls):
     """Call Registry.clear_all_registries().

     NOTE(review): presumably a unittest class-level teardown hook that keeps
     registry contents from leaking into later test classes -- confirm
     against the enclosing class.
     """
     Registry.clear_all_registries()

Пример #7

Показать файл

 def setUpClass(cls):
     """Call Registry.clear_all_registries().

     NOTE(review): presumably a unittest class-level setup hook that gives the
     tests in this class empty registries to start from -- confirm against the
     enclosing class.
     """
     Registry.clear_all_registries()

Пример #8

Показать файл

from propnet.ext.matproj import MPRester
from propnet.ext.aflow import AflowAdapter

MPR = MPRester()
AFA = AflowAdapter()
graph_evaluator = Graph(parallel=True, max_workers=4)

# An OrderedDict is used explicitly so that a table row (rows follow the
# display-name ordering) can be mapped back to its symbol name.
# Condition symbols are left out of the table until the combinatorial blow-up
# that results from adding e.g. a temperature can be handled. -cml
# TODO: Add condition symbols back when the combinatorics problem is solved
SCALAR_SYMBOLS = OrderedDict(
    (name, symbol)
    for name, symbol in sorted(Registry("symbols").items(),
                               key=lambda item: item[1].display_names[0])
    if symbol.category == 'property' and symbol.shape == 1)

# Row index -> symbol name, in the same order as SCALAR_SYMBOLS.
ROW_IDX_TO_SYMBOL_NAME = list(SCALAR_SYMBOLS)

# One row per scalar symbol, each with an empty editable value.
DEFAULT_ROWS = [
    {'Property': symbol.display_names[0], 'Editable Value': ""}
    for symbol in SCALAR_SYMBOLS.values()
]

REMAINING_SYMBOLS = OrderedDict({
    k: v
    for k, v in sorted(Registry("symbols").items(),
                       key=lambda x: x[1].display_names[0])
    if not ((v.category == 'property' or v.category == 'condition')

Пример #9

Показать файл

    def evaluate(input_rows, data, aggregate):
        """Evaluate the graph on user-entered and stored quantities.

        Args:
            input_rows: table rows; each row's 'Editable Value' entry is
                parsed as a unit expression when it is non-empty. A row's
                position selects its symbol via ROW_IDX_TO_SYMBOL_NAME.
            data: optional JSON string decoded with MontyDecoder; the values
                of the decoded mapping are added to the input quantities.
            aggregate: when truthy, aggregated quantities are reported, plus
                any quantities whose symbol has no aggregated entry.

        Returns:
            list: ``[graph_div, html.Br(), output_table]``.

        Raises:
            PreventUpdate: if a value cannot be parsed or converted, or if
                there are no input quantities at all.
        """
        quantities = []

        for idx, row in enumerate(input_rows):
            raw_value = row['Editable Value']
            if not raw_value:
                continue
            try:
                value = ureg.parse_expression(raw_value)
                units = Registry("units").get(ROW_IDX_TO_SYMBOL_NAME[idx])
                value.ito(units)
            except Exception:
                # Someone put an invalid value in the table
                # TODO: Make error known to the user
                raise PreventUpdate
            quantities.append(
                QuantityFactory.create_quantity(
                    symbol_type=ROW_IDX_TO_SYMBOL_NAME[idx], value=value))

        if data and len(data) > 0:
            quantities.extend(json.loads(data, cls=MontyDecoder).values())

        if not quantities:
            raise PreventUpdate

        material = Material()
        for input_quantity in quantities:
            material.add_quantity(input_quantity)

        output_material = graph_evaluator.evaluate(material, timeout=5)

        if not aggregate:
            output_quantities = output_material.get_quantities()
        else:
            aggregated = output_material.get_aggregated_quantities()
            # Quantities whose symbol has no aggregated entry are kept as-is.
            not_aggregated = [
                q for q in output_material.get_quantities()
                if q.symbol not in aggregated
            ]
            output_quantities = list(aggregated.values()) + not_aggregated

        output_rows = [
            {
                'Property': q.symbol.display_names[0],
                'Value': q.pretty_string(sigfigs=3),
            }
            for q in output_quantities
        ]
        output_columns = [{'id': heading, 'name': heading}
                          for heading in ('Property', 'Value')]
        output_table = dt.DataTable(id='output-table',
                                    data=output_rows,
                                    columns=output_columns,
                                    editable=False,
                                    **DATA_TABLE_STYLE)

        # TODO: clean up

        input_quantity_names = [q.symbol for q in quantities]
        derived_quantity_names = (
            {q.symbol for q in output_quantities} - set(input_quantity_names))

        # Keep only the provenance models that are present in the registry.
        model_names = {
            q.provenance.model for q in output_material.get_quantities()
        }
        models_evaluated = []
        for model_name in model_names:
            model = Registry("models").get(model_name)
            if model is not None:
                models_evaluated.append(model)

        derivation_pathway = {
            'inputs': input_quantity_names,
            'outputs': list(derived_quantity_names),
            'models': models_evaluated,
        }
        material_graph_data = graph_conversion(
            propnet_nx_graph, derivation_pathway=derivation_pathway)

        graph_options = dcc.Checklist(
            id='material-graph-options',
            options=[
                {'label': 'Show models', 'value': 'show_models'},
                {'label': 'Show properties', 'value': 'show_properties'},
            ],
            value=['show_properties'],
            labelStyle={'display': 'inline-block'})
        graph_view = Cytoscape(id='material-graph',
                               elements=material_graph_data,
                               stylesheet=GRAPH_STYLESHEET,
                               layout=GRAPH_LAYOUT_CONFIG,
                               **GRAPH_SETTINGS['full_view'])
        output_graph = html.Div(children=[graph_options, graph_view])

        return [output_graph, html.Br(), output_table]

Пример #10

Показать файл

Файл: correlation.py Проект: shyshy903/propnet

    def get_data_from_full_db(self, prop_x, prop_y):
        """
        Gather per-material mean values of two properties from the full
        (material-indexed) propnet store.

        Materials are selected when each property appears either among the
        material's inputs or as a top-level field. When ``self.sample_size``
        is set, the selection is randomly sampled with a ``$sample`` stage.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y

        Returns:
            defaultdict: lists of mean magnitudes keyed by property name,
                converted to the unit registered for that property

        Raises:
            ValueError: if a returned material holds no values for one of
                the requested properties

        """

        def _as_input_or_field(prop):
            # The property is an input of the material or one of its fields.
            return {
                '$or': [
                    {'inputs.symbol_type': prop},
                    {prop: {'$exists': True}},
                ]
            }

        criteria = {
            '$and': [_as_input_or_field(prop_x), _as_input_or_field(prop_y)]
        }
        properties = [prop_x + '.quantities', prop_y + '.quantities', 'inputs']

        if self.sample_size is not None:
            pipeline = [
                {'$match': criteria},
                {'$sample': {'size': self.sample_size}},
                {'$project': {field: True for field in properties}},
            ]
            pn_data = self.propnet_store.collection.aggregate(
                pipeline, allowDiskUse=True)
        else:
            pn_data = self.propnet_store.query(criteria=criteria,
                                               properties=properties)

        x_unit = Registry("units")[prop_x]
        y_unit = Registry("units")[prop_y]

        if prop_x == prop_y:
            # Handle the property once to avoid duplicating work and data.
            targets = ((prop_x, x_unit), )
        else:
            targets = ((prop_x, x_unit), (prop_y, y_unit))

        data = defaultdict(list)
        for material in pn_data:
            # Collect every value (with units) for this material, average
            # them, convert the mean and keep only its magnitude.
            for prop, unit in targets:
                qs = [
                    ureg.Quantity(entry['value'], entry['units'])
                    for entry in material['inputs']
                    if entry['symbol_type'] == prop
                ]
                if prop in material:
                    qs.extend(
                        ureg.Quantity(entry['value'], entry['units'])
                        for entry in material[prop]['quantities'])

                if not qs:
                    raise ValueError("Query for property {} gave no results"
                                     "".format(prop))
                prop_mean = sum(qs) / len(qs)
                data[prop].append(prop_mean.to(unit).magnitude)

        return data

Пример #11

Показать файл

Файл: test_storage.py Проект: shyshy903/propnet

 def tearDownClass(cls):
     """Set the UnitStrippedWarning filter to "default" and drop non-built-in symbols."""
     warnings.filterwarnings("default", category=UnitStrippedWarning)
     # Collect the names before removing: the registry is mutated below.
     custom_names = []
     for name, symbol in Registry("symbols").items():
         if not symbol.is_builtin:
             custom_names.append(name)
     for name in custom_names:
         Registry("symbols").pop(name)

Пример #12

Показать файл

Файл: correlation.py Проект: shyshy903/propnet

class CorrelationBuilder(Builder):
    """
    A class to calculate the correlation between properties derived by or used in propnet
    using a suite of regression tools. Uses the Builder architecture for optional parallel
    processing of data.

    Note: serialization of builder does not work with custom correlation functions, although
    interactive use does support them.

    """
    # Names of all registered symbols of category 'property' with shape 1;
    # used as the default property list when the caller passes none.
    PROPNET_PROPS = [
        symbol.name
        for symbol in Registry("symbols").values()
        if symbol.category == 'property' and symbol.shape == 1
    ]

    def __init__(self,
                 propnet_store,
                 correlation_store,
                 out_file=None,
                 funcs='linlsq',
                 props=None,
                 sample_size=None,
                 from_quantity_db=True,
                 **kwargs):
        """
        Set up the correlation builder.

        Args:
            propnet_store (Mongolike Store): store pointing to the propnet
                collection, read access required
            correlation_store (Mongolike Store): store pointing to the
                collection that receives results, write access required
            out_file (str): optional, filename for JSON output (useful when
                correlation_store is a MemoryStore)
            funcs (`str`, `callable`, list of `str` or `callable`): correlation
                function(s) to use. Built-in functions are selected by name:

                linlsq (default): linear least-squares, reports R^2
                pearson: Pearson r-correlation, reports r
                spearman: Spearman rank correlation, reports r
                mic: maximal-information non-parametric exploration, reports
                    maximal information coefficient
                ransac: random sample consensus (RANSAC) regression, reports score
                theilsen: Theil-Sen regression, reports score
                all: runs all correlation functions above

                A callable is registered under "<module>.<name>".
            props (`list` of `str`): optional, properties to correlate.
                Default (None): all names in PROPNET_PROPS
            sample_size (int): optional, restricts the calculation to a random
                sample of this size. Default: None (no limit)
            from_quantity_db (bool): True if propnet_store follows the
                quantity-indexed schema, False for the full, material-indexed
                schema. Note: querying quantity-indexed databases is
                considerably faster. Default: True
            **kwargs: forwarded to the Builder superclass

        Raises:
            ValueError: for an unknown function name or unsupported object in
                funcs, when no function ends up selected, or when sample_size
                is less than 2
        """

        self.propnet_store = propnet_store
        self.from_quantity_db = from_quantity_db
        self.correlation_store = correlation_store
        self.out_file = out_file

        self._correlation_funcs = self.get_correlation_funcs()
        self._funcs = {}

        # A single name or callable is treated as a one-element list.
        requested = funcs if isinstance(funcs, list) else [funcs]

        for func in requested:
            is_name = isinstance(func, str)
            if is_name and func == 'all':
                self._funcs.update(self._correlation_funcs)
            elif is_name and func in self._correlation_funcs:
                self._funcs[func] = self._correlation_funcs[func]
            elif callable(func):
                qualified_name = ".".join((func.__module__, func.__name__))
                self._funcs[qualified_name] = func
            else:
                raise ValueError("Invalid correlation function: {}".format(func))

        if not self._funcs:
            raise ValueError("No valid correlation functions selected")

        self._props = props if props else self.PROPNET_PROPS

        if sample_size is not None and sample_size < 2:
            raise ValueError("Sample size must be greater than 1")
        self.sample_size = sample_size
        self.total = None

        super(CorrelationBuilder, self).__init__(sources=[propnet_store],
                                                 targets=[correlation_store],
                                                 **kwargs)

    @classmethod
    def get_correlation_funcs(cls):
        """
        Collect the built-in correlation functions defined on the class.

        A built-in function is any callable attribute whose name starts with
        ``_cfunc_`` followed by at least one character.

        Returns:
            dict: function handles keyed by attribute name with ``_cfunc_``
                removed

        """
        cfunc_pattern = re.compile(r'^_cfunc_.+$')
        found = {}
        for attr_name in dir(cls):
            if not cfunc_pattern.match(attr_name):
                continue
            candidate = getattr(cls, attr_name)
            if callable(candidate):
                found[attr_name.replace('_cfunc_', '')] = candidate
        return found

    def get_items(self):
        """
        Fetch the data for every property pair and yield the work items
        derived from it.

        Sets ``self.total`` to (number of properties)^2 times the number of
        selected correlation functions before yielding anything.

        Returns:
            (generator): yields whatever _make_data_combinations() produces
                for each pair
        """
        n_props = len(self._props)
        self.total = n_props**2 * len(self._funcs)

        # combinations_with_replacement() gives each unordered pair once
        # (AB but not BA). The reversed pair is presumably produced by
        # _make_data_combinations(), so the database is queried once per pair.
        for prop_x, prop_y in combinations_with_replacement(self._props, 2):
            if not self.from_quantity_db:
                data = self.get_data_from_full_db(prop_x, prop_y)
            else:
                data = self.get_data_from_quantity_db(
                    self.propnet_store,
                    prop_x,
                    prop_y,
                    sample_size=self.sample_size)

            yield from self._make_data_combinations(prop_x, prop_y, data)

    @staticmethod
    def get_data_from_quantity_db(store,
                                  *props,
                                  sample_size=None,
                                  include_id=False):
        """
        Collect scalar data from the quantity-only propnet database,
        averaged per material and property, optionally sampled.

        Args:
            store (maggma.stores.Store): MongoDB store instance for the
                quantity database
            *props (str): property names as strings
            sample_size (int): if given, a ``$sample`` stage limits the query
                to this many randomly selected records. Default: None (all
                records)
            include_id (bool): True adds an '_id' list holding the material
                key of each kept record. Default: False

        Returns:
            dict: lists of per-material averages keyed by property name; a
                material is kept only if every requested property has a
                finite, non-None average

        """

        # Pipeline: select the quantities of the requested properties, group
        # them by material while averaging each property, then sample.
        match_stage = {
            '$match': {
                '$or': [{'symbol_type': prop} for prop in props]
            }
        }
        averages = {
            prop: {
                '$avg': {
                    # Only values of this property feed its average.
                    '$cond': [{"$eq": ['$symbol_type', prop]}, '$value', None]
                }
            }
            for prop in props
        }
        group_stage = {'$group': {'_id': '$material_key', **averages}}

        pipeline = [match_stage, group_stage]
        if sample_size is not None:
            pipeline.append({'$sample': {'size': sample_size}})

        query = store.collection.aggregate(pipeline=pipeline,
                                           allowDiskUse=True)

        def _is_usable(value):
            return value is not None and np.isfinite(value)

        data = defaultdict(list)
        for record in query:
            if not all(_is_usable(record[prop]) for prop in props):
                continue
            for prop in props:
                data[prop].append(record[prop])
            if include_id:
                data['_id'].append(record['_id'])

        return dict(data)

    def get_data_from_full_db(self, prop_x, prop_y):
        """
        Fetch per-material mean values of two properties from the propnet store.

        Every matching material contributes one value per property: the mean of
        all of its input and derived quantities of that property, converted to
        the unit registered for the property. If self.sample_size is set, the
        matching materials are randomly sampled by the database first.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y

        Returns:
            dict: lists of magnitudes keyed by property name (a defaultdict)

        Raises:
            ValueError: if a returned material holds no quantity for a property

        """

        # A material qualifies only if, for each property, it carries the
        # property either among its inputs or as a top-level field
        criteria = {
            '$and': [{
                '$or': [{
                    'inputs.symbol_type': name
                }, {
                    name: {
                        '$exists': True
                    }
                }]
            } for name in (prop_x, prop_y)]
        }
        fields = [prop_x + '.quantities', prop_y + '.quantities', 'inputs']

        if self.sample_size is not None:
            stages = [
                {'$match': criteria},
                {'$sample': {'size': self.sample_size}},
                {'$project': dict.fromkeys(fields, True)},
            ]
            materials = self.propnet_store.collection.aggregate(
                stages, allowDiskUse=True)
        else:
            materials = self.propnet_store.query(criteria=criteria,
                                                 properties=fields)

        x_unit = Registry("units")[prop_x]
        y_unit = Registry("units")[prop_y]
        if prop_x == prop_y:
            # Same property on both axes: process it once to avoid
            # duplicating the work and the data
            targets = ((prop_x, x_unit), )
        else:
            targets = ((prop_x, x_unit), (prop_y, y_unit))

        collected = defaultdict(list)
        for material in materials:
            for prop, unit in targets:
                # Gather every quantity of this property, inputs first
                quantities = [
                    ureg.Quantity(entry['value'], entry['units'])
                    for entry in material['inputs']
                    if entry['symbol_type'] == prop
                ]
                if prop in material:
                    quantities.extend(
                        ureg.Quantity(entry['value'], entry['units'])
                        for entry in material[prop]['quantities'])

                if not quantities:
                    raise ValueError(
                        "Query for property {} gave no results".format(prop))
                # Average with units attached, then store the bare magnitude
                mean_quantity = sum(quantities) / len(quantities)
                collected[prop].append(mean_quantity.to(unit).magnitude)

        return collected

    def _make_data_combinations(self, prop_x, prop_y, data):
        """
        Generates combinations of properties and desired correlation functions for evaluation.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y
            data (dict): dictionary of data keyed by property name

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }

        """
        # So we get AB and BA without re-querying, but not two AA
        if prop_x == prop_y:
            prop_combos = ((prop_x, prop_x), )
        else:
            prop_combos = ((prop_x, prop_y), (prop_y, prop_x))
        for x, y in prop_combos:
            for name, func in self._funcs.items():
                data_dict = {
                    'x_data': data.get(x, []),
                    'x_name': x,
                    'y_data': data.get(y, []),
                    'y_name': y,
                    'func': (name, func)
                }
                yield data_dict

    def process_item(self, item):
        """
        Run correlation calculation on a pair of properties using the specified function.

        Args:
            item: (dict) input provided by get_items() (see get_items() for structure)

        Returns: (tuple) output of calculation with necessary information about
            the calculation included. Entries, in order:
                independent property (x-axis) name,
                dependent property (y-axis) name,
                correlation value, or the Exception instance raised by the
                    correlation function (0.0 when fewer than two points exist),
                correlation function name,
                number of data points used for correlation (length of x data),
                length of the shortest path between the two properties on the
                    propnet graph, taken as the smaller of the x->y and y->x
                    separations; None if neither direction yields a length.

        """
        prop_x, prop_y = item['x_name'], item['y_name']
        data_x, data_y = item['x_data'], item['y_data']
        func_name, func = item['func']
        n_points = len(data_x)

        g = Graph()
        try:
            separations = (g.get_degree_of_separation(prop_x, prop_y),
                           g.get_degree_of_separation(prop_y, prop_x))
        except ValueError:
            # This shouldn't happen...but just in case
            separations = (None, None)

        # Compare against None explicitly: a separation of 0 is a valid
        # length and must not be discarded as if it were missing
        known = [length for length in separations if length is not None]
        path_length = min(known) if known else None

        if n_points < 2:
            result = 0.0
        else:
            try:
                result = func(data_x, data_y)
            except Exception as ex:
                # Deliberate best-effort: a failing correlation is recorded
                # (the exception object is returned) instead of aborting the run
                result = ex
        return prop_x, prop_y, result, func_name, n_points, path_length

    @staticmethod
    def _cfunc_mic(x, y):
        """
        Compute the maximal information coefficient (MIC) of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) maximal information coefficient

        """
        # minepy is imported lazily so it is only needed when MIC is requested
        from minepy import MINE
        estimator = MINE()
        estimator.compute_score(x, y)
        return estimator.mic()

    @staticmethod
    def _cfunc_linlsq(x, y):
        """
        Get R^2 value for linear least-squares fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) R^2 value

        """
        from scipy import stats
        fit = stats.linregress(x, y)
        return fit.rvalue**2

    @staticmethod
    def _cfunc_pearson(x, y):
        """
        Get R value for Pearson fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Pearson R value

        """
        from scipy import stats
        fit = stats.pearsonr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_spearman(x, y):
        """
        Get R value for Spearman fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Spearman R value

        """
        from scipy import stats
        fit = stats.spearmanr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_ransac(x, y):
        """
        Fit a random sample consensus (RANSAC) regressor and return its score.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) RANSAC score

        """
        from sklearn.linear_model import RANSACRegressor
        # Fixed seed keeps the randomized fit reproducible
        model = RANSACRegressor(random_state=21)
        # Add a trailing axis so x becomes a column of single-feature samples
        features = np.array(x)[:, np.newaxis]
        model.fit(features, y)
        return model.score(features, y)

    @staticmethod
    def _cfunc_theilsen(x, y):
        """
        Fit a Theil-Sen regressor and return its score.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Theil-Sen score

        """
        from sklearn.linear_model import TheilSenRegressor
        # Fixed seed keeps the randomized fit reproducible
        model = TheilSenRegressor(random_state=21)
        # Add a trailing axis so x becomes a column of single-feature samples
        features = np.array(x)[:, np.newaxis]
        model.fit(features, y)
        return model.score(features, y)

    def update_targets(self, items):
        """
        Write correlation data to Mongo store.

        Each result becomes one document. When the correlation function raised,
        'correlation' is stored as None and the exception class name and args
        are stored under 'error'.

        Args:
            items: (list<tuple>) list of results output by process_item()

        """
        import hashlib

        data = []
        for prop_x, prop_y, result, func_name, n_points, path_length in items:
            # The built-in hash() of a str is salted per interpreter process,
            # so it cannot serve as a key that must match across runs. Derive
            # the key from a digest of the identifying fields instead, kept
            # as a signed 64-bit integer.
            key_text = json.dumps([prop_x, prop_y, func_name])
            digest = hashlib.sha256(key_text.encode('utf-8')).digest()
            d = {
                'property_x': prop_x,
                'property_y': prop_y,
                'correlation_func': func_name,
                'n_points': n_points,
                'shortest_path_length': path_length,
                'id': int.from_bytes(digest[:8], 'big', signed=True)
            }
            if isinstance(result, Exception):
                d['correlation'] = None
                d['error'] = (result.__class__.__name__, result.args)
            else:
                d['correlation'] = result
            data.append(d)
        self.correlation_store.update(data, key='id')

    def finalize(self, cursor=None):
        """
        Index the correlation store, optionally dump the correlation data to the
        JSON file given at instantiation, then run the Builder clean-up.

        Args:
            cursor: (Mongo Store cursor) optional, cursor to close if not automatically closed.

        """

        indexed_fields = ('property_x', 'property_y', 'correlation_func',
                          'correlation', 'shortest_path_length')
        for field in indexed_fields:
            index_ok = self.correlation_store.ensure_index(field)
            if not index_ok:
                logger.warning(
                    "Could not add index for property {}".format(field))

        if self.out_file:
            # A file that cannot be written is only logged; finalization continues
            try:
                self.write_correlation_data_file(self.out_file)
            except OSError:
                logger.warning(
                    "Cannot open file for writing! Skipping file writing.")

        super(CorrelationBuilder, self).finalize(cursor)

    def write_correlation_data_file(self, out_file):
        """
        Serialize the document from get_correlation_matrices() to a JSON file.

        Args:
            out_file: (str) file path and name for output to JSON file
        """
        # Build the document first so the file is only opened once data exists
        correlation_doc = self.get_correlation_matrices()
        with open(out_file, 'w') as handle:
            json.dump(correlation_doc, handle)

    def get_correlation_matrices(self, func_name=None):
        """
        Builds document containing the correlation matrix with relevant data regarding
        correlation algorithm and properties of the data set.

        Args:
            func_name: (str or list<str>) optional, name(s) of the correlation functions to
                include in the document
                default: None, which is to include all that were run by this builder.

        Returns: (dict) document containing correlation data. Format:
            {'properties': (list<str>) names of properties calculated in order of how they are indexed
                    in the matrices
             'n_points': (list<list<int>>) list of lists (i.e. matrix) containing the number of data
                    points evaluated during the fitting procedure (filled symmetrically)
             'shortest_path_length': (list<list>) matrix of the stored shortest path lengths,
                    indexed [property_x][property_y]
             'correlation': (dict<str: list<list<float>>>) dictionary of matrices containing correlation
                    results, keyed by correlation function name
            }

        Raises:
            ValueError: if a stored record names a property that has no
                'property_x' entry in the correlation store

        """

        prop_data = self.correlation_store.query(
            criteria={'property_x': {
                '$exists': True
            }},
            properties=['property_x'])
        props = list({item['property_x'] for item in prop_data})
        # Row/column of each property, so records are placed without a
        # linear search of the property list
        position = {prop: idx for idx, prop in enumerate(props)}
        n_props = len(props)

        out = {
            'properties': props,
            'n_points': None,
            'shortest_path_length': None,
            'correlation': {}
        }

        if not func_name:
            func_name = list(self._funcs.keys())

        if isinstance(func_name, str):
            func_name = [func_name]

        for f in func_name:
            data = self.correlation_store.query(
                criteria={'correlation_func': f})
            corr_matrix = np.zeros(shape=(n_props, n_props)).tolist()

            # The point-count and path-length matrices are filled from the
            # records of the first function processed only
            fill_info_matrices = (not out['n_points']
                                  and not out['shortest_path_length'])
            if fill_info_matrices:
                out['n_points'] = np.zeros(shape=(n_props,
                                                  n_props)).tolist()
                out['shortest_path_length'] = np.zeros(
                    shape=(n_props, n_props)).tolist()

            for d in data:
                prop_x = d['property_x']
                prop_y = d['property_y']
                correlation = d['correlation']
                n_points = d['n_points']
                path_length = d['shortest_path_length']
                try:
                    ia, ib = position[prop_x], position[prop_y]
                except KeyError as err:
                    raise ValueError(
                        "{} is not in the list of properties".format(
                            err)) from err
                corr_matrix[ia][ib] = correlation

                if fill_info_matrices:
                    out['n_points'][ia][ib] = n_points
                    out['n_points'][ib][ia] = n_points
                    out['shortest_path_length'][ia][ib] = path_length

            out['correlation'][f] = corr_matrix

        return out

    def as_dict(self):
        """
        Serialize the builder to a JSON-compatible dictionary.

        Only the names of built-in correlation functions are kept under 'funcs';
        custom functions cannot be serialized and are dropped with a warning.

        Returns: (dict) representation of this builder as a JSON-serializable dictionary

        """
        doc = super(CorrelationBuilder, self).as_dict()
        kept_names = []
        for func_name in doc['funcs'].keys():
            if func_name not in self._correlation_funcs:
                logger.warning(
                    "Cannot serialize custom function '{}'. Omitting.".format(
                        func_name))
                continue
            kept_names.append(func_name)

        if not kept_names:
            logger.warning(
                "No functions were able to be serialized from this builder.")

        # Replace the function mapping with the list of serializable names
        doc['funcs'] = kept_names
        return doc

Пример #13

Показать файл

Файл: test_aflow_ingester.py Проект: shyshy903/propnet

 def setUpClass(cls) -> None:
     """
     Clear all registries and then add the built-in symbols back
     (presumably so the tests of this class start from a known registry state).
     """
     Registry.clear_all_registries()
     add_builtin_symbols_to_registry()

Пример #14

Показать файл

    def process(self, item):
        """
        Build a propnet document for one source material record.

        Creates a Material from the fields listed in self.materials_symbol_map,
        evaluates it with self._graph_evaluator and serializes the input
        quantities and the evaluated quantities into one dictionary.

        Args:
            item: source record; 'task_id' is required, while 'created_at',
                'pretty_formula', 'deprecated' and 'sbxn' are read if present

        Returns: the jsanitize()-d document with keys 'inputs', 'task_id',
            'pretty_formula', 'deprecated', 'sbxn' (only if
            self.include_sandboxed is set) and one sub-document per symbol
            whose quantity count after evaluation differs from its input count

        """
        # Only warn (processing continues) when graph-level parallelism is
        # requested while running outside the main process
        if self.graph_parallel and not self.allow_child_process and \
                current_process().name != "MainProcess":
            logger.warning(
                "It appears derive_quantities() is running "
                "in a child process, possibly in a parallelized "
                "Runner.\nThis is not recommended and will deteriorate "
                "performance.")
        # Define quantities corresponding to materials doc fields
        # Attach quantities to materials
        item = MontyDecoder().process_decoded(item)
        logger.info("Populating material for %s", item['task_id'])
        material = Material()

        if 'created_at' in item.keys():
            date_created = item['created_at']
        else:
            date_created = None

        # The same provenance object is attached to every input quantity
        provenance = ProvenanceElement(
            source={
                "source": self.source_name,
                "source_key": item['task_id'],
                "date_created": date_created
            })

        for mkey, property_name in self.materials_symbol_map.items():
            value = pydash.get(item, mkey)
            # NOTE(review): falsy values (0, empty containers) are skipped
            # here as well as missing ones - confirm this is intended
            if value:
                material.add_quantity(
                    QuantityFactory.create_quantity(
                        property_name,
                        value,
                        units=Registry("units").get(property_name, None),
                        provenance=provenance))

        # Add custom things, e. g. computed entry
        computed_entry = get_entry(item)
        if computed_entry:
            material.add_quantity(
                QuantityFactory.create_quantity("computed_entry",
                                                computed_entry,
                                                provenance=provenance))
        else:
            logger.info("Unable to create computed entry for {}".format(
                item['task_id']))
        material.add_quantity(
            QuantityFactory.create_quantity("external_identifier_mp",
                                            item['task_id'],
                                            provenance=provenance))

        # Taken before graph evaluation; used below as the baseline of inputs.
        # NOTE(review): assumes evaluate() leaves `material` untouched - confirm
        input_quantities = material.symbol_quantities_dict

        # Use graph to generate expanded quantity pool
        logger.info("Evaluating graph for %s", item['task_id'])

        new_material = self._graph_evaluator.evaluate(
            material, timeout=self.graph_timeout)

        # Format document and return
        logger.info("Creating doc for %s", item['task_id'])
        # Gives the initial inputs that were used to derive properties of a
        # certain material.

        doc = {
            "inputs": [
                StorageQuantity.from_quantity(q)
                for q in chain.from_iterable(input_quantities.values())
            ]
        }

        for symbol, quantities in new_material.symbol_quantities_dict.items():
            # If no new quantities of a given symbol were derived (i.e. if the initial
            # input quantity/ies is/are the only one/s listed in the new material) then don't add
            # that quantity to the propnet entry document as a derived quantity.
            # NOTE(review): indexing input_quantities with a symbol that was not
            # an input presumably relies on it being a defaultdict - confirm
            if len(quantities) == len(input_quantities[symbol]):
                continue
            sub_doc = {}
            try:
                # Write out all quantities as dicts including the
                # internal ID for provenance tracing
                qs = [
                    jsanitize(StorageQuantity.from_quantity(q), strict=True)
                    for q in quantities
                ]
            except AttributeError as ex:
                # Check to see if this is an error caused by an object
                # that is not JSON serializable
                msg = ex.args[0]
                if "object has no attribute 'as_dict'" in msg:
                    # Write error to db and logger
                    errmsg = "Quantity of Symbol '{}' is not ".format(symbol.name) + \
                        "JSON serializable. Cannot write quantities to database!"
                    logger.error(errmsg)
                    sub_doc['error'] = errmsg
                    qs = []
                else:
                    # If not, re-raise the error
                    raise ex
            sub_doc['quantities'] = qs
            doc[symbol.name] = sub_doc

        aggregated_quantities = new_material.get_aggregated_quantities()

        for symbol, quantity in aggregated_quantities.items():
            if symbol.name not in doc:
                # No new quantities were derived
                continue
            # Store mean and std dev for aggregated quantities
            sub_doc = {
                "mean": unumpy.nominal_values(quantity.magnitude).tolist(),
                "std_dev": unumpy.std_devs(quantity.magnitude).tolist(),
                "units":
                quantity.units.format_babel() if quantity.units else None,
                "title": quantity.symbol.display_names[0]
            }
            # Symbol Name -> Sub_Document, listing all Quantities of that type.
            doc[symbol.name].update(sub_doc)

        # Identification fields copied from the source record
        doc.update({
            "task_id": item["task_id"],
            "pretty_formula": item.get("pretty_formula"),
            "deprecated": item.get("deprecated", False)
        })

        if self.include_sandboxed:
            doc.update({'sbxn': item.get("sbxn", [])})

        return jsanitize(doc, strict=True)

Пример #15

Показать файл

Файл: correlation_with_mp.py Проект: shyshy903/propnet

class CorrelationBuilder(Builder):
    """
    A class to calculate the correlation between properties derived by or used in propnet
    using a suite of regression tools. Uses the Builder architecture for optional parallel
    processing of data.

    Note: serialization of builder does not work with custom correlation functions, although
    interactive use does support them.

    """
    # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?
    # Materials Project fields included in the correlation; dotted entries
    # name a field nested inside a sub-document.
    MP_QUERY_PROPS = [
        "piezo.eij_max", "elasticity.universal_anisotropy",
        "diel.poly_electronic", "total_magnetization", "efermi",
        "magnetism.total_magnetization_normalized_vol"
    ]
    # Names of the registered symbols whose category is 'property' and whose
    # shape is 1; computed once, when the class body is executed.
    PROPNET_PROPS = [
        v.name for v in Registry("symbols").values()
        if (v.category == 'property' and v.shape == 1)
    ]

    def __init__(self,
                 propnet_store,
                 mp_store,
                 correlation_store,
                 out_file=None,
                 funcs='linlsq',
                 props=None,
                 **kwargs):
        """
        Constructor for the correlation builder.

        Args:
            propnet_store: (Mongolike Store) store instance pointing to propnet collection
                with read access
            mp_store: (Mongolike Store) store instance pointing to Materials Project collection with read access
            correlation_store: (Mongolike Store) store instance pointing to collection with write access
            out_file: (str) optional, filename to output data in JSON format (useful if using a MemoryStore
                for correlation_store)
            funcs: (str, function, list<str, function>) functions to use for correlation. Built-in functions can
                be specified by the following strings:

                linlsq (default): linear least-squares, reports R^2
                pearson: Pearson r-correlation, reports r
                spearman: Spearman rank correlation, reports r
                mic: maximal-information non-parametric exploration, reports maximal information coefficient
                ransac: random sample consensus (RANSAC) regression, reports score
                theilsen: Theil-Sen regression, reports score
                all: runs all correlation functions above
            props: (str, list<str>) optional, names of the properties to correlate. Names found in
                neither PROPNET_PROPS nor the MP property map are ignored.
                default: None, which uses all of PROPNET_PROPS and MP_QUERY_PROPS
            **kwargs: arguments to the Builder superclass

        Raises:
            ValueError: if an entry of funcs is neither a known name nor callable,
                or if no correlation function ends up selected
        """

        self.propnet_store = propnet_store
        self.mp_store = mp_store
        self.correlation_store = correlation_store
        self.out_file = out_file

        # Built-in functions are discovered by name: every callable attribute
        # called _cfunc_<name> is registered as <name>
        self._correlation_funcs = {}
        for attr_name in dir(self):
            if not re.match(r'^_cfunc_.+$', attr_name):
                continue
            attr = getattr(self, attr_name)
            if callable(attr):
                self._correlation_funcs[attr_name.replace('_cfunc_',
                                                          '')] = attr

        self._funcs = {}

        if not isinstance(funcs, list):
            funcs = [funcs]

        for f in funcs:
            if isinstance(f, str):
                if f == 'all':
                    self._funcs.update(self._correlation_funcs)
                    continue
                if f in self._correlation_funcs:
                    self._funcs[f] = self._correlation_funcs[f]
                    continue
            if callable(f):
                # Custom functions are keyed by their qualified location
                self._funcs[f.__module__ + "." + f.__name__] = f
            else:
                raise ValueError("Invalid correlation function: {}".format(f))

        if not self._funcs:
            raise ValueError("No valid correlation functions selected")

        # Map the short MP property name (second part of a two-part dotted
        # path, otherwise the path itself) to its full query path
        mp_prop_map = {}
        for query_path in self.MP_QUERY_PROPS:
            parts = query_path.split(".")
            short_name = parts[1] if len(parts) == 2 else query_path
            mp_prop_map[short_name] = query_path

        self._props = props
        if props:
            self.propnet_props = []
            self.mp_props = []
            self.mp_query_props = []
            if isinstance(props, str):
                props = [props]
            for name in props:
                if name in self.PROPNET_PROPS:
                    self.propnet_props.append(name)
                elif name in mp_prop_map:
                    self.mp_props.append(name)
                    self.mp_query_props.append(mp_prop_map[name])
        else:
            self.mp_query_props = self.MP_QUERY_PROPS
            self.mp_props = list(mp_prop_map.keys())
            self.propnet_props = self.PROPNET_PROPS

        super(CorrelationBuilder,
              self).__init__(sources=[propnet_store, mp_store],
                             targets=[correlation_store],
                             **kwargs)

    def get_items(self):
        """
        Gather scalar data from the propnet and MP stores, average it per material and
        property, and generate one work item for every ordered pair of properties
        (a property paired with itself included, as a sanity check) and every selected
        correlation function.

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }

        """

        def _as_floats(values, prop):
            # MP values are plain floats, propnet values are pint quantities.
            # Quantities are converted to the units stored on the symbol and
            # reduced to their magnitude; a list without any quantity is
            # returned unchanged.
            if not (values
                    and any(isinstance(v, ureg.Quantity) for v in values)):
                return values
            return [
                v.to(Registry("symbols")[prop].units).magnitude
                if isinstance(v, ureg.Quantity) else v for v in values
            ]

        per_material = defaultdict(dict)

        requested = []
        for suffix in ('.mean', '.units', '.quantities'):
            requested += [p + suffix for p in self.propnet_props]
        requested += ['task_id', 'inputs']

        for material in self.propnet_store.query(criteria={},
                                                 properties=requested):
            mpid = material['task_id']

            # Start with the input quantities of the requested properties
            collected = defaultdict(list)
            for entry in material['inputs']:
                if entry['symbol_type'] in self.propnet_props:
                    quantity = ureg.Quantity(entry['value'], entry['units'])
                    collected[entry['symbol_type']].append(quantity)

            for prop, values in material.items():
                if prop not in self.propnet_props:
                    continue
                if prop in collected:
                    # Inputs exist for this property: add each stored quantity
                    for entry in values['quantities']:
                        collected[prop].append(
                            ureg.Quantity(entry['value'], entry['units']))
                else:
                    # No inputs: the stored mean stands in for the property
                    collected[prop] = [
                        ureg.Quantity(values['mean'], values['units'])
                    ]

            per_material[mpid].update(
                {name: sum(qs) / len(qs)
                 for name, qs in collected.items()})

        # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?

        for material in self.mp_store.query(criteria={},
                                            properties=self.mp_query_props +
                                            ['task_id']):
            mpid = material['task_id']
            for prop, value in material.items():
                if isinstance(value, dict):
                    # Nested field: stored under the sub-field name only
                    for sub_prop, sub_value in value.items():
                        if prop + '.' + sub_prop in self.mp_query_props and sub_value is not None:
                            per_material[mpid][sub_prop] = sub_value
                elif prop in self.mp_query_props and value is not None:
                    per_material[mpid][prop] = value

        # product() produces all possible combinations of properties
        for prop_x, prop_y in product(self.propnet_props + self.mp_props,
                                      repeat=2):
            # Keep only materials that carry both properties
            paired = [(record[prop_x], record[prop_y])
                      for record in per_material.values()
                      if prop_x in record and prop_y in record]
            x = [pair[0] for pair in paired]
            y = [pair[1] for pair in paired]

            x_float = _as_floats(x, prop_x)
            y_float = _as_floats(y, prop_y)

            for name, func in self._funcs.items():
                yield {
                    'x_data': x_float,
                    'x_name': prop_x,
                    'y_data': y_float,
                    'y_name': prop_y,
                    'func': (name, func)
                }

    def process_item(self, item):
        """
        Evaluate one correlation function on the data of one pair of properties.

        Args:
            item: (dict) input provided by get_items() (see get_items() for structure)

        Returns: (tuple) output of calculation with necessary information about
            the calculation included. Entries, in order:
                independent property (x-axis) name,
                dependent property (y-axis) name,
                correlation value (0.0 when fewer than two points exist),
                correlation function name,
                number of data points used for correlation (length of x data),
                length of shortest path between properties on propnet graph where x-axis property
                    is starting property and y-axis property is ending property.
                    Note: the path length is None when the lookup raises ValueError. This does
                    not preclude y->x having a forward path.

        """
        x_name, y_name = item['x_name'], item['y_name']
        x_values, y_values = item['x_data'], item['y_data']
        func_name, func = item['func']
        n_points = len(x_values)

        graph = Graph()
        try:
            path_length = graph.get_degree_of_separation(x_name, y_name)
        except ValueError:
            path_length = None

        # With fewer than two points the function is not called at all
        correlation = func(x_values, y_values) if n_points >= 2 else 0.0
        return x_name, y_name, correlation, func_name, n_points, path_length

    @staticmethod
    def _cfunc_mic(x, y):
        """
        Compute the maximal information coefficient (MIC) of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) maximal information coefficient

        """
        # minepy is imported lazily so it is only needed when MIC is requested
        from minepy import MINE
        estimator = MINE()
        estimator.compute_score(x, y)
        return estimator.mic()

    @staticmethod
    def _cfunc_linlsq(x, y):
        """
        Get R^2 value for linear least-squares fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) R^2 value

        """
        from scipy import stats
        fit = stats.linregress(x, y)
        return fit.rvalue**2

    @staticmethod
    def _cfunc_pearson(x, y):
        """
        Get R value for Pearson fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Pearson R value

        """
        from scipy import stats
        fit = stats.pearsonr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_spearman(x, y):
        """
        Compute the Spearman rank correlation coefficient (R) of a data set.

        Args:
            x: (list<float>) values of the independent property (x-axis)
            y: (list<float>) values of the dependent property (y-axis)

        Returns: (float) Spearman R value, i.e. the first element of scipy's result

        """
        from scipy.stats import spearmanr
        # Element 0 is the coefficient; the remainder of the result is discarded.
        return spearmanr(x, y)[0]

    @staticmethod
    def _cfunc_ransac(x, y):
        """
        Compute the score of a random sample consensus (RANSAC) regression of y on x.

        Args:
            x: (list<float>) values of the independent property (x-axis)
            y: (list<float>) values of the dependent property (y-axis)

        Returns: (float) value of the fitted regressor's score() on the same data

        """
        from sklearn.linear_model import RANSACRegressor
        # Add a trailing axis so a 1-D x becomes a single-column 2-D array.
        features = np.array(x)[:, None]
        # random_state is pinned to a constant (presumably for repeatable scores).
        model = RANSACRegressor(random_state=21)
        model.fit(features, y)
        return model.score(features, y)

    @staticmethod
    def _cfunc_theilsen(x, y):
        """
        Compute the score of a Theil-Sen regression of y on x.

        Args:
            x: (list<float>) values of the independent property (x-axis)
            y: (list<float>) values of the dependent property (y-axis)

        Returns: (float) value of the fitted regressor's score() on the same data

        """
        from sklearn.linear_model import TheilSenRegressor
        # Add a trailing axis so a 1-D x becomes a single-column 2-D array.
        features = np.array(x)[:, None]
        # random_state is pinned to a constant (presumably for repeatable scores).
        model = TheilSenRegressor(random_state=21)
        model.fit(features, y)
        return model.score(features, y)

    def update_targets(self, items):
        """
        Write correlation data to Mongo store.

        Every result becomes one document, upserted through the store's update() with
        'id' as the key. The id is a signed 64-bit integer taken from a SHA-256 digest of
        (property_x, property_y, correlation_func). The builtin hash() is not used because
        string hashes are salted per interpreter process, which would give the same
        property pair a different id on every run.

        Args:
            items: (list<tuple>) results output by process_item(), each of the form
                (prop_x, prop_y, correlation, func_name, n_points, path_length)

        """
        import hashlib

        data = []
        for prop_x, prop_y, correlation, func_name, n_points, path_length in items:
            # repr() of the tuple keeps the three fields unambiguous and
            # yields the same bytes in every process.
            key_bytes = repr((prop_x, prop_y, func_name)).encode('utf-8')
            digest = hashlib.sha256(key_bytes).digest()
            data.append({
                'property_x': prop_x,
                'property_y': prop_y,
                'correlation': correlation,
                'correlation_func': func_name,
                'n_points': n_points,
                'shortest_path_length': path_length,
                # First 8 bytes, signed, so the id stays within a 64-bit integer.
                'id': int.from_bytes(digest[:8], 'big', signed=True)
            })
        self.correlation_store.update(data, key='id')

    def finalize(self, cursor=None):
        """
        Dump the correlation data to a JSON file when an output file was configured,
        then delegate to the parent Builder's clean-up.

        A failure to write the file (OSError) is logged as a warning and does not stop
        the parent clean-up from running.

        Args:
            cursor: (Mongo Store cursor) optional, cursor to close if not automatically closed.

        """
        target = self.out_file
        if target:
            try:
                self.write_correlation_data_file(target)
            except OSError:
                logger.warning(
                    "Cannot open file for writing! Skipping file writing.")

        super(CorrelationBuilder, self).finalize(cursor)

    def write_correlation_data_file(self, out_file):
        """
        Build the correlation matrices document and serialize it to a JSON file.

        The document is assembled before the file is opened, so a failure while
        gathering the data never touches the output file.

        Args:
            out_file: (str) file path and name for output to JSON file
        """
        document = self.get_correlation_matrices()
        with open(out_file, 'w') as handle:
            json.dump(document, handle)

    def get_correlation_matrices(self, func_name=None):
        """
        Builds document containing the correlation matrix with relevant data regarding
        correlation algorithm and properties of the data set.

        Args:
            func_name: (str or list<str>) optional, name(s) of the correlation functions to
                include in the document
                default: None, which is to include all that were run by this builder.

        Returns: (dict) document containing correlation data. Format:
            {'properties': (list<str>) sorted names of properties, in the order in which they
                    are indexed in the matrices
             'n_points': (list<list<int>>) list of lists (i.e. matrix) containing the number of data
                    points evaluated during the fitting procedure, filled symmetrically
             'shortest_path_length': (list<list<int or None>>) matrix of path lengths indexed
                    [x][y], filled only in the x->y direction
             'correlation': (dict<str: list<list<float>>>) dictionary of matrices containing correlation
                    results, keyed by correlation function name
            }
            'n_points' and 'shortest_path_length' stay None when no function is processed.

        Raises:
            ValueError: if a stored record names a property that never occurs as
                'property_x' in the store.

        """

        prop_data = self.correlation_store.query(
            criteria={'property_x': {
                '$exists': True
            }},
            properties=['property_x'])
        # Sorting makes the matrix layout reproducible; iteration order of a set of
        # strings can differ between interpreter sessions.
        props = sorted({item['property_x'] for item in prop_data})
        # Name -> row/column index, so each record is placed without a linear scan.
        index_of = {name: idx for idx, name in enumerate(props)}
        n_props = len(props)

        def _zero_matrix():
            # Fresh n_props x n_props nested list of 0.0 values.
            return np.zeros(shape=(n_props, n_props)).tolist()

        out = {
            'properties': props,
            'n_points': None,
            'shortest_path_length': None,
            'correlation': {}
        }

        if not func_name:
            func_name = list(self._funcs.keys())

        if isinstance(func_name, str):
            func_name = [func_name]

        for f in func_name:
            data = self.correlation_store.query(
                criteria={'correlation_func': f})
            corr_matrix: list = _zero_matrix()

            # The info matrices are populated only from the first function processed.
            fill_info_matrices = out['n_points'] is None
            if fill_info_matrices:
                out['n_points'] = _zero_matrix()
                out['shortest_path_length'] = _zero_matrix()

            for d in data:
                prop_x, prop_y = d['property_x'], d['property_y']
                correlation = d['correlation']
                n_points = d['n_points']
                path_length = d['shortest_path_length']
                try:
                    ia, ib = index_of[prop_x], index_of[prop_y]
                except KeyError as err:
                    # Same exception type the previous list.index() lookup raised.
                    raise ValueError(
                        "Property {} is not among the indexed properties".format(
                            err)) from err
                corr_matrix[ia][ib] = correlation

                if fill_info_matrices:
                    out['n_points'][ia][ib] = n_points
                    out['n_points'][ib][ia] = n_points
                    out['shortest_path_length'][ia][ib] = path_length

            out['correlation'][f] = corr_matrix

        return out

    def as_dict(self):
        """
        Produce a JSON-serializable dictionary representation of this builder.

        The 'funcs' entry of the parent representation is replaced by a list of names.
        Only names that are also keys of self._correlation_funcs are kept; every other
        name is dropped with a warning, since function objects are not JSON serializable.

        Returns: (dict) representation of this builder as a JSON-serializable dictionary

        """
        representation = super(CorrelationBuilder, self).as_dict()
        known_names = self._correlation_funcs.keys()
        kept_names = []
        for name in representation['funcs'].keys():
            if name not in known_names:
                logger.warning(
                    "Cannot serialize custom function '{}'. Omitting.".format(
                        name))
                continue
            kept_names.append(name)

        if len(kept_names) == 0:
            logger.warning(
                "No functions were able to be serialized from this builder.")

        representation['funcs'] = kept_names
        return representation

Пример #16

Показать файл

Файл: correlation_with_mp.py Проект: shyshy903/propnet

    def get_items(self):
        """
        Collects scalar data from propnet and MP databases, aggregates it by property, and creates
        a generator to iterate over all pairs of properties, including pairing of the same property
        with itself for sanity check, and correlation functions.

        For every material (keyed by its 'task_id') each propnet property is reduced to a single
        averaged quantity, and the requested MP properties are merged in as stored. One item is
        then yielded per (x property, y property, correlation function) combination, built only
        from the materials that have a value for both properties.

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }

        """
        # Material id -> {property name: aggregated value}
        data = defaultdict(dict)

        # Request, for each propnet property, its '.mean', '.units' and '.quantities'
        # fields, plus the material id and the list of input quantities.
        propnet_data = self.propnet_store.query(
            criteria={},
            properties=[p + '.mean' for p in self.propnet_props] +
            [p + '.units' for p in self.propnet_props] +
            [p + '.quantities'
             for p in self.propnet_props] + ['task_id', 'inputs'])

        for material in propnet_data:
            mpid = material['task_id']

            # Property name -> list of pint quantities gathered for this material.
            input_d = defaultdict(list)
            # Start with the input quantities whose symbol is one of the requested properties.
            for q in material['inputs']:
                if q['symbol_type'] in self.propnet_props:
                    this_q = ureg.Quantity(q['value'], q['units'])
                    input_d[q['symbol_type']].append(this_q)

            for prop, values in material.items():
                if prop in self.propnet_props:
                    if prop in input_d.keys():
                        # Inputs already exist for this property: add every individual
                        # quantity so they are all averaged together below.
                        for q in values['quantities']:
                            input_d[prop].append(
                                ureg.Quantity(q['value'], q['units']))
                    else:
                        # No inputs for this property: use the stored mean directly.
                        this_q = ureg.Quantity(values['mean'], values['units'])
                        input_d[prop] = [this_q]

            # Collapse each property's quantities to their arithmetic mean.
            data[mpid].update({k: sum(v) / len(v) for k, v in input_d.items()})

        # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?

        mp_data = self.mp_store.query(criteria={},
                                      properties=self.mp_query_props +
                                      ['task_id'])

        for material in mp_data:
            mpid = material['task_id']
            for prop, value in material.items():
                if isinstance(value, dict):
                    # Nested field: keep each non-None sub-value whose dotted path
                    # ('prop.sub_prop') was requested, stored under the sub-property name.
                    for sub_prop, sub_value in value.items():
                        if prop + '.' + sub_prop in self.mp_query_props and sub_value is not None:
                            data[mpid][sub_prop] = sub_value
                elif prop in self.mp_query_props and value is not None:
                    data[mpid][prop] = value

        # product() produces all possible combinations of properties
        for prop_x, prop_y in product(self.propnet_props + self.mp_props,
                                      repeat=2):
            x = []
            y = []
            # Keep only materials that have both properties, so x and y stay aligned.
            for props_data in data.values():
                if prop_x in props_data.keys() and prop_y in props_data.keys():
                    x.append(props_data[prop_x])
                    y.append(props_data[prop_y])

            # MP data does not have units listed in database, so will be floats. propnet
            # data may not have the same units as the MP data, so is stored as pint
            # quantities. Here, the quantities are coerced into the units of MP data
            # as stored in symbols and converted to floats.
            if x and any(isinstance(v, ureg.Quantity) for v in x):
                x_float = [
                    xx.to(Registry("symbols")[prop_x].units).magnitude
                    if isinstance(xx, ureg.Quantity) else xx for xx in x
                ]
            else:
                x_float = x
            if y and any(isinstance(v, ureg.Quantity) for v in y):
                y_float = [
                    yy.to(Registry("symbols")[prop_y].units).magnitude
                    if isinstance(yy, ureg.Quantity) else yy for yy in y
                ]
            else:
                y_float = y

            # Emit one work item per correlation function for this property pair.
            for name, func in self._funcs.items():
                data_dict = {
                    'x_data': x_float,
                    'x_name': prop_x,
                    'y_data': y_float,
                    'y_name': prop_y,
                    'func': (name, func)
                }
                yield data_dict