def populate_point_information(prop_x, prop_y, current_func):
    """
    Build the information panel shown for a selected pair of properties.

    Args:
        prop_x (str): symbol name of the x-axis property
        prop_y (str): symbol name of the y-axis property
        current_func (str): name of the correlation function currently
            selected; its row is highlighted in the table. Must be an
            entry of ``correlation_funcs``.

    Returns:
        tuple: ``([markdown, table, link], True)`` where the list holds the
        point summary, the table of correlation values and a link to the
        data plot.

    Raises:
        PreventUpdate: if either property is missing or the store holds no
            correlation documents for the pair.
    """
    if not (prop_x and prop_y):
        raise PreventUpdate

    prop_x_name = Registry("symbols")[prop_x].display_names[0]
    prop_y_name = Registry("symbols")[prop_y].display_names[0]

    data = list(
        store.query(criteria={
            'property_x': prop_x,
            'property_y': prop_y
        }))
    if not data:
        # Nothing stored for this pair: leave the panel untouched instead of
        # failing with an IndexError on data[0] below.
        raise PreventUpdate

    path_length = data[0]['shortest_path_length']
    if path_length is None:
        path_text = "not connected"
    elif path_length == 0:
        path_text = "properties are the same"
    else:
        path_text = f"separated by {path_length} model"
        if path_length > 1:
            path_text += "s"

    point_text = dcc.Markdown(f"""
##### Point information
**x-axis property:** {prop_x_name}

**y-axis property:** {prop_y_name}

**distance apart on graph:** {path_text}

**number of data points:** {data[0]['n_points']}
""")

    # Build exactly one row per entry of correlation_funcs, in that order,
    # so the highlighted row index below always lines up with the table.
    values_by_func = {d['correlation_func']: d.get('correlation') for d in data}
    correlation_data = []
    for func in correlation_funcs:
        value = values_by_func.get(func)
        correlation_data.append({
            'Correlation Function': correlation_func_info[func]["name"],
            # A correlation is stored as None when its calculation failed
            # (and is absent if the function was never run); neither can be
            # formatted as a float.
            'Correlation Value': "N/A" if value is None else f"{value:0.5f}"
        })

    correlation_table = dt.DataTable(
        id='corr-table',
        data=correlation_data,
        columns=[{
            'id': val,
            'name': val
        } for val in ('Correlation Function', 'Correlation Value')],
        editable=False,
        style_data_conditional=[{
            'if': {
                'row_index': correlation_funcs.index(current_func)
            },
            "backgroundColor": "#3D9970",
            'color': 'white'
        }],
        style_cell={
            'font-family': 'HelveticaNeue',
            'text-align': 'left'
        },
        style_header={
            'fontWeight': 'bold',
            'font-family': 'HelveticaNeue',
            'text-align': 'left'
        })

    link_to_plot = dcc.Link("View the data plot",
                            href=f'/plot?x={prop_x}&y={prop_y}')

    return [point_text, correlation_table, link_to_plot], True
def tearDownClass(cls):
    """Remove every symbol not flagged ``is_builtin`` from the symbol registry."""
    # Collect the names first so the registry is not modified while it is
    # being iterated.
    custom_names = []
    for name, symbol in Registry("symbols").items():
        if symbol.is_builtin:
            continue
        custom_names.append(name)

    for name in custom_names:
        Registry("symbols").pop(name)
def setUpClass(cls):
    """
    Build the quantities and the expected dictionaries shared by the tests.

    For each group of quantities two expected forms are stored on the class:
    an ``*_as_dict(s)`` form whose ``symbol_type``/``provenance`` entries are
    live objects (or symbol names), and a ``*_json`` form in which those
    entries are replaced by plain dictionaries/lists.
    """
    add_builtin_models_to_registry()
    # Inspiration was taken from the GraphTest class
    # I tried to construct the dictionaries for comparison
    # without writing out every one explicitly by reusing
    # information where it was applicable.
    # If this is too unreadable, can change to writing it
    # out explicitly in a JSON file and importing it. Would
    # still need to replace some fields dynamically.
    symbols = StorageTest.generate_symbols()

    # Expected dict form of the custom symbols 'A', 'B' and 'C'.
    cls.custom_syms_as_dicts = {
        k: {'@module': 'propnet.core.symbols',
            '@class': 'Symbol',
            'name': k,
            'display_names': [k],
            'display_symbols': [k],
            'units': (1, ()),
            'shape': 1,
            'object_type': None,
            'comment': None,
            'category': 'property',
            'constraint': None,
            'default_value': None,
            'is_builtin': False} for k in ['A', 'B', 'C']
    }
    # 'C' is the object-category symbol: no units or shape, string payload.
    cls.custom_syms_as_dicts['C'].update(
        {"units": None,
         "shape": None,
         "object_type": "str",
         "category": "object"})

    # JSON form: same content, but the units tuple becomes nested lists.
    cls.custom_symbols_json = copy.deepcopy(cls.custom_syms_as_dicts)
    for k in ['A', 'B']:
        cls.custom_symbols_json[k]['units'] = [1, []]

    # Two 'A' quantities, and two 'B' quantities whose provenance names
    # 'model1' with the matching 'A' quantity as input.
    a = [QuantityFactory.create_quantity(symbols['A'], 19),
         QuantityFactory.create_quantity(symbols['A'], 23)]
    b = [QuantityFactory.create_quantity(symbols['B'], 38,
                                         provenance=ProvenanceElement(model='model1',
                                                                      inputs=[a[0]])),
         QuantityFactory.create_quantity(symbols['B'], 46,
                                         provenance=ProvenanceElement(model='model1',
                                                                      inputs=[a[1]]))]
    cls.quantities_custom_symbol = {"A": a,
                                    "B": b}

    # Expected StorageQuantity dicts for the custom-symbol quantities.
    cls.sq_custom_sym_as_dicts = {
        k: [{'@module': 'propnet.dbtools.storage',
             '@class': 'StorageQuantity',
             'internal_id': vv._internal_id,
             'data_type': 'NumQuantity',
             'symbol_type': symbols[k],
             'value': vv.magnitude,
             'units': 'dimensionless',
             'provenance': ProvenanceStore.from_provenance_element(vv.provenance),
             'tags': [],
             'uncertainty': None} for vv in v]
        for k, v in cls.quantities_custom_symbol.items()
    }

    # Expected JSON provenance: 'A' has no model/inputs; each 'B' entry
    # embeds the corresponding 'A' provenance inside its single input.
    provenances_json = {
        "A": [{'@module': 'propnet.dbtools.storage',
               '@class': 'ProvenanceStore',
               'model': None,
               'inputs': None,
               'source': aa.provenance.source}
              for aa in a]}
    provenances_json['B'] = [
        {'@module': 'propnet.dbtools.storage',
         '@class': 'ProvenanceStore',
         'model': 'model1',
         'inputs': [{'@module': 'propnet.dbtools.storage',
                     '@class': 'ProvenanceStoreQuantity',
                     'data_type': 'NumQuantity',
                     'symbol_type': cls.custom_symbols_json['A'],
                     'internal_id': q.provenance.inputs[0]._internal_id,
                     'tags': [],
                     'provenance': p}],
         'source': q.provenance.source}
        for q, p in zip(b, provenances_json['A'])]

    # JSON form of the storage quantities: swap in the JSON symbol and
    # provenance dictionaries built above.
    cls.sq_custom_sym_json = copy.deepcopy(cls.sq_custom_sym_as_dicts)
    for sym in ['A', 'B']:
        for q, p in zip(cls.sq_custom_sym_json[sym], provenances_json[sym]):
            q['symbol_type'] = cls.custom_symbols_json[sym]
            q['provenance'] = p

    # Quantities of registered symbols: band gaps, and refractive indices
    # derived from them with the 'band_gap_refractive_index_moss' model.
    band_gaps = [QuantityFactory.create_quantity('band_gap', 3.3, 'eV'),
                 QuantityFactory.create_quantity('band_gap', 2.1, 'eV')]

    bg_ri_model = Registry("models")['band_gap_refractive_index_moss']
    refractive_indices = [bg_ri_model.evaluate({"Eg": bg}).pop('refractive_index')
                          for bg in band_gaps]

    cls.quantities_canonical_symbol = {"band_gaps": band_gaps,
                                       "refractive_indices": refractive_indices}

    # Reuse the custom-symbol dicts as templates: rename the keys, overwrite
    # the fields that differ and drop 'value', which is kept separately in
    # sq_canonical_sym_values below.
    cls.sq_canonical_sym_as_dicts_no_value = copy.deepcopy(cls.sq_custom_sym_as_dicts)
    cls.sq_canonical_sym_as_dicts_no_value['band_gaps'] = cls.sq_canonical_sym_as_dicts_no_value.pop('A')
    cls.sq_canonical_sym_as_dicts_no_value['refractive_indices'] = cls.sq_canonical_sym_as_dicts_no_value.pop('B')

    for d, sq in zip(cls.sq_canonical_sym_as_dicts_no_value['band_gaps'], band_gaps):
        d.update({
            "internal_id": sq._internal_id,
            "symbol_type": "band_gap",
            "units": "electron_volt",
            "provenance": ProvenanceStore.from_provenance_element(sq.provenance)
        })
        d.pop('value')

    for d, sq in zip(cls.sq_canonical_sym_as_dicts_no_value['refractive_indices'], refractive_indices):
        d.update({
            "internal_id": sq._internal_id,
            "symbol_type": "refractive_index",
            "units": "dimensionless",
            "provenance": ProvenanceStore.from_provenance_element(sq.provenance)
        })
        d.pop('value')

    # Expected numeric values, stored apart from the dictionaries above.
    cls.sq_canonical_sym_values = {"band_gaps": [3.3, 2.1],
                                   "refractive_indices": [2.316340583741216, 2.593439239956374]}

    provenances_json['band_gaps'] = [
        {'@module': 'propnet.dbtools.storage',
         '@class': 'ProvenanceStore',
         'model': None,
         'inputs': None,
         'source': bg.provenance.source}
        for bg in band_gaps
    ]

    provenances_json['refractive_indices'] = [{
        '@module': 'propnet.dbtools.storage',
        '@class': 'ProvenanceStore',
        'model': 'band_gap_refractive_index_moss',
        'inputs': [{'@module': 'propnet.dbtools.storage',
                    '@class': 'ProvenanceStoreQuantity',
                    'data_type': 'NumQuantity',
                    'symbol_type': 'band_gap',
                    'internal_id': bg._internal_id,
                    'tags': [],
                    'provenance': pj}],
        'source': ri.provenance.source}
        for bg, pj, ri in zip(band_gaps,
                              provenances_json['band_gaps'],
                              refractive_indices)
    ]

    cls.sq_canonical_sym_json_no_value = copy.deepcopy(cls.sq_canonical_sym_as_dicts_no_value)

    for sym in ["band_gaps", "refractive_indices"]:
        for q, p in zip(cls.sq_canonical_sym_json_no_value[sym], provenances_json[sym]):
            q['provenance'] = p

    # Quantity carrying an uncertainty: weighted mean of the two 'B' values.
    # 'value' and 'uncertainty' are left out of the dict and kept in
    # sq_with_uncertainty_numbers.
    cls.quantity_with_uncertainty = NumQuantity.from_weighted_mean(b)
    cls.sq_with_uncertainty_as_dict_no_numbers = {
        '@module': 'propnet.dbtools.storage',
        '@class': 'StorageQuantity',
        'internal_id': cls.quantity_with_uncertainty._internal_id,
        'data_type': 'NumQuantity',
        'symbol_type': symbols['B'],
        'units': 'dimensionless',
        'provenance': ProvenanceStore.from_provenance_element(
            cls.quantity_with_uncertainty.provenance),
        'tags': []}

    # NOTE: from here on provenances_json is rebound to a single provenance
    # dict (it was a dict of lists above), and the comprehension variable
    # `b` is local to the comprehension.
    provenances_json = {
        '@module': 'propnet.dbtools.storage',
        '@class': 'ProvenanceStore',
        'model': 'aggregation',
        'inputs': [
            {'@module': 'propnet.dbtools.storage',
             '@class': 'ProvenanceStoreQuantity',
             'data_type': 'NumQuantity',
             'symbol_type': cls.custom_symbols_json['B'],
             'internal_id': b['internal_id'],
             'tags': [],
             'provenance': b['provenance']}
            for b in cls.sq_custom_sym_json['B']],
        'source': cls.quantity_with_uncertainty.provenance.source
    }

    cls.sq_with_uncertainty_json_no_numbers = copy.deepcopy(cls.sq_with_uncertainty_as_dict_no_numbers)
    cls.sq_with_uncertainty_json_no_numbers.update({"symbol_type": cls.custom_symbols_json['B'],
                                                    "provenance": provenances_json})
    cls.sq_with_uncertainty_numbers = {"value": 42.0,
                                       "uncertainty": 4.0}

    # Object (non-numeric) quantity built from symbol 'C'; its expected dict
    # starts as a copy of the first 'A' entry with the differing fields
    # overwritten.
    obj_symbol = symbols['C']
    cls.object_quantity = QuantityFactory.create_quantity(obj_symbol, "Test string")
    cls.sq_object_as_dict = copy.deepcopy(cls.sq_custom_sym_as_dicts['A'][0])
    cls.sq_object_as_dict.update({
        "data_type": "ObjQuantity",
        "symbol_type": symbols['C'],
        "internal_id": cls.object_quantity._internal_id,
        "value": "Test string",
        "units": None,
        "provenance": ProvenanceStore.from_provenance_element(cls.object_quantity.provenance)
    })
    cls.sq_object_json = copy.deepcopy(cls.sq_object_as_dict)
    cls.sq_object_json.update(
        {"symbol_type": cls.custom_syms_as_dicts['C'],
         "provenance": {'@module': 'propnet.dbtools.storage',
                        '@class': 'ProvenanceStore',
                        'model': None,
                        'inputs': None,
                        'source': cls.object_quantity.provenance.source}}
    )

    # This setting allows dict differences to be shown in full
    cls.maxDiff = None
import logging logger = logging.getLogger(__name__) mpr = MPRester() try: store = loadfn(environ["PROPNET_STORE_FILE"]) store.connect() except (ServerSelectionTimeoutError, KeyError): from maggma.stores import MemoryStore store = MemoryStore() store.connect() # layout won't work if database is down, but at least web app will stay up scalar_symbols = {k: v for k, v in Registry("symbols").items() if (v.category == 'property' and v.shape == 1)} warning_layout = html.Div('No database connection could be established.', style={'font-family': 'monospace', 'color': 'rgb(211, 84, 0)', 'text-align': 'left', 'font-size': '1.2em'}) else: cut_off = 100 # need at least this many available quantities for plot """ scalar_symbols = {k: v for k, v in Registry("symbols").items() if (v.category == 'property' and v.shape == 1 and store.query( criteria={f'{k}.mean': {'$exists': True}}).count() > cut_off)} """ scalar_symbols = {
def _update_globals():
    """Bind every model flagged ``is_builtin`` in the model registry as a
    module-level global under its registry name."""
    for model_name, model_obj in Registry("models").items():
        if not model_obj.is_builtin:
            continue
        globals()[model_name] = model_obj
def tearDownClass(cls):
    """Class-level teardown: call ``Registry.clear_all_registries()`` so
    registry contents set up for this test class are discarded."""
    Registry.clear_all_registries()
def setUpClass(cls):
    """Class-level setup: call ``Registry.clear_all_registries()`` so the
    tests in this class start from cleared registries."""
    Registry.clear_all_registries()
from propnet.ext.matproj import MPRester from propnet.ext.aflow import AflowAdapter MPR = MPRester() AFA = AflowAdapter() graph_evaluator = Graph(parallel=True, max_workers=4) # explicitly making this an OrderedDict so we can go back from the # display name to the symbol name # Removed condition symbols from table until we can handle combinatorics blow-up that results # from adding a temperature -cml # TODO: Add condition symbols back when combinartorics problem solved SCALAR_SYMBOLS = OrderedDict({ k: v for k, v in sorted(Registry("symbols").items(), key=lambda x: x[1].display_names[0]) if (v.category == 'property' and v.shape == 1) }) ROW_IDX_TO_SYMBOL_NAME = [symbol for symbol in SCALAR_SYMBOLS.keys()] DEFAULT_ROWS = [{ 'Property': symbol.display_names[0], 'Editable Value': "" } for symbol in SCALAR_SYMBOLS.values()] REMAINING_SYMBOLS = OrderedDict({ k: v for k, v in sorted(Registry("symbols").items(), key=lambda x: x[1].display_names[0]) if not ((v.category == 'property' or v.category == 'condition')
def evaluate(input_rows, data, aggregate):
    """
    Evaluate user-supplied quantities on the propnet graph and render the
    derived quantities as a graph and a table.

    Args:
        input_rows (list of dict): rows of the input table; the
            'Editable Value' entry of each row is parsed as a quantity for
            the symbol at the same index of ROW_IDX_TO_SYMBOL_NAME
        data (str): optional JSON string (decoded with MontyDecoder) holding
            additional quantities
        aggregate (bool): if truthy, show aggregated quantities where
            available instead of every individual quantity

    Returns:
        list: ``[graph component, html.Br(), output table]``

    Raises:
        PreventUpdate: if a table value cannot be parsed/converted or no
            quantities were supplied at all.
    """
    quantities = []
    for row_idx, row in enumerate(input_rows):
        if not row['Editable Value']:
            continue
        try:
            value = ureg.parse_expression(row['Editable Value'])
            units = Registry("units").get(ROW_IDX_TO_SYMBOL_NAME[row_idx])
            value.ito(units)
        except Exception:
            # Someone put an invalid value in the table
            # TODO: Make error known to the user
            raise PreventUpdate
        quantities.append(
            QuantityFactory.create_quantity(
                symbol_type=ROW_IDX_TO_SYMBOL_NAME[row_idx], value=value))

    if data and len(data) > 0:
        quantities.extend(json.loads(data, cls=MontyDecoder).values())

    if not quantities:
        raise PreventUpdate

    material = Material()
    for quantity in quantities:
        material.add_quantity(quantity)

    output_material = graph_evaluator.evaluate(material, timeout=5)

    if not aggregate:
        output_quantities = output_material.get_quantities()
    else:
        aggregated_quantities = output_material.get_aggregated_quantities()
        # Quantities whose symbol has no aggregate are listed individually.
        leftovers = [
            q for q in output_material.get_quantities()
            if q.symbol not in aggregated_quantities
        ]
        output_quantities = list(aggregated_quantities.values()) + leftovers

    output_rows = []
    for quantity in output_quantities:
        output_rows.append({
            'Property': quantity.symbol.display_names[0],
            'Value': quantity.pretty_string(sigfigs=3)
        })

    table_columns = [{'id': val, 'name': val} for val in ('Property', 'Value')]
    output_table = dt.DataTable(id='output-table',
                                data=output_rows,
                                columns=table_columns,
                                editable=False,
                                **DATA_TABLE_STYLE)

    # TODO: clean up
    input_quantity_names = [q.symbol for q in quantities]
    derived_quantity_names = \
        {q.symbol for q in output_quantities} - set(input_quantity_names)

    # Names of the models recorded in the provenance of every output
    # quantity, resolved to registered models (unregistered names dropped).
    model_names = {
        output_q.provenance.model
        for output_q in output_material.get_quantities()
    }
    models_evaluated = []
    for model_name in model_names:
        model = Registry("models").get(model_name)
        if model is not None:
            models_evaluated.append(model)

    derivation_pathway = {
        'inputs': input_quantity_names,
        'outputs': list(derived_quantity_names),
        'models': models_evaluated
    }
    material_graph_data = graph_conversion(
        propnet_nx_graph, derivation_pathway=derivation_pathway)

    graph_options = dcc.Checklist(id='material-graph-options',
                                  options=[{
                                      'label': 'Show models',
                                      'value': 'show_models'
                                  }, {
                                      'label': 'Show properties',
                                      'value': 'show_properties'
                                  }],
                                  value=['show_properties'],
                                  labelStyle={'display': 'inline-block'})
    graph_view = Cytoscape(id='material-graph',
                           elements=material_graph_data,
                           stylesheet=GRAPH_STYLESHEET,
                           layout=GRAPH_LAYOUT_CONFIG,
                           **GRAPH_SETTINGS['full_view'])
    output_graph = html.Div(children=[graph_options, graph_view])

    return [output_graph, html.Br(), output_table]
def get_data_from_full_db(self, prop_x, prop_y):
    """
    Gather scalar data for two properties from the material-indexed propnet
    store, averaging the values found for each material and optionally
    sampling the materials.

    Args:
        prop_x (str): name of property x
        prop_y (str): name of property y

    Returns:
        dict: lists of per-material mean magnitudes keyed by property name

    Raises:
        ValueError: if a matched material yields no quantities for one of
            the properties.
    """

    def _has_property(prop):
        # A material qualifies when the property appears among its inputs
        # or as a top-level (derived) field.
        return {'$or': [{'inputs.symbol_type': prop},
                        {prop: {'$exists': True}}]}

    criteria = {'$and': [_has_property(prop_x), _has_property(prop_y)]}
    properties = [prop_x + '.quantities', prop_y + '.quantities', 'inputs']

    if self.sample_size is not None:
        pipeline = [
            {'$match': criteria},
            {'$sample': {'size': self.sample_size}},
            {'$project': {p: True for p in properties}},
        ]
        pn_data = self.propnet_store.collection.aggregate(
            pipeline, allowDiskUse=True)
    else:
        pn_data = self.propnet_store.query(criteria=criteria,
                                           properties=properties)

    x_unit = Registry("units")[prop_x]
    y_unit = Registry("units")[prop_y]

    # When both names are the same, process the property only once to avoid
    # duplicating the work and the data.
    if prop_x == prop_y:
        targets = ((prop_x, x_unit), )
    else:
        targets = ((prop_x, x_unit), (prop_y, y_unit))

    data = defaultdict(list)
    for material in pn_data:
        for prop, unit in targets:
            # Collect all data with units for this material, then store the
            # magnitude of the mean after converting to the registered unit.
            values = [
                ureg.Quantity(q['value'], q['units'])
                for q in material['inputs'] if q['symbol_type'] == prop
            ]
            if prop in material:
                for q in material[prop]['quantities']:
                    values.append(ureg.Quantity(q['value'], q['units']))

            if len(values) == 0:
                raise ValueError("Query for property {} gave no results"
                                 "".format(prop))
            mean_value = sum(values) / len(values)
            data[prop].append(mean_value.to(unit).magnitude)

    return data
def tearDownClass(cls):
    """Set the UnitStrippedWarning filter to "default" and remove every
    symbol not flagged ``is_builtin`` from the symbol registry."""
    warnings.filterwarnings("default", category=UnitStrippedWarning)

    # Collect the names first so the registry is not modified while it is
    # being iterated.
    custom_names = []
    for name, symbol in Registry("symbols").items():
        if symbol.is_builtin:
            continue
        custom_names.append(name)

    for name in custom_names:
        Registry("symbols").pop(name)
class CorrelationBuilder(Builder):
    """
    A class to calculate the correlation between properties derived by or used in propnet
    using a suite of regression tools. Uses the Builder architecture for optional parallel
    processing of data.

    Note: serialization of builder does not work with custom correlation functions, although
    interactive use does support them.
    """
    # Names of all scalar (shape == 1) symbols of category 'property',
    # evaluated once when the class is defined.
    PROPNET_PROPS = [
        v.name for v in Registry("symbols").values()
        if (v.category == 'property' and v.shape == 1)
    ]

    def __init__(self,
                 propnet_store,
                 correlation_store,
                 out_file=None,
                 funcs='linlsq',
                 props=None,
                 sample_size=None,
                 from_quantity_db=True,
                 **kwargs):
        """
        Constructor for the correlation builder.

        Args:
            propnet_store (Mongolike Store): store instance pointing to propnet collection
                with read access
            correlation_store (Mongolike Store): store instance pointing to collection with write access
            out_file (str): optional, filename to output data in JSON format (useful if using a MemoryStore
                for correlation_store)
            funcs (`str`, `callable`, list of `str` or `callable`) functions to use for correlation. Built-in
                functions can be specified by the following strings:

                linlsq (default): linear least-squares, reports R^2
                pearson: Pearson r-correlation, reports r
                spearman: Spearman rank correlation, reports r
                mic: maximal-information non-parametric exploration, reports maximal information coefficient
                ransac: random sample consensus (RANSAC) regression, reports score
                theilsen: Theil-Sen regression, reports score
                all: runs all correlation functions above
            props (`list` of `str`): optional, list of properties for which to calculate the correlation.
                Default is to calculate for all possible pairs (props=None)
            sample_size (int): optional, limits correlation calculation data to a random sample of size
                `sample_size`. Default: None (no limit)
            from_quantity_db (bool): True means propnet_store follows the quantity-indexed
                database schema, False means the full, material-indexed database schema.
                Note: querying quantity-indexed databases is considerably faster than material-indexed.
                Default: True (quantity schema)
            **kwargs: arguments to the Builder superclass

        Raises:
            ValueError: if a correlation function is invalid, none is
                selected, or `sample_size` is smaller than 2.
        """
        self.propnet_store = propnet_store
        self.from_quantity_db = from_quantity_db
        self.correlation_store = correlation_store
        self.out_file = out_file

        self._correlation_funcs = self.get_correlation_funcs()

        self._funcs = {}

        if not isinstance(funcs, list):
            funcs = [funcs]

        for f in funcs:
            if isinstance(f, str) and f == 'all':
                self._funcs.update(self._correlation_funcs)
            elif isinstance(f, str) and f in self._correlation_funcs.keys():
                self._funcs[f] = self._correlation_funcs[f]
            elif callable(f):
                # Custom callables are keyed by their qualified name
                name = f.__module__ + "." + f.__name__
                self._funcs[name] = f
            else:
                raise ValueError("Invalid correlation function: {}".format(f))

        if not self._funcs:
            raise ValueError("No valid correlation functions selected")

        self._props = props or self.PROPNET_PROPS

        if sample_size is not None and sample_size < 2:
            raise ValueError("Sample size must be greater than 1")
        self.sample_size = sample_size
        self.total = None

        super(CorrelationBuilder, self).__init__(sources=[propnet_store],
                                                 targets=[correlation_store],
                                                 **kwargs)

    @classmethod
    def get_correlation_funcs(cls):
        """
        Gets built-in correlation functions and their names.

        Built-in functions are the callables on the class whose attribute
        name starts with ``_cfunc_``; the prefix is stripped for the key.

        Returns:
            dict: dict of function handles keyed by name
        """
        return {
            f.replace('_cfunc_', ''): getattr(cls, f)
            for f in dir(cls)
            if re.match(r'^_cfunc_.+$', f) and callable(getattr(cls, f))
        }

    def get_items(self):
        """
        Accumulates data and generates data sets for pairs of properties
        coupled with correlation functions.

        Returns:
            (generator): yields dicts of data (see _make_data_combinations())
        """
        self.total = len(self._props)**2 * len(self._funcs)

        # combinations_with_replacement() produces all possible pairs of properties
        # without repeating, i.e. will give AB but not BA. Code below manually
        # produces "BA" so that we don't have to re-query the database.
        for prop_x, prop_y in combinations_with_replacement(self._props, 2):
            if self.from_quantity_db:
                data = self.get_data_from_quantity_db(
                    self.propnet_store,
                    prop_x,
                    prop_y,
                    sample_size=self.sample_size)
            else:
                data = self.get_data_from_full_db(prop_x, prop_y)
            yield from self._make_data_combinations(prop_x, prop_y, data)

    @staticmethod
    def get_data_from_quantity_db(store,
                                  *props,
                                  sample_size=None,
                                  include_id=False):
        """
        Collects scalar data from the quantity-only propnet database, aggregates it by
        material and property, and samples it if desired.

        Args:
            store (maggma.stores.Store): MongoDB store instance for quantity database
            *props (str): property names as strings
            sample_size (int): If specified, limits the number of returned records to
                sample_size, randomly selected. If total of records is less than sample_size,
                only those records are returned. Default: None (all records)
            include_id (bool): True includes the '_id' field, which contains the material key
                for the record. Default: False (do not include the field)

        Returns:
            dict: dictionary of data keyed by property name
        """
        # This aggregation query collects the quantities, groups them by material
        # and averages the values for that material, then samples them (if specified)
        match_stage = {
            '$match': {
                '$or': [{
                    'symbol_type': prop
                } for prop in props]
            }
        }

        group_stage = {'$group': {'_id': '$material_key'}}
        for prop in props:
            group_stage['$group'].update({
                prop: {
                    '$avg': {
                        '$cond': [{
                            "$eq": ['$symbol_type', prop]
                        }, '$value', None]
                    }
                }
            })

        pipeline = [match_stage, group_stage]

        if sample_size is not None:
            pipeline.append({'$sample': {'size': sample_size}})

        query = store.collection.aggregate(pipeline=pipeline,
                                           allowDiskUse=True)
        data = defaultdict(list)
        for m in query:
            # Keep a material only if every requested property has a finite
            # average, so the per-property lists stay aligned.
            if all(m[prop] is not None and np.isfinite(m[prop])
                   for prop in props):
                for prop in props:
                    data[prop].append(m[prop])
                if include_id:
                    data['_id'].append(m['_id'])

        return dict(data)

    def get_data_from_full_db(self, prop_x, prop_y):
        """
        Collects scalar data from full propnet database, aggregates it by property,
        and samples it if desired.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y

        Returns:
            dict: dictionary of data keyed by property name

        Raises:
            ValueError: if a matched material yields no quantities for one
                of the properties.
        """

        # Get all materials which have both properties in the inputs or outputs
        criteria = {
            '$and': [{
                '$or': [{
                    'inputs.symbol_type': prop_x
                }, {
                    prop_x: {
                        '$exists': True
                    }
                }]
            }, {
                '$or': [{
                    'inputs.symbol_type': prop_y
                }, {
                    prop_y: {
                        '$exists': True
                    }
                }]
            }]
        }
        properties = [prop_x + '.quantities', prop_y + '.quantities', 'inputs']

        if self.sample_size is None:
            pn_data = self.propnet_store.query(criteria=criteria,
                                               properties=properties)
        else:
            pipeline = [
                {
                    '$match': criteria
                },
                {
                    '$sample': {
                        'size': self.sample_size
                    }
                },
                {
                    '$project': {p: True
                                 for p in properties}
                },
            ]
            pn_data = self.propnet_store.collection.aggregate(
                pipeline, allowDiskUse=True)

        x_unit = Registry("units")[prop_x]
        y_unit = Registry("units")[prop_y]

        if prop_x == prop_y:
            # This is to avoid duplicating the work and the data
            props = (prop_x, )
            units = (x_unit, )
        else:
            props = (prop_x, prop_y)
            units = (x_unit, y_unit)

        data = defaultdict(list)
        for material in pn_data:
            # Collect all data with units for this material
            # and calculate the mean, convert units, store magnitude of mean
            for prop, unit in zip(props, units):
                qs = [
                    ureg.Quantity(q['value'], q['units'])
                    for q in material['inputs'] if q['symbol_type'] == prop
                ]
                if prop in material:
                    qs.extend([
                        ureg.Quantity(q['value'], q['units'])
                        for q in material[prop]['quantities']
                    ])

                if len(qs) == 0:
                    raise ValueError("Query for property {} gave no results"
                                     "".format(prop))
                prop_mean = sum(qs) / len(qs)
                data[prop].append(prop_mean.to(unit).magnitude)

        return data

    def _make_data_combinations(self, prop_x, prop_y, data):
        """
        Generates combinations of properties and desired correlation functions
        for evaluation.

        Args:
            prop_x (str): name of property x
            prop_y (str): name of property y
            data (dict): dictionary of data keyed by property name

        Returns:
            (generator) a generator providing a dictionary with the data for correlation:
                {'x_data': (list<float>) data for independent property (x-axis),
                 'x_name': (str) name of independent property,
                 'y_data': (list<float>) data for dependent property (y-axis),
                 'y_name': (str) name of dependent property,
                 'func': (tuple<str, function>) name and function handle for correlation function
                 }
        """
        # So we get AB and BA without re-querying, but not two AA
        if prop_x == prop_y:
            prop_combos = ((prop_x, prop_x), )
        else:
            prop_combos = ((prop_x, prop_y), (prop_y, prop_x))

        for x, y in prop_combos:
            for name, func in self._funcs.items():
                data_dict = {
                    'x_data': data.get(x, []),
                    'x_name': x,
                    'y_data': data.get(y, []),
                    'y_name': y,
                    'func': (name, func)
                }
                yield data_dict

    def process_item(self, item):
        """
        Run correlation calculation on a pair of properties using the specified function.

        Args:
            item: (dict) input provided by get_items() (see _make_data_combinations() for structure)

        Returns:
            (tuple<str, str, float, str, int, int>) output of calculation with necessary
            information about calculation included. Format in tuple:
                independent property (x-axis) name,
                dependent property (y-axis) name,
                correlation value (or the exception raised by the correlation function),
                correlation function name,
                number of data points used for correlation,
                length of the shortest path between the properties on the propnet graph,
                    taken as the smaller of the x->y and y->x path lengths.
                    Note: if no connection exists in either direction, the
                        path length will be None.
        """
        prop_x, prop_y = item['x_name'], item['y_name']
        data_x, data_y = item['x_data'], item['y_data']
        func_name, func = item['func']
        n_points = len(data_x)

        g = Graph()
        try:
            path_length_xy = g.get_degree_of_separation(prop_x, prop_y)
            path_length_yx = g.get_degree_of_separation(prop_y, prop_x)
        except ValueError:
            # This shouldn't happen...but just in case
            path_length_xy = None
            path_length_yx = None

        # Use the shorter of the directions that exist. Filtering on
        # `is not None` (rather than `a or b`) keeps a legitimate length
        # of 0 from being mistaken for "no path".
        known_lengths = [
            length for length in (path_length_xy, path_length_yx)
            if length is not None
        ]
        path_length = min(known_lengths) if known_lengths else None

        if n_points < 2:
            result = 0.0
        else:
            try:
                result = func(data_x, data_y)
            except Exception as ex:
                # If correlation fails, catch the error, save it, and move on
                result = ex
        return prop_x, prop_y, result, func_name, n_points, path_length

    @staticmethod
    def _cfunc_mic(x, y):
        """
        Get maximal information coefficient for data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns:
            (float) maximal information coefficient
        """
        from minepy import MINE
        m = MINE()
        m.compute_score(x, y)
        return m.mic()

    @staticmethod
    def _cfunc_linlsq(x, y):
        """
        Get R^2 value for linear least-squares fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns:
            (float) R^2 value
        """
        from scipy import stats
        fit = stats.linregress(x, y)
        return fit.rvalue**2

    @staticmethod
    def _cfunc_pearson(x, y):
        """
        Get R value for Pearson fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns:
            (float) Pearson R value
        """
        from scipy import stats
        fit = stats.pearsonr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_spearman(x, y):
        """
        Get R value for Spearman fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns:
            (float) Spearman R value
        """
        from scipy import stats
        fit = stats.spearmanr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_ransac(x, y):
        """
        Get random sample consensus (RANSAC) regression score for data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns:
            (float) RANSAC score
        """
        from sklearn.linear_model import RANSACRegressor
        # Fixed random_state keeps the score reproducible between runs
        r = RANSACRegressor(random_state=21)
        x_coeff = np.array(x)[:, np.newaxis]
        r.fit(x_coeff, y)
        return r.score(x_coeff, y)

    @staticmethod
    def _cfunc_theilsen(x, y):
        """
        Get Theil-Sen regression score for data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns:
            (float) Theil-Sen score
        """
        from sklearn.linear_model import TheilSenRegressor
        # Fixed random_state keeps the score reproducible between runs
        r = TheilSenRegressor(random_state=21)
        x_coeff = np.array(x)[:, np.newaxis]
        r.fit(x_coeff, y)
        return r.score(x_coeff, y)

    @staticmethod
    def _make_doc_id(prop_x, prop_y, func_name):
        """
        Build a reproducible integer id for a correlation document.

        The built-in hash() of strings is salted per interpreter process, so
        ids derived from it change between runs and a re-run would insert
        duplicates instead of overwriting. A SHA-1 digest is stable.

        Args:
            prop_x (str): name of the x-axis property
            prop_y (str): name of the y-axis property
            func_name (str): name of the correlation function

        Returns:
            (int) non-negative id below 2**60 (fits a signed 64-bit integer)
        """
        import hashlib
        # The unit separator keeps ("ab", "c") distinct from ("a", "bc")
        key = "\x1f".join((prop_x, prop_y, func_name))
        return int(hashlib.sha1(key.encode("utf-8")).hexdigest()[:15], 16)

    def update_targets(self, items):
        """
        Write correlation data to Mongo store.

        Args:
            items: (list<tuple>) list of results output by process_item()
        """
        data = []
        for item in items:
            prop_x, prop_y, result, func_name, n_points, path_length = item
            d = {
                'property_x': prop_x,
                'property_y': prop_y,
                'correlation_func': func_name,
                'n_points': n_points,
                'shortest_path_length': path_length,
                # NOTE: documents written by earlier versions used ids from
                # the salted built-in hash() and are not matched by this id.
                'id': self._make_doc_id(prop_x, prop_y, func_name)
            }
            if not isinstance(result, Exception):
                d['correlation'] = result
            else:
                # Failed correlations are recorded with the error instead
                d['correlation'] = None
                d['error'] = (result.__class__.__name__, result.args)
            data.append(d)

        if data:
            # Nothing to write for an empty batch
            self.correlation_store.update(data, key='id')

    def finalize(self, cursor=None):
        """
        Outputs correlation data to JSON file, if specified in instantiation, and runs
        clean-up function for Builder.

        Args:
            cursor: (Mongo Store cursor) optional, cursor to close if not automatically closed.
        """
        props_to_index = [
            'property_x', 'property_y', 'correlation_func', 'correlation',
            'shortest_path_length'
        ]
        for prop in props_to_index:
            if not self.correlation_store.ensure_index(prop):
                logger.warning(
                    "Could not add index for property {}".format(prop))
        if self.out_file:
            try:
                self.write_correlation_data_file(self.out_file)
            except OSError:
                logger.warning(
                    "Cannot open file for writing! Skipping file writing.")

        super(CorrelationBuilder, self).finalize(cursor)

    def write_correlation_data_file(self, out_file):
        """
        Gets data dictionary containing correlation matrices and outputs to a file.

        Args:
            out_file: (str) file path and name for output to JSON file
        """
        matrix = self.get_correlation_matrices()
        with open(out_file, 'w') as f:
            json.dump(matrix, f)

    def get_correlation_matrices(self, func_name=None):
        """
        Builds document containing the correlation matrix with relevant data regarding
        correlation algorithm and properties of the data set.

        Args:
            func_name: (str or list<str>) optional, name(s) of the correlation functions
                to include in the document
                default: None, which is to include all that were run by this builder.

        Returns:
            (dict) document containing correlation data. Format:
                {'properties': (list<str>) names of properties calculated, sorted, in
                    the order in which they are indexed in the matrices
                 'n_points': (list<list<int>>) list of lists (i.e. matrix) containing
                    the number of data points evaluated during the fitting procedure
                 'shortest_path_length': (list<list<int>>) matrix of shortest path
                    lengths, indexed [x][y]
                 'correlation': (dict<str: list<list<float>>>) dictionary of matrices
                    containing correlation results, keyed by correlation function name
                 }
        """
        prop_data = self.correlation_store.query(
            criteria={'property_x': {
                '$exists': True
            }},
            properties=['property_x'])
        # Sorted so the matrix layout is reproducible between calls
        props = sorted(set(item['property_x'] for item in prop_data))
        # Name -> matrix index, so each document needs no linear search
        prop_index = {prop: idx for idx, prop in enumerate(props)}

        out = {
            'properties': props,
            'n_points': None,
            'shortest_path_length': None,
            'correlation': {}
        }

        if not func_name:
            func_name = list(self._funcs.keys())

        if isinstance(func_name, str):
            func_name = [func_name]

        for f in func_name:
            data = self.correlation_store.query(
                criteria={'correlation_func': f})
            corr_matrix: list = np.zeros(shape=(len(props),
                                                len(props))).tolist()

            # n_points and path lengths are taken from the documents of the
            # first function only and reused for the others.
            fill_info_matrices = False
            if not out['n_points'] and not out['shortest_path_length']:
                fill_info_matrices = True
                out['n_points'] = np.zeros(shape=(len(props),
                                                  len(props))).tolist()
                out['shortest_path_length'] = np.zeros(
                    shape=(len(props), len(props))).tolist()

            for d in data:
                ia = prop_index[d['property_x']]
                ib = prop_index[d['property_y']]
                corr_matrix[ia][ib] = d['correlation']
                if fill_info_matrices:
                    out['n_points'][ia][ib] = d['n_points']
                    out['n_points'][ib][ia] = d['n_points']
                    out['shortest_path_length'][ia][ib] = d['shortest_path_length']

            out['correlation'][f] = corr_matrix

        return out

    def as_dict(self):
        """
        Returns the representation of the builder as a dictionary in JSON serializable format.
        Note: because functions are not JSON serializable, custom functions are omitted when
            serializing the object.

        Returns:
            (dict) representation of this builder as a JSON-serializable dictionary
        """
        d = super(CorrelationBuilder, self).as_dict()

        serialized_funcs = []
        for name in d['funcs'].keys():
            # Only built-in functions can be restored from their name
            if name in self._correlation_funcs.keys():
                serialized_funcs.append(name)
            else:
                logger.warning(
                    "Cannot serialize custom function '{}'. Omitting.".format(
                        name))

        if not serialized_funcs:
            logger.warning(
                "No functions were able to be serialized from this builder.")

        d['funcs'] = serialized_funcs

        return d
def setUpClass(cls) -> None:
    """Class-level setup: clear all registries, then register the built-in
    symbols via ``add_builtin_symbols_to_registry()``."""
    # Clear first so only what is registered below remains.
    Registry.clear_all_registries()
    add_builtin_symbols_to_registry()
def process(self, item):
    """
    Build the propnet document for a single materials document.

    The item is decoded, its mapped fields are attached to a Material as
    quantities, the graph evaluator is run on that material, and the newly
    derived quantities (plus aggregated statistics) are collected into a
    sanitized dictionary.

    Args:
        item: (dict) materials document; must contain 'task_id'

    Returns:
        (dict) sanitized document holding the input quantities under 'inputs',
        one sub-document per symbol with newly derived quantities, and the
        'task_id', 'pretty_formula' and 'deprecated' fields ('sbxn' as well
        when self.include_sandboxed is set)
    """
    if self.graph_parallel and not self.allow_child_process and \
            current_process().name != "MainProcess":
        logger.warning(
            "It appears derive_quantities() is running "
            "in a child process, possibly in a parallelized "
            "Runner.\nThis is not recommended and will deteriorate "
            "performance.")

    # Turn the raw document into python objects before reading any field
    item = MontyDecoder().process_decoded(item)
    task_id = item['task_id']

    logger.info("Populating material for %s", task_id)
    material = Material()

    # Every quantity taken directly from the document shares one provenance
    source_info = {
        "source": self.source_name,
        "source_key": task_id,
        "date_created": item.get('created_at')
    }
    provenance = ProvenanceElement(source=source_info)

    for doc_key, symbol_name in self.materials_symbol_map.items():
        field_value = pydash.get(item, doc_key)
        if not field_value:
            # Falsy values are skipped
            continue
        quantity = QuantityFactory.create_quantity(
            symbol_name,
            field_value,
            units=Registry("units").get(symbol_name, None),
            provenance=provenance)
        material.add_quantity(quantity)

    # Add custom things, e. g. computed entry
    computed_entry = get_entry(item)
    if not computed_entry:
        logger.info("Unable to create computed entry for {}".format(
            item['task_id']))
    else:
        material.add_quantity(
            QuantityFactory.create_quantity("computed_entry",
                                            computed_entry,
                                            provenance=provenance))

    material.add_quantity(
        QuantityFactory.create_quantity("external_identifier_mp",
                                        task_id,
                                        provenance=provenance))

    # Captured before evaluation: these are the initial inputs
    input_quantities = material.symbol_quantities_dict

    # Use graph to generate expanded quantity pool
    logger.info("Evaluating graph for %s", task_id)
    new_material = self._graph_evaluator.evaluate(
        material, timeout=self.graph_timeout)

    # Format document and return
    logger.info("Creating doc for %s", task_id)

    # The initial inputs that were used to derive properties of this material
    stored_inputs = []
    for input_q in chain.from_iterable(input_quantities.values()):
        stored_inputs.append(StorageQuantity.from_quantity(input_q))
    doc = {"inputs": stored_inputs}

    for symbol, quantities in new_material.symbol_quantities_dict.items():
        # Same count as the inputs means nothing new was derived for this
        # symbol, so it is not written out as a derived quantity.
        if len(quantities) == len(input_quantities[symbol]):
            continue

        symbol_doc = {}
        try:
            # Write out all quantities as dicts including the
            # internal ID for provenance tracing
            sanitized = [
                jsanitize(StorageQuantity.from_quantity(q), strict=True)
                for q in quantities
            ]
        except AttributeError as ex:
            # Only the "not JSON serializable" flavour of AttributeError is
            # handled here; anything else propagates unchanged.
            if "object has no attribute 'as_dict'" not in ex.args[0]:
                raise ex
            # Write error to db and logger
            errmsg = "Quantity of Symbol '{}' is not ".format(symbol.name) + \
                     "JSON serializable. Cannot write quantities to database!"
            logger.error(errmsg)
            symbol_doc['error'] = errmsg
            sanitized = []

        symbol_doc['quantities'] = sanitized
        doc[symbol.name] = symbol_doc

    for symbol, quantity in new_material.get_aggregated_quantities().items():
        if symbol.name not in doc:
            # No new quantities were derived
            continue
        # Store mean and std dev for aggregated quantities alongside the
        # list of quantities already recorded for this symbol
        magnitude = quantity.magnitude
        doc[symbol.name].update({
            "mean": unumpy.nominal_values(magnitude).tolist(),
            "std_dev": unumpy.std_devs(magnitude).tolist(),
            "units": quantity.units.format_babel() if quantity.units else None,
            "title": quantity.symbol.display_names[0]
        })

    doc["task_id"] = item["task_id"]
    doc["pretty_formula"] = item.get("pretty_formula")
    doc["deprecated"] = item.get("deprecated", False)

    if self.include_sandboxed:
        doc['sbxn'] = item.get("sbxn", [])

    return jsanitize(doc, strict=True)
class CorrelationBuilder(Builder):
    """
    A class to calculate the correlation between properties derived by or used in propnet
    using a suite of regression tools. Uses the Builder architecture for optional parallel
    processing of data.

    Note: serialization of builder does not work with custom correlation functions, although
    interactive use does support them.
    """
    # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?
    MP_QUERY_PROPS = ["piezo.eij_max", "elasticity.universal_anisotropy",
                      "diel.poly_electronic", "total_magnetization", "efermi",
                      "magnetism.total_magnetization_normalized_vol"]
    PROPNET_PROPS = [v.name for v in Registry("symbols").values()
                     if (v.category == 'property' and v.shape == 1)]

    def __init__(self, propnet_store, mp_store, correlation_store,
                 out_file=None, funcs='linlsq', props=None, **kwargs):
        """
        Constructor for the correlation builder.

        Args:
            propnet_store: (Mongolike Store) store instance pointing to propnet collection
                with read access
            mp_store: (Mongolike Store) store instance pointing to Materials Project collection
                with read access
            correlation_store: (Mongolike Store) store instance pointing to collection
                with write access
            out_file: (str) optional, filename to output data in JSON format (useful if using
                a MemoryStore for correlation_store)
            funcs: (str, function, list<str, function>) functions to use for correlation.
                A tuple is accepted in place of a list.
                Built-in functions can be specified by the following strings:

                linlsq (default): linear least-squares, reports R^2
                pearson: Pearson r-correlation, reports r
                spearman: Spearman rank correlation, reports r
                mic: maximal-information non-parametric exploration, reports
                    maximal information coefficient
                ransac: random sample consensus (RANSAC) regression, reports score
                theilsen: Theil-Sen regression, reports score
                all: runs all correlation functions above
            props: (str, list<str>) optional, names of the properties to correlate.
                Names found neither in PROPNET_PROPS nor among the MP property names
                are ignored. default: None, which selects every property in
                PROPNET_PROPS and MP_QUERY_PROPS.
            **kwargs: arguments to the Builder superclass

        Raises:
            ValueError: if an entry of funcs is neither a known function name nor a
                callable, or if no correlation function ends up selected
        """
        self.propnet_store = propnet_store
        self.mp_store = mp_store
        self.correlation_store = correlation_store
        self.out_file = out_file

        # Built-in correlation functions are discovered by their "_cfunc_" prefix
        self._correlation_funcs = {
            f.replace('_cfunc_', ''): getattr(self, f)
            for f in dir(self)
            if re.match(r'^_cfunc_.+$', f) and callable(getattr(self, f))
        }

        self._funcs = {}

        # A tuple used to be wrapped as a single (invalid) item; treat it like a list
        if not isinstance(funcs, (list, tuple)):
            funcs = [funcs]

        for f in funcs:
            if isinstance(f, str) and f == 'all':
                self._funcs.update(self._correlation_funcs)
            elif isinstance(f, str) and f in self._correlation_funcs:
                self._funcs[f] = self._correlation_funcs[f]
            elif callable(f):
                # Custom functions are keyed by their qualified module path
                name = f.__module__ + "." + f.__name__
                self._funcs[name] = f
            else:
                raise ValueError("Invalid correlation function: {}".format(f))

        if not self._funcs:
            raise ValueError("No valid correlation functions selected")

        # Maps the short MP property name (text after the dot, if exactly one dot)
        # to the full query path
        mp_prop_map = {(p.split(".")[1] if len(p.split(".")) == 2 else p): p
                       for p in self.MP_QUERY_PROPS}

        self._props = props

        if not props:
            self.mp_query_props = self.MP_QUERY_PROPS
            self.mp_props = list(mp_prop_map.keys())
            self.propnet_props = self.PROPNET_PROPS
        else:
            self.propnet_props = []
            self.mp_props = []
            self.mp_query_props = []
            if isinstance(props, str):
                props = [props]
            for p in props:
                if p in self.PROPNET_PROPS:
                    self.propnet_props.append(p)
                elif p in mp_prop_map:
                    self.mp_props.append(p)
                    self.mp_query_props.append(mp_prop_map[p])

        super(CorrelationBuilder, self).__init__(sources=[propnet_store, mp_store],
                                                 targets=[correlation_store],
                                                 **kwargs)

    def get_items(self):
        """
        Collects scalar data from propnet and MP databases, aggregates it by property, and creates
        a generator to iterate over all pairs of properties, including pairing of the same property
        with itself for sanity check, and correlation functions.

        Returns: (generator) a generator providing a dictionary with the data for correlation:
            {'x_data': (list<float>) data for independent property (x-axis),
             'x_name': (str) name of independent property,
             'y_data': (list<float>) data for dependent property (y-axis),
             'y_name': (str) name of dependent property,
             'func': (tuple<str, function>) name and function handle for correlation function
             }
        """
        # Per-material mapping of property name -> value
        data = defaultdict(dict)

        propnet_data = self.propnet_store.query(
            criteria={},
            properties=[p + '.mean' for p in self.propnet_props] +
                       [p + '.units' for p in self.propnet_props] +
                       [p + '.quantities' for p in self.propnet_props] +
                       ['task_id', 'inputs'])

        for material in propnet_data:
            mpid = material['task_id']

            # Input quantities are gathered first so they can be averaged together
            # with any derived quantities of the same symbol
            input_d = defaultdict(list)
            for q in material['inputs']:
                if q['symbol_type'] in self.propnet_props:
                    this_q = ureg.Quantity(q['value'], q['units'])
                    input_d[q['symbol_type']].append(this_q)

            for prop, values in material.items():
                if prop in self.propnet_props:
                    if prop in input_d:
                        for q in values['quantities']:
                            input_d[prop].append(
                                ureg.Quantity(q['value'], q['units']))
                    else:
                        # No inputs for this symbol: the stored mean is used directly
                        this_q = ureg.Quantity(values['mean'], values['units'])
                        input_d[prop] = [this_q]

            data[mpid].update({k: sum(v) / len(v) for k, v in input_d.items()})

        # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?
        mp_data = self.mp_store.query(
            criteria={},
            properties=self.mp_query_props + ['task_id'])

        for material in mp_data:
            mpid = material['task_id']
            for prop, value in material.items():
                if isinstance(value, dict):
                    # Nested fields are stored under their short (sub-field) name
                    for sub_prop, sub_value in value.items():
                        if prop + '.' + sub_prop in self.mp_query_props and \
                                sub_value is not None:
                            data[mpid][sub_prop] = sub_value
                elif prop in self.mp_query_props and value is not None:
                    data[mpid][prop] = value

        # product() produces all possible combinations of properties
        for prop_x, prop_y in product(self.propnet_props + self.mp_props, repeat=2):
            x = []
            y = []
            # Only materials having both properties contribute a data point
            for props_data in data.values():
                if prop_x in props_data and prop_y in props_data:
                    x.append(props_data[prop_x])
                    y.append(props_data[prop_y])

            # MP data does not have units listed in database, so will be floats. propnet
            # data may not have the same units as the MP data, so is stored as pint
            # quantities. Here, the quantities are coerced into the units of MP data
            # as stored in symbols and converted to floats.
            if x and any(isinstance(v, ureg.Quantity) for v in x):
                x_float = [xx.to(Registry("symbols")[prop_x].units).magnitude
                           if isinstance(xx, ureg.Quantity) else xx
                           for xx in x]
            else:
                x_float = x
            if y and any(isinstance(v, ureg.Quantity) for v in y):
                y_float = [yy.to(Registry("symbols")[prop_y].units).magnitude
                           if isinstance(yy, ureg.Quantity) else yy
                           for yy in y]
            else:
                y_float = y

            for name, func in self._funcs.items():
                data_dict = {'x_data': x_float,
                             'x_name': prop_x,
                             'y_data': y_float,
                             'y_name': prop_y,
                             'func': (name, func)}
                yield data_dict

    def process_item(self, item):
        """
        Run correlation calculation on a pair of properties using the specified function.

        Args:
            item: (dict) input provided by get_items() (see get_items() for structure)

        Returns: (tuple<str, str, float, str, int, int>) output of calculation with necessary
            information about calculation included. Format in tuple:
                independent property (x-axis) name,
                dependent property (y-axis) name,
                correlation value,
                correlation function name,
                number of data points used for correlation,
                length of shortest path between properties on propnet graph where x-axis property
                    is starting property and y-axis property is ending property.
                    Note: if no (forward) connection exists, the path length will be None. This does
                    not preclude y->x having a forward path.
        """
        prop_x, prop_y = item['x_name'], item['y_name']
        data_x, data_y = item['x_data'], item['y_data']
        func_name, func = item['func']
        n_points = len(data_x)

        g = Graph()
        try:
            path_length = g.get_degree_of_separation(prop_x, prop_y)
        except ValueError:
            path_length = None

        # With fewer than two points the correlation function is not called
        if n_points < 2:
            correlation = 0.0
        else:
            correlation = func(data_x, data_y)
        return prop_x, prop_y, correlation, func_name, n_points, path_length

    @staticmethod
    def _cfunc_mic(x, y):
        """
        Get maximal information coefficient for data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) maximal information coefficient
        """
        from minepy import MINE
        m = MINE()
        m.compute_score(x, y)
        return m.mic()

    @staticmethod
    def _cfunc_linlsq(x, y):
        """
        Get R^2 value for linear least-squares fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) R^2 value
        """
        from scipy import stats
        fit = stats.linregress(x, y)
        return fit.rvalue ** 2

    @staticmethod
    def _cfunc_pearson(x, y):
        """
        Get R value for Pearson fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Pearson R value
        """
        from scipy import stats
        fit = stats.pearsonr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_spearman(x, y):
        """
        Get R value for Spearman fit of a data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Spearman R value
        """
        from scipy import stats
        fit = stats.spearmanr(x, y)
        return fit[0]

    @staticmethod
    def _cfunc_ransac(x, y):
        """
        Get random sample consensus (RANSAC) regression score for data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) RANSAC score
        """
        from sklearn.linear_model import RANSACRegressor
        r = RANSACRegressor(random_state=21)
        # The regressor is fitted on a single-column 2D array
        x_coeff = np.array(x)[:, np.newaxis]
        r.fit(x_coeff, y)
        return r.score(x_coeff, y)

    @staticmethod
    def _cfunc_theilsen(x, y):
        """
        Get Theil-Sen regression score for data set.

        Args:
            x: (list<float>) independent property (x-axis)
            y: (list<float>) dependent property (y-axis)

        Returns: (float) Theil-Sen score
        """
        from sklearn.linear_model import TheilSenRegressor
        r = TheilSenRegressor(random_state=21)
        # The regressor is fitted on a single-column 2D array
        x_coeff = np.array(x)[:, np.newaxis]
        r.fit(x_coeff, y)
        return r.score(x_coeff, y)

    @staticmethod
    def _make_doc_id(prop_x, prop_y, func_name):
        """
        Build a reproducible integer key for one correlation document.

        The built-in hash() of strings is salted per interpreter process, so it
        cannot serve as a key that must match across separate runs. A digest of
        the identifying triple is used instead.

        Args:
            prop_x: (str) name of independent property (x-axis)
            prop_y: (str) name of dependent property (y-axis)
            func_name: (str) name of correlation function

        Returns: (int) signed 64-bit integer derived from the three names
        """
        import hashlib
        key = repr((prop_x, prop_y, func_name)).encode('utf-8')
        digest = hashlib.sha256(key).digest()
        return int.from_bytes(digest[:8], byteorder='big', signed=True)

    def update_targets(self, items):
        """
        Write correlation data to Mongo store.

        Args:
            items: (list<dict>) list of results output by process_item()
        """
        data = []
        for item in items:
            prop_x, prop_y, correlation, func_name, n_points, path_length = item
            data.append({'property_x': prop_x,
                         'property_y': prop_y,
                         'correlation': correlation,
                         'correlation_func': func_name,
                         'n_points': n_points,
                         'shortest_path_length': path_length,
                         # Stable across runs so a re-run updates the same
                         # document instead of adding another one
                         'id': self._make_doc_id(prop_x, prop_y, func_name)})
        self.correlation_store.update(data, key='id')

    def finalize(self, cursor=None):
        """
        Outputs correlation data to JSON file, if specified in instantiation, and runs
        clean-up function for Builder.

        Args:
            cursor: (Mongo Store cursor) optional, cursor to close if not automatically closed.
        """
        if self.out_file:
            try:
                self.write_correlation_data_file(self.out_file)
            except OSError:
                # Best effort: a failed file write must not prevent clean-up
                logger.warning("Cannot open file for writing! Skipping file writing.")

        super(CorrelationBuilder, self).finalize(cursor)

    def write_correlation_data_file(self, out_file):
        """
        Gets data dictionary containing correlation matrices and outputs to a file.

        Args:
            out_file: (str) file path and name for output to JSON file
        """
        matrix = self.get_correlation_matrices()
        with open(out_file, 'w') as f:
            json.dump(matrix, f)

    def get_correlation_matrices(self, func_name=None):
        """
        Builds document containing the correlation matrix with relevant data regarding
        correlation algorithm and properties of the data set.

        Args:
            func_name: (str, list<str>) optional, name(s) of the correlation functions
                to include in the document
                default: None, which is to include all that were run by this builder.

        Returns: (dict) document containing correlation data. Format:
            {'properties': (list<str>) names of properties calculated, sorted by name,
                in order of how they are indexed in the matrices
             'n_points': (list<list<int>>) list of lists (i.e. matrix) containing
                the number of data points evaluated during the fitting procedure
             'shortest_path_length': (list<list<int>>) matrix of path lengths stored
                with each x -> y pair (None where the stored value is None)
             'correlation': (dict<str: list<list<float>>>) dictionary of matrices containing
                correlation results, keyed by correlation function name
             }
        """
        prop_data = self.correlation_store.query(
            criteria={'property_x': {'$exists': True}},
            properties=['property_x'])

        # Sorted so the matrix layout is the same on every run; plain set
        # ordering of strings changes between interpreter processes
        props = sorted(set(item['property_x'] for item in prop_data))
        index_of = {name: idx for idx, name in enumerate(props)}
        n_props = len(props)

        out = {'properties': props,
               'n_points': None,
               'shortest_path_length': None,
               'correlation': {}}

        if not func_name:
            func_name = list(self._funcs.keys())

        if isinstance(func_name, str):
            func_name = [func_name]

        for f in func_name:
            data = self.correlation_store.query(criteria={'correlation_func': f})
            corr_matrix: list = np.zeros(shape=(n_props, n_props)).tolist()

            # The info matrices do not depend on the correlation function, so
            # they are created and filled only while still unset
            fill_info_matrices = False
            if not out['n_points'] and not out['shortest_path_length']:
                fill_info_matrices = True
                out['n_points'] = np.zeros(shape=(n_props, n_props)).tolist()
                out['shortest_path_length'] = np.zeros(shape=(n_props, n_props)).tolist()

            for d in data:
                ia, ib = index_of[d['property_x']], index_of[d['property_y']]
                corr_matrix[ia][ib] = d['correlation']
                if fill_info_matrices:
                    # Point counts are mirrored; path lengths are directional
                    out['n_points'][ia][ib] = d['n_points']
                    out['n_points'][ib][ia] = d['n_points']
                    out['shortest_path_length'][ia][ib] = d['shortest_path_length']

            out['correlation'][f] = corr_matrix

        return out

    def as_dict(self):
        """
        Returns the representation of the builder as a dictionary in JSON serializable format.
        Note: because functions are not JSON serializable, custom functions are omitted when
            serializing the object.

        Returns: (dict) representation of this builder as a JSON-serializable dictionary
        """
        d = super(CorrelationBuilder, self).as_dict()

        # Only built-in functions can be restored from their name
        serialized_funcs = []
        for name in d['funcs'].keys():
            if name in self._correlation_funcs:
                serialized_funcs.append(name)
            else:
                logger.warning("Cannot serialize custom function '{}'. Omitting.".format(name))

        if not serialized_funcs:
            logger.warning("No functions were able to be serialized from this builder.")

        d['funcs'] = serialized_funcs

        return d
def get_items(self):
    """
    Gather scalar data from the propnet and MP stores and yield one work item
    per (x property, y property, correlation function) combination. Every
    ordered pair of properties is produced, including a property paired with
    itself.

    Returns: (generator) a generator providing a dictionary with the data for correlation:
        {'x_data': (list<float>) data for independent property (x-axis),
         'x_name': (str) name of independent property,
         'y_data': (list<float>) data for dependent property (y-axis),
         'y_name': (str) name of dependent property,
         'func': (tuple<str, function>) name and function handle for correlation function
         }
    """

    def _as_floats(prop_name, values):
        # MP values are plain floats while propnet values are pint quantities.
        # When any quantity is present, quantities are converted to the units
        # stored on the symbol and reduced to their magnitude; other entries
        # pass through. Otherwise the list is handed back as is.
        if not (values and any(isinstance(v, ureg.Quantity) for v in values)):
            return values
        target_units = Registry("symbols")[prop_name].units
        return [
            v.to(target_units).magnitude if isinstance(v, ureg.Quantity) else v
            for v in values
        ]

    tracked = self.propnet_props

    # Field order: all means, then all units, then all quantity lists
    requested = []
    for suffix in ('.mean', '.units', '.quantities'):
        requested.extend(name + suffix for name in tracked)
    requested.extend(['task_id', 'inputs'])

    # material id -> {property name: value}
    by_material = defaultdict(dict)

    for doc in self.propnet_store.query(criteria={}, properties=requested):
        mpid = doc['task_id']
        collected = defaultdict(list)

        for stored in doc['inputs']:
            if stored['symbol_type'] in tracked:
                quantity = ureg.Quantity(stored['value'], stored['units'])
                collected[stored['symbol_type']].append(quantity)

        for field, payload in doc.items():
            if field not in tracked:
                continue
            if field in collected:
                # Inputs exist for this symbol: add every stored quantity so
                # they are averaged together below
                for stored in payload['quantities']:
                    collected[field].append(
                        ureg.Quantity(stored['value'], stored['units']))
            else:
                collected[field] = [
                    ureg.Quantity(payload['mean'], payload['units'])
                ]

        averages = {}
        for name, quantities in collected.items():
            averages[name] = sum(quantities) / len(quantities)
        by_material[mpid].update(averages)

    # TODO: Add these symbols to propnet so we don't have to bring them in explicitly?
    mp_docs = self.mp_store.query(criteria={},
                                  properties=self.mp_query_props + ['task_id'])
    for doc in mp_docs:
        mpid = doc['task_id']
        for field, value in doc.items():
            if not isinstance(value, dict):
                if field in self.mp_query_props and value is not None:
                    by_material[mpid][field] = value
                continue
            # Nested fields are recorded under the sub-field name
            for sub_field, sub_value in value.items():
                if field + '.' + sub_field in self.mp_query_props and \
                        sub_value is not None:
                    by_material[mpid][sub_field] = sub_value

    # product() produces all possible combinations of properties
    for prop_x, prop_y in product(self.propnet_props + self.mp_props, repeat=2):
        # Only materials that have both properties contribute a point
        pairs = [(record[prop_x], record[prop_y])
                 for record in by_material.values()
                 if prop_x in record and prop_y in record]
        x = [pair[0] for pair in pairs]
        y = [pair[1] for pair in pairs]

        x_float = _as_floats(prop_x, x)
        y_float = _as_floats(prop_y, y)

        for name, func in self._funcs.items():
            yield {
                'x_data': x_float,
                'x_name': prop_x,
                'y_data': y_float,
                'y_name': prop_y,
                'func': (name, func)
            }