def from_string(data):
    """Construct an MPFile from an ArchieML-formatted string.

    Parses `data` with archieml, normalizes root-level keys (nesting
    shared metadata under the `general` section), converts embedded CSV
    free-form arrays into table dicts, and CIF strings into pymatgen
    structure dicts.

    Args:
        data (str): ArchieML document text.

    Returns:
        MPFile: built via ``MPFile.from_dict`` from the processed dict.
    """
    # use archieml-python parse to import data
    rdct = RecursiveDict(archieml.loads(data))
    rdct.rec_update()
    # post-process internal representation of file contents
    # NOTE: snapshot the keys first — entries are popped and re-inserted
    # inside the loop, and mutating a dict while iterating its live view
    # raises RuntimeError on Python 3.
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                # dict views are not subscriptable in py3; take the first
                # key via an iterator instead of `rdct.keys()[0]`
                first_key = next(iter(rdct))
                rdct.insert_before(
                    first_key, (mp_level01_titles[0], RecursiveDict()))
            rdct.rec_update(
                nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            # root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            rdct.rec_update(nest_dict(value, keys))
            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in rdct[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    rdct[root_key].pop(table_name)
                    rdct[root_key].rec_update(
                        nest_dict(pd_obj.to_dict(), [k]))
                    rdct[root_key].insert_default_plot_options(pd_obj, k)
            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in rdct[root_key]:
                from pymatgen.io.cif import CifParser
                # snapshot names: the loop pops entries from the same dict
                for name in list(rdct[root_key][mp_level01_titles[3]].keys()):
                    cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    rdct[root_key][mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))
    return MPFile.from_dict(rdct)
def from_string(data):
    """Construct an MPFile from an ArchieML-formatted string.

    Parses `data` with archieml, normalizes root-level keys (nesting
    shared metadata under the leading `general` section), parses CSV
    free-form arrays into table dicts, optionally parses propnet
    quantities, and converts CIF strings into pymatgen structure dicts.

    Args:
        data (str): ArchieML document text.

    Returns:
        MPFile: built via ``MPFile.from_dict`` from the processed dict.
    """
    # use archieml-python parse to import data
    rdct = RecursiveDict(loads(data))
    rdct.rec_update()
    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct[mp_level01_titles[0]] = RecursiveDict()
                rdct.move_to_end(mp_level01_titles[0], last=False)

        # normalize identifier key (pop & insert)
        # using rec_update since we're looping over all entries
        # also: support data in bare tables (marked-up only by
        # root-level identifier) by nesting under 'data'
        value = rdct.pop(key)
        keys = [mp_level01_titles[0]] if is_general else []
        keys.append(root_key)
        if isinstance(value, list):
            keys.append("table")
        rdct.rec_update(nest_dict(value, keys))

        # reference to section to iterate or parse as CIF
        section = (
            rdct[mp_level01_titles[0]][root_key] if is_general else rdct[root_key]
        )

        # iterate to find CSV sections to parse
        # also parse propnet quantities
        if isinstance(section, dict):
            scope = []
            for k, v in section.iterate():
                level, key = k
                key = "".join([replacements.get(c, c) for c in key])
                level_reduction = bool(level < len(scope))
                if level_reduction:
                    del scope[level:]
                if v is None:
                    scope.append(key)
                # guard `v` for truthiness: an empty list would raise
                # IndexError on `v[0]`
                elif isinstance(v, list) and v and isinstance(v[0], dict):
                    table = ""
                    for row_dct in v:
                        table = "\n".join([table, row_dct["value"]])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, key)
                elif (
                    Quantity is not None
                    and isinstance(v, six.string_types)
                    and " " in v
                ):
                    quantity = Quantity.from_key_value(key, v)
                    # TODO quantity.symbol.name
                    d = nest_dict(quantity.as_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)

        # convert CIF strings into pymatgen structures
        if mp_level01_titles[3] in section:
            from pymatgen.io.cif import CifParser
            # snapshot names: the loop pops entries from the same dict,
            # which would raise RuntimeError on a live keys() view (py3)
            for name in list(section[mp_level01_titles[3]].keys()):
                cif = section[mp_level01_titles[3]].pop(name)
                parser = CifParser.from_string(cif)
                structure = parser.get_structures(primitive=False)[0]
                section[mp_level01_titles[3]].rec_update(
                    nest_dict(structure.as_dict(), [name])
                )
    return MPFile.from_dict(rdct)
def to_backgrid_dict(self):
    """Backgrid-conform dict from DataFrame.

    Builds ``{'columns': [...], 'rows': [...]}`` suitable for a Backgrid
    table: rows are record dicts (capped at ``nrows_max``); non-numeric
    columns are scanned cell-by-cell to detect mp-id/URL columns,
    composition strings, and "<value> <unit>" cells (the unit is folded
    into the column header when consistent). Columns named ``a##b`` are
    nested under ``a`` with label ``b``.

    Returns:
        dict: Backgrid-ready table description.
    """
    # shorten global import times by importing django here
    import numpy as np
    from mpcontribs.io.core.utils import get_composition_from_string
    from pandas import MultiIndex
    import pymatgen.util as pmg_util
    from pymatgen.core.composition import CompositionError

    table = dict()
    nrows_max = 260  # cap rows sent to the client
    nrows = self.shape[0]
    df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    if isinstance(df.index, MultiIndex):
        df.reset_index(inplace=True)

    table['columns'] = []
    table['rows'] = super(Table, df).to_dict(orient='records')

    for col_index, col in enumerate(list(df.columns)):
        cell_type = 'number'
        # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
        if not col.startswith('level_') and col not in numeric_columns:
            is_url_column, prev_unit, old_col = True, None, col
            for row_index in range(df.shape[0]):
                cell = str(df.iat[row_index, col_index])
                cell_split = cell.split(' ', 1)

                if not cell or len(cell_split) == 1:  # empty cell or no space
                    # column stays a URL column only while every non-empty
                    # cell so far looks like an mp-id
                    is_url_column = bool(
                        is_url_column and (not cell or mp_id_pattern.match(cell)))
                    if is_url_column:
                        if cell:
                            value = 'https://materialsproject.org/materials/{}'.format(
                                cell)
                            table['rows'][row_index][col] = value
                    elif cell:
                        try:
                            composition = get_composition_from_string(cell)
                            composition = pmg_util.string.unicodeify(composition)
                            table['rows'][row_index][col] = composition
                        except (CompositionError, ValueError, OverflowError):
                            try:
                                # https://stackoverflow.com/a/38020041
                                result = urlparse(cell)
                                if not all([
                                        result.scheme, result.netloc, result.path
                                ]):
                                    break
                                is_url_column = True
                            except Exception:
                                # narrowed from bare `except:` so that
                                # SystemExit/KeyboardInterrupt propagate
                                break
                else:
                    # TODO convert cell_split[0] to float?
                    value, unit = cell_split
                    is_url_column = False
                    try:
                        float(value)  # unit is only a unit if value is number
                    except ValueError:
                        continue
                    # move the cell to the unit-annotated column name
                    table['rows'][row_index].pop(old_col)
                    if prev_unit is None:
                        prev_unit = unit
                        col = '{} [{}]'.format(col, unit)
                    # keep full "value unit" text if units are inconsistent
                    table['rows'][row_index][col] = \
                        cell if prev_unit != unit else value

            cell_type = 'uri' if is_url_column else 'string'

        col_split = col.split('##')
        nesting = [col_split[0]] if len(col_split) > 1 else []
        table['columns'].append({
            'name': col, 'cell': cell_type, 'nesting': nesting, 'editable': 0
        })
        if len(col_split) > 1:
            table['columns'][-1].update({'label': '##'.join(col_split[1:])})
        if len(table['columns']) > 12:
            # hide overflow columns by default
            table['columns'][-1]['renderable'] = 0

    # group nested sub-columns by their parent key to pull a shared unit
    # up into the parent header
    header = RecursiveDict()
    for idx, col in enumerate(table['columns']):
        if 'label' in col:
            k, sk = col['name'].split('##')
            sk_split = sk.split()
            if len(sk_split) == 2:
                d = {'name': sk_split[0], 'unit': sk_split[1], 'idx': idx}
                if k not in header:
                    header[k] = [d]
                else:
                    header[k].append(d)
            elif k in header:
                header.pop(k)

    for k, skl in header.items():
        units = [sk['unit'] for sk in skl]
        if units.count(units[0]) == len(units):
            # all sub-columns share one unit -> move it into the nesting label
            for sk in skl:
                table['columns'][sk['idx']]['label'] = sk['name']
                table['columns'][sk['idx']]['nesting'][0] = '{} {}'.format(
                    k, sk['unit'])

    return table