def add_structure(self, source, name=None, identifier=None, fmt=None):
    """add a structure to the mpfile"""
    from pymatgen import Structure, MPRester
    if isinstance(source, Structure):
        structure = source
    elif isinstance(source, dict):
        structure = Structure.from_dict(source)
    elif os.path.exists(source):
        structure = Structure.from_file(source, sort=True)
    elif isinstance(source, six.string_types):
        if fmt is None:
            raise ValueError('Need fmt to get structure from string!')
        structure = Structure.from_str(source, fmt, sort=True)
    else:
        raise ValueError(source, 'not supported!')

    if name is not None:
        if not isinstance(name, six.string_types):
            raise ValueError('structure name needs to be a string')
        elif '.' in name:
            raise ValueError('structure name cannot contain dots (.)')

    mpr = MPRester()
    if not mpr.api_key:
        raise ValueError(
            'API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`.')
    matched_mpids = mpr.find_structure(structure)
    formula = get_composition_from_string(structure.composition.formula)
    if not matched_mpids:
        if identifier is None:
            identifier = formula
            print(
                'Structure not found in MP! Please submit via MPComplete to '
                'obtain mp-id or manually choose an anchor mp-id! Continuing '
                'with {} as identifier!'.format(identifier))
        else:
            print('Structure not found in MP! Forcing {} as identifier!'.format(identifier))
    elif identifier is None:
        identifier = matched_mpids[0]
        if len(matched_mpids) > 1:
            print('Multiple matching structures found in MP. Using', identifier)
    elif identifier not in matched_mpids:
        raise ValueError('Structure does not match {} but instead {}'.format(
            identifier, matched_mpids))

    idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
    sub_key = formula if name is None else name
    if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
        sub_key += '_{}'.format(idx)
    self.document.rec_update(nest_dict(
        structure.as_dict(), [identifier, mp_level01_titles[3], sub_key]))
    return identifier
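# Usage sketch for `add_structure` (illustrative, not from the source). `mpfile`
# is assumed to be an existing instance of the MPFile class defining the methods
# in this file; the CIF filename, structure name and mp-id below are placeholders,
# and a valid PMG_MAPI_KEY must be configured for pymatgen's MPRester.
identifier = mpfile.add_structure('Fe2O3.cif', name='hematite')
# or anchor to a manually chosen mp-id if the structure is not (yet) in MP:
identifier = mpfile.add_structure('Fe2O3.cif', identifier='mp-1234')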
def concat(self, mpfile):
    """concatenate single-section MPFile with this MPFile"""
    try:
        if len(mpfile.document) > 1:
            raise ValueError('concatenation only possible with single section files')
    except AttributeError:
        raise ValueError('Provide a MPFile to concatenate')
    mp_cat_id = mpfile.document.keys()[0]
    general_title = mp_level01_titles[0]
    if general_title in mpfile.document[mp_cat_id]:
        general_data = mpfile.document[mp_cat_id].pop(general_title)
        if general_title not in self.document:
            self.document.rec_update(nest_dict(general_data, [general_title]))
    self.document.rec_update(nest_dict(
        mpfile.document.pop(mp_cat_id),
        [self.get_unique_mp_cat_id(mp_cat_id)]
    ))
def add_data_table(self, identifier, dataframe, name):
    """add a datatable to the root-level section

    Args:
        identifier (str): MP category ID (`mp_cat_id`)
        dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
        name (str): table name, optional if only one table in section
    """
    # TODO: optional table name, required if multiple tables per root-level section
    self.document.rec_update(nest_dict(
        pandas_to_dict(dataframe), [identifier, name]))
def parse(self, file_string):
    """recursively parse sections according to number of separators"""
    # split into section title line (even) and section body (odd entries)
    sections = re.split(self.separator_regex(), file_string)
    if len(sections) > 1:
        # check for preceding bare section_body (without section title), and parse
        if sections[0] != '':
            self.parse(sections[0])
        sections = sections[1:]  # drop preceding bare section_body
        # https://docs.python.org/2/library/re.html#re.split
        for section_index, section_body in enumerate(sections[1::2]):
            clean_title = self.clean_title(sections[2*section_index])
            # uniquify level-0 titles if necessary
            if self.level == min_indent_level and clean_title in self.document:
                clean_title += '--%d' % self.level0_counter
                self.level0_counter += 1
            self.increase_level(clean_title)
            self.parse(section_body)
            self.reduce_level()
    else:
        # separator level not found, convert section body to pandas object
        section_title = self.section_titles[-1]
        is_data_section, pd_obj = self.read_csv(section_title, file_string)
        logging.info(pd_obj)  # TODO: include validation
        # use first csv table for default plot, first column as x-column
        if is_data_section and mp_level01_titles[2] not in \
                self.document[self.section_titles[0]]:
            self.document.rec_update(nest_dict(
                {'x': pd_obj.columns[0], 'table': section_title},
                [self.section_titles[0], mp_level01_titles[2], 'default']
            ))
        # add data section title to nest 'bare' data under data section
        # => artificially increase and decrease level (see below)
        is_bare_data = (is_data_section and self.is_bare_section(section_title))
        if is_bare_data:
            self.increase_level(mp_level01_titles[1])
        # update nested dict/document based on section level
        self.document.rec_update(nest_dict(
            self.to_dict(pd_obj), self.section_titles
        ))
        if is_bare_data:
            self.reduce_level()
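# Illustration (not from the source) of the even/odd indexing used in `parse`:
# `re.split` with a capturing group keeps the captured separator text, so the
# result alternates between captured titles and section bodies. The separator
# pattern below is a made-up stand-in for `self.separator_regex()`.
import re

text = "preamble\n=== alpha ===\nbody A\n=== beta ===\nbody B\n"
sections = re.split(r'===\s*(\w+)\s*===\n', text)
# sections == ['preamble\n', 'alpha', 'body A\n', 'beta', 'body B\n']
sections = sections[1:]  # drop the preceding bare body (parse recurses into it first)
for i, body in enumerate(sections[1::2]):
    title = sections[2 * i]
    print(title, '->', body.strip())
# alpha -> body A
# beta -> body B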
def insert_general_section(self, general_mpfile):
    """insert general section from `general_mpfile` into this MPFile"""
    if general_mpfile is None:
        return
    general_title = mp_level01_titles[0]
    general_data = general_mpfile.document[general_title]
    root_key = self.document.keys()[0]
    first_subkey = self.document[root_key].keys()[0]
    for key, value in general_data.items():
        if key in self.document[root_key]:
            self.document.rec_update(nest_dict(value, [root_key, key]))
        else:
            self.document[root_key].insert_before(first_subkey, (key, value))
def concat(self, mpfile, uniquify=True):
    """concatenate single-section MPFile with this MPFile"""
    try:
        if len(mpfile.document) > 1:
            raise ValueError('concatenation only possible with single section files')
    except AttributeError:
        raise ValueError('Provide a MPFile to concatenate')
    mp_cat_id = mpfile.document.keys()[0]
    general_title = mp_level01_titles[0]
    if general_title in mpfile.document[mp_cat_id]:
        general_data = mpfile.document[mp_cat_id].pop(general_title)
        if general_title not in self.document:
            self.document.rec_update(nest_dict(general_data, [general_title]))
    mp_cat_id_idx, mp_cat_id_uniq = 0, mp_cat_id
    if uniquify:
        while mp_cat_id_uniq in self.document.keys():
            mp_cat_id_uniq = mp_cat_id + '--{}'.format(mp_cat_id_idx)
            mp_cat_id_idx += 1
    self.document.rec_update(nest_dict(
        mpfile.document.pop(mp_cat_id), [mp_cat_id_uniq]))
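# Usage sketch for `concat` (illustrative, not from the source). Assumes two
# existing MPFile instances `mpfile` and `other`; `other` must contain exactly
# one root-level section, and `concat` pops that section out of `other.document`.
mpfile.concat(other)  # on identifier collision, a '--<n>' suffix is appended
# mpfile.concat(another, uniquify=False)  # alternative: keep the identifier as-is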
def add_data_table(self, identifier, dataframe, name, plot_options=None):
    """add a datatable to the root-level section

    Args:
        identifier (str): MP category ID (`mp_cat_id`)
        dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
        name (str): table name, optional if only one table in section
        plot_options (dict): options for the corresponding plotly graph
    """
    # TODO: optional table name, required if multiple tables per root-level section
    table_start = mp_level01_titles[1] + '_'
    if not name.startswith(table_start):
        name = table_start + name
    name = ''.join([replacements.get(c, c) for c in name])
    self.document.rec_update(nest_dict(
        Table(dataframe).to_dict(), [identifier, name]))
    self.document[identifier].insert_default_plot_options(
        dataframe, name, update_plot_options=plot_options)
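# Usage sketch for `add_data_table` (illustrative, not from the source).
# Assumes an existing MPFile instance `mpfile`; the identifier, column names
# and plot option keys below are made up.
import pandas as pd

df = pd.DataFrame({'T [K]': [100, 200, 300], 'resistivity': [1.2, 3.4, 5.6]})
mpfile.add_data_table(
    'mp-1234', df, name='transport',
    plot_options={'title': 'resistivity vs temperature'}
)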
def insert_general_section(self, general_mpfile):
    """insert general section from `general_mpfile` into this MPFile"""
    if general_mpfile is None:
        return
    general_title = mp_level01_titles[0]
    general_data = general_mpfile.document[general_title]
    root_key = self.document.keys()[0]
    # reverse-loop to keep the key order of general_mpfile
    for key, value in reversed(general_data.items()):
        if key in self.document[root_key]:
            self.document.rec_update(nest_dict(value, [root_key, key]))
        else:
            # key-value pairs before vs. after a `>>>..` row are order-sensitive
            # in the (legacy) custom format => ignoring this here would generate
            # the wrong MPFile in get_string; hence insert before the first
            # dict-valued subsection
            for k, v in self.document[root_key].iteritems():
                if isinstance(v, dict):
                    self.document[root_key].insert_before(k, (key, value))
                    break
def split(self):
    """split the MPFile into single-contribution MPFiles (generator)"""
    general_mpfile = self.pop_first_section() \
        if mp_level01_titles[0] in self.document.keys() else None
    if not self.document:
        raise ValueError('No contributions in MPFile! Either the file is'
                         ' empty or only contains shared (meta-)data not'
                         ' correlated to core identifier.')
    while True:
        try:
            mpfile_single = self.pop_first_section()
            mpid_orig = mpfile_single.ids[0]
            mpid = mpid_orig.split('--')[0]
            mpfile_single.document.rec_update(nest_dict(
                mpfile_single.document.pop(mpid_orig), [mpid]))
            if general_mpfile is not None:
                mpfile_single.insert_general_section(general_mpfile)
            yield mpfile_single
        except KeyError:
            break
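# Usage sketch for `split` (illustrative, not from the source): iterating over
# the generator yields one single-contribution MPFile per identifier, given an
# existing multi-contribution MPFile instance `mpfile` (assumed).
for single in mpfile.split():
    print(single.ids[0])  # the single remaining identifier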
def add_structure(self, source, name=None, identifier=None, fmt=None):
    """add a structure to the mpfile"""
    if isinstance(source, Structure):
        structure = source
    elif isinstance(source, dict):
        structure = Structure.from_dict(source)
    elif os.path.exists(source):
        structure = Structure.from_file(source)
    elif isinstance(source, six.string_types):
        if fmt is None:
            raise ValueError('Need fmt to get structure from string!')
        structure = Structure.from_str(source, fmt)
    else:
        raise ValueError(source, 'not supported!')

    mpr = MPRester()
    if not mpr.api_key:
        raise ValueError(
            'API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`.')
    matched_mpids = mpr.find_structure(structure)
    if not matched_mpids:
        raise ValueError(
            'Structure not found in MP. Please submit via MPComplete to obtain mp-id!')
    elif identifier is None:
        identifier = matched_mpids[0]
        if len(matched_mpids) > 1:
            print('Multiple matching structures found in MP. Using', identifier)
    elif identifier not in matched_mpids:
        raise ValueError('Structure does not match {} but instead {}'.format(
            identifier, matched_mpids))

    idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
    sub_key = 's{}'.format(idx) if name is None else name
    self.document.rec_update(nest_dict(
        structure.as_dict(), [identifier, mp_level01_titles[3], sub_key]))
def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
    """add nested data under `identifier` (defaults to the general section)"""
    if len(self.ids) >= self.max_contribs:
        raise StopIteration('Reached max. number of contributions in MPFile')
    self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))
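# Usage sketch for `add_hierarchical_data` (illustrative, not from the source).
# Assumes an existing MPFile instance `mpfile`; the nested dict and identifier
# are made up. Omitting `identifier` nests the data under the general
# (root-level) section, i.e. mp_level01_titles[0].
mpfile.add_hierarchical_data(
    {'composition': 'Fe2O3', 'process': {'method': 'sol-gel', 'T': '500 K'}},
    identifier='mp-1234'
)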
def add_hierarchical_data(self, identifier, dct):
    """add nested data under the given identifier"""
    self.document.rec_update(nest_dict(dct, [identifier]))