import os
from decimal import Decimal
from zipfile import ZipFile
# RecursiveDict, clean_value, nest_dict, read_csv, get_composition_from_string
# and get_concentration_functions are helpers provided by the surrounding package.


def run(mpfile, **kwargs):
    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path
    zip_file = ZipFile(zip_path, 'r')

    composition_table_dict = mpfile.document['_hdata']['composition_table']
    conc_funcs = get_concentration_functions(composition_table_dict)

    for info in zip_file.infolist():
        print(info.filename)
        d = RecursiveDict()

        # positions.x/y from filename: <scan-id>_<meas-element>_<X>_<Y>.csv
        scan_id, element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 3)
        d['position'] = RecursiveDict(
            (k, clean_value(v, 'mm')) for k, v in zip(['x', 'y'], [x, y])
        )

        # composition
        d['composition'] = RecursiveDict(
            (el, clean_value(f(x, y), convert_to_percent=True))
            for el, f in conc_funcs.items()
        )

        # identifier
        identifier = get_composition_from_string(''.join([
            '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
            for el, comp in d['composition'].items()
        ]))

        # load csv file; skip this entry if it cannot be read
        try:
            csv = zip_file.read(info.filename)
        except KeyError:
            print('ERROR: Did not find {} in zip file'.format(info.filename))
            continue

        # read csv to pandas DataFrame and add to MPFile
        df = read_csv(csv)
        df = df[['Energy', 'XAS', 'XMCD']]

        # min and max of each spectrum
        d.rec_update(RecursiveDict(
            (y, RecursiveDict([('min', df[y].min()), ('max', df[y].max())]))
            for y in ['XAS', 'XMCD']
        ))

        # add data to MPFile
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
        mpfile.add_data_table(identifier, df, name=element)
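# Hedged usage sketch for run(): the file names are placeholders, and `MPFile`
# is assumed to be the package's concrete MPFileCore subclass (see below). The
# zip is expected under $HOME/work per the path construction above.
#
#   mpfile = MPFile.from_file('mpfile_in.txt')
#   mpfile.document['_hdata']['input_file'] = 'scans.zip'  # hypothetical upload
#   run(mpfile)  # parses every <scan-id>_<element>_<X>_<Y>.csv in the zip
#   mpfile.write_file('mpfile_out.txt')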
import archieml
import six
# RecursiveDict, nest_dict, read_csv, normalize_root_level, mp_level01_titles
# and MPFile are provided by the surrounding package.


def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(archieml.loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct.insert_before(
                    list(rdct.keys())[0],
                    (mp_level01_titles[0], RecursiveDict())
                )
            rdct.rec_update(
                nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key])
            )
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            # root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            rdct.rec_update(nest_dict(value, keys))

            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in rdct[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    rdct[root_key].pop(table_name)
                    rdct[root_key].rec_update(nest_dict(pd_obj.to_dict(), [k]))
                    rdct[root_key].insert_default_plot_options(pd_obj, k)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in rdct[root_key]:
                from pymatgen.io.cif import CifParser
                for name in list(rdct[root_key][mp_level01_titles[3]].keys()):
                    cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    rdct[root_key][mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name])
                    )

    return MPFile.from_dict(rdct)
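# Minimal sketch of the nest_dict helper both parser versions lean on. Its
# semantics are inferred from the call sites here (an assumption, not code
# copied from the package): wrap a value under a chain of keys.
def _nest_dict_sketch(dct, keys):
    """Return `dct` nested under `keys`, e.g. (v, ['a', 'b']) -> {'a': {'b': v}}."""
    nested = dct
    for key in reversed(keys):
        nested = {key: nested}
    return nested

# _nest_dict_sketch({'min': 0.1, 'max': 0.9}, ['mp-1234', 'data'])
# -> {'mp-1234': {'data': {'min': 0.1, 'max': 0.9}}}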
import codecs
import os
from abc import ABCMeta

import six
# RecursiveDict, nest_dict, Table, replacements, mp_level01_titles,
# default_mpfile_path, HierarchicalData, TabularData, GraphicalData and
# StructuralData are imported from the surrounding package.


class MPFileCore(six.with_metaclass(ABCMeta, object)):
    """Abstract Base Class for representing a MP Contribution File"""

    def __init__(self, data=RecursiveDict()):
        if isinstance(data, dict):
            self.document = RecursiveDict(data)
        else:
            raise ValueError("Need dict (or inherited class) to init MPFile.")
        self.document.rec_update()  # convert (most) OrderedDict's to RecursiveDict's
        self.unique_mp_cat_ids = True
        self.max_contribs = 10

    def __getitem__(self, key):
        item = self.from_dict({key: self.document[key]})
        general = self.document.get(mp_level01_titles[0])
        if general:
            item.insert_general_section(
                self.from_dict({mp_level01_titles[0]: general}))
        return item

    @property
    def ids(self):
        return [
            k for k in self.document.keys()
            if k.lower() != mp_level01_titles[0]
        ]

    @property
    def hdata(self):
        return HierarchicalData(self.document)

    @property
    def tdata(self):
        return TabularData(self.document)

    @property
    def gdata(self):
        return GraphicalData(self.document)

    @property
    def sdata(self):
        return StructuralData(self.document)

    @classmethod
    def from_file(cls, filename_or_file=default_mpfile_path.replace(".txt", "_in.txt")):
        """Reads a MPFile from a file.

        Args:
            filename_or_file (str or file): name of file or file containing
                contribution data.

        Returns:
            MPFile object.
        """
        f = (open(filename_or_file)
             if isinstance(filename_or_file, six.string_types)
             else filename_or_file)
        return cls.from_string(f.read())

    @classmethod
    def from_dict(cls, data=RecursiveDict()):
        return cls(data=data)

    @classmethod
    def from_contribution(cls, contrib):
        """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
        if "identifier" not in contrib or "content" not in contrib:
            raise ValueError("Dict not in contribution-style format")
        recdict = RecursiveDict({contrib["identifier"]: contrib["content"]})
        return cls.from_dict(recdict)

    def write_file(self, filename=default_mpfile_path.replace(".txt", "_out.txt"), **kwargs):
        """Writes MPFile to a file. The supported kwargs are the same as those
        for the MPFile.get_string method and are passed through directly."""
        with codecs.open(filename, encoding="utf-8", mode="w") as f:
            file_str = self.get_string(**kwargs) + "\n"
            f.write(file_str)
        print("{} ({:.3f}MB) written".format(
            filename, os.path.getsize(filename) / 1024.0 / 1024.0))

    def get_number_of_lines(self, **kwargs):
        return len(self.get_string(**kwargs).split("\n"))

    def split(self):
        general_mpfile = (self.pop_first_section()
                          if mp_level01_titles[0] in self.document.keys()
                          else None)
        if not self.document:
            raise ValueError("No contributions in MPFile! Either the file is"
                             " empty or only contains shared (meta-)data not"
                             " correlated to core identifier.")
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if "--" in mpid_orig:
                    mpid = mpid_orig.split("--")[0]
                    mpfile_single.document.rec_update(
                        nest_dict(mpfile_single.document.pop(mpid_orig), [mpid]))
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                break

    def get_identifiers(self):
        """list of materials/composition identifiers as tuples w/ contribution IDs"""
        return [(k, self.document[k].get("cid", None))
                for k in self.document
                if k.lower() != mp_level01_titles[0]]

    def pop_first_section(self):
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))

    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        for key in reversed(list(general_data.keys())):
            self.document[root_key].move_to_end(key, last=False)

    def get_unique_mp_cat_id(self, mp_cat_id):
        if not self.unique_mp_cat_ids or mp_cat_id in mp_level01_titles:
            return mp_cat_id
        mp_cat_id_idx = len([i for i in self.ids if i.startswith(mp_cat_id)])
        if mp_cat_id_idx == 0:
            return mp_cat_id
        return "{}--{}".format(mp_cat_id, mp_cat_id_idx)

    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError(
                    "concatenation only possible with single section files")
        except AttributeError:
            raise ValueError("Provide a MPFile to concatenate")
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            if general_title not in self.document:
                self.document.rec_update(nest_dict(general_data, [general_title]))
        self.document.rec_update(
            nest_dict(mpfile.document.pop(mp_cat_id),
                      [self.get_unique_mp_cat_id(mp_cat_id)]))

    def insert_top(self, mp_cat_id, key, value):
        """insert value for `mp_cat_id` as `key: <value>` at top"""
        self.document[mp_cat_id][key] = str(value)
        self.document[mp_cat_id].move_to_end(key, last=False)

    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for according plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = "".join([replacements.get(c, c) for c in name])
        self.document.rec_update(
            nest_dict(Table(dataframe).to_dict(), [identifier, name]))
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options)

    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        if len(self.ids) >= self.max_contribs:
            raise StopIteration("Reached max. number of contributions in MPFile")
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))

    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile"""
        from pymatgen import Structure, MPRester
        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError("Need fmt to get structure from string!")
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, "not supported!")

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError("structure name needs to be a string")
            elif "." in name:
                raise ValueError("structure name cannot contain dots (.)")

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`.")
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print("Structure not found in MP! Please submit via MPComplete to "
                      "obtain mp-id or manually choose an anchor mp-id! Continuing "
                      "with {} as identifier!".format(identifier))
            else:
                print("Structure not found in MP! Forcing {} as identifier!".format(
                    identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print("Multiple matching structures found in MP. Using", identifier)
        elif identifier not in matched_mpids:
            msg = "Structure does not match {} but instead {}!".format(
                identifier, matched_mpids)
            raise ValueError(msg)

        idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
            sub_key += "_{}".format(idx)
        self.document.rec_update(
            nest_dict(structure.as_dict(),
                      [identifier, mp_level01_titles[3], sub_key]))
        return identifier

    def __repr__(self):
        return self.get_string(df_head_only=True)

    def __str__(self):
        return self.get_string(df_head_only=True)

    def _ipython_display_(self):
        from IPython.display import display_html
        display_html(self.hdata)
        display_html(self.tdata)
        display_html(self.gdata)
        display_html(self.sdata)

    # ----------------------------
    # Override these in subclasses
    # ----------------------------
    @staticmethod
    def from_string(data):
        """Reads a MPFile from a string containing contribution data."""
        return MPFileCore()

    def get_string(self, df_head_only=False):
        """Returns a string to be written as a file"""
        return repr(self.document)
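# Hedged usage sketch for a concrete MPFileCore subclass (the archieml-backed
# MPFile defined elsewhere in this package); the file names and mp-id below
# are placeholders, not real data.
#
#   mpfile = MPFile.from_file('mpfile_in.txt')
#   print(mpfile.ids)                        # root-level identifiers
#   mpfile.add_structure('POSCAR', name='struct1', identifier='mp-1234')
#   mpfile.write_file('mpfile_out.txt')
#   for single in mpfile.split():            # consumes one section at a time
#       print(single.get_identifiers())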
from archieml import loads
# RecursiveDict, nest_dict, read_csv, normalize_root_level, replacements,
# mp_level01_titles and MPFile come from the surrounding package; `Quantity`
# is propnet's Quantity when propnet is installed, else None.


def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct[mp_level01_titles[0]] = RecursiveDict()
                rdct.move_to_end(mp_level01_titles[0], last=False)

        # normalize identifier key (pop & insert)
        # using rec_update since we're looping over all entries
        # also: support data in bare tables (marked-up only by
        # root-level identifier) by nesting under 'data'
        value = rdct.pop(key)
        keys = [mp_level01_titles[0]] if is_general else []
        keys.append(root_key)
        if isinstance(value, list):
            keys.append("table")
        rdct.rec_update(nest_dict(value, keys))

        # reference to section to iterate or parse as CIF
        section = (rdct[mp_level01_titles[0]][root_key]
                   if is_general else rdct[root_key])

        # iterate to find CSV sections to parse
        # also parse propnet quantities
        if isinstance(section, dict):
            scope = []
            for k, v in section.iterate():
                level, key = k
                key = "".join([replacements.get(c, c) for c in key])
                level_reduction = bool(level < len(scope))
                if level_reduction:
                    del scope[level:]
                if v is None:
                    scope.append(key)
                elif isinstance(v, list) and isinstance(v[0], dict):
                    table = ""
                    for row_dct in v:
                        table = "\n".join([table, row_dct["value"]])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, key)
                elif (Quantity is not None
                      and isinstance(v, six.string_types) and " " in v):
                    quantity = Quantity.from_key_value(key, v)
                    # TODO quantity.symbol.name
                    d = nest_dict(quantity.as_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)

        # convert CIF strings into pymatgen structures
        if mp_level01_titles[3] in section:
            from pymatgen.io.cif import CifParser
            for name in list(section[mp_level01_titles[3]].keys()):
                cif = section[mp_level01_titles[3]].pop(name)
                parser = CifParser.from_string(cif)
                structure = parser.get_structures(primitive=False)[0]
                section[mp_level01_titles[3]].rec_update(
                    nest_dict(structure.as_dict(), [name]))

    return MPFile.from_dict(rdct)
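# Illustration (an assumption inferred from the loop above, not package code):
# archieml renders a free-form array as a list of {'type': ..., 'value': ...}
# row dicts, which the parser re-joins into a single CSV string for read_csv.
#
#   rows = [{'value': 'Energy,XAS,XMCD'}, {'value': '700.0,0.12,-0.01'}]
#   table = ''
#   for row_dct in rows:
#       table = '\n'.join([table, row_dct['value']])
#   # table == '\nEnergy,XAS,XMCD\n700.0,0.12,-0.01'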
import os
import re
from collections import Counter

from pandas import read_csv
# RecursiveDict and the `csv_database` path are provided by the surrounding
# test utilities.


class DataGenerator(object):
    """generate MP-like data from baseball database

    database: http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip
    """

    def __init__(self):
        try:
            from faker import Faker
            self.fake = Faker()
        except ImportError:
            self.fake = None
        self.master = os.path.join(csv_database, 'Master.csv')
        self.player = None
        self.player_id = None
        self.player_info = None
        self.player_data = None

    def set_player(self):
        """retrieve player from master file as pandas.Series"""
        df = read_csv(self.master, index_col=0)
        self.player_id = self.fake.random_element(elements=df.index)
        self.player = df.xs(self.player_id).dropna()

    def _split_string_at_caps(self, string):
        return re.split(r'([A-Z][a-z]*)', string)[:-1]

    def organize_player_info(self):
        """organize player info into nested dict"""
        splits = map(self._split_string_at_caps, self.player.index)
        counter = Counter([el[0] for el in splits if el])
        subsecs = [key for key, cnt in counter.items() if cnt > 1]
        self.player_info = RecursiveDict({})
        for k, v in self.player.items():
            keys = self._split_string_at_caps(k)
            nested = ({keys[0]: {keys[1]: v}}
                      if keys and keys[0] in subsecs
                      else {'other': {k: v}})
            self.player_info.rec_update(nested)

    def generate_dataset_for_player(self):
        """generate a dataset for a player"""
        for file_name in os.listdir(csv_database):
            if file_name == 'Master.csv':
                continue
            try:
                df = read_csv(os.path.join(csv_database, file_name))
            except Exception:
                continue
            if 'playerID' not in df.columns:
                continue
            dataset = df[df['playerID'] == self.player_id].dropna()
            if dataset.empty or dataset.shape[0] < 2:
                continue
            # drop all-zero columns along with the redundant playerID column
            cols = [col for col in dataset.columns if not dataset[col].sum()]
            self.player_data = dataset.drop(cols + ['playerID'], axis=1)
            if self.player_data is not None:
                break

    def init(self, keep_dataset=False):
        """call all setters for a player"""
        if not keep_dataset:
            self.set_player()
            self.organize_player_info()
        self.generate_dataset_for_player()
        if self.player_data is None:
            # try different player if no dataset found
            self.init(keep_dataset=keep_dataset)
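# Hedged usage sketch: drive the generator end-to-end. Assumes the Lahman
# CSVs have been unpacked into `csv_database` and that `faker` is installed.
#
#   gen = DataGenerator()
#   gen.init()
#   print(gen.player_id)    # random playerID drawn from Master.csv
#   print(gen.player_info)  # nested dict of biographic info
#   print(gen.player_data)  # pandas DataFrame mimicking a contribution table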