def get_contributions(self):
    projection = {'_id': 1, 'mp_cat_id': 1, 'content': 1}
    docs = self.query_contributions(projection=projection)
    if not docs:
        raise Exception('No contributions found for ALS Beamline Explorer!')

    data = []
    columns = ['formula', 'cid']
    keys = RecursiveDict([
        ('composition', ['Co', 'Cu', 'Ce']),
        #('position', ['x', 'y']),
        ('XAS', ['min', 'max']),
        ('XMCD', ['min', 'max'])
    ])
    columns += ['##'.join([k, sk]) for k, subkeys in keys.items() for sk in subkeys]

    for doc in docs:
        mpfile = MPFile.from_contribution(doc)
        identifier = mpfile.ids[0]
        contrib = mpfile.hdata[identifier]['data']
        cid_url = self.get_cid_url(doc)
        row = [identifier, cid_url]
        row += [contrib[k][sk] for k, subkeys in keys.items() for sk in subkeys]
        data.append((identifier, row))

    return Table.from_items(data, orient='index', columns=columns)
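# Usage sketch for get_contributions above. The rester class name is a stand-in
# (the method belongs to an ALS-Beamline-specific rester not shown here); the
# '##' separator in the column names encodes the key/subkey nesting.
#
# with AlsBeamlineRester(test_site=False) as mpr:  # hypothetical rester subclass
#     table = mpr.get_contributions()
#     print(table.columns)  # e.g. 'composition##Co', ..., 'XAS##min', 'XMCD##max'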
def run(mpfile, **kwargs):
    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path
    zip_file = ZipFile(zip_path, 'r')
    composition_table_dict = mpfile.document['_hdata']['composition_table']
    conc_funcs = get_concentration_functions(composition_table_dict)

    for info in zip_file.infolist():
        print info.filename
        d = RecursiveDict()

        # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
        element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 4)
        d['position'] = RecursiveDict(
            (k, clean_value(v, 'mm')) for k, v in zip(['x', 'y'], [x, y])
        )

        # composition
        d['composition'] = RecursiveDict(
            (el, clean_value(f(x, y), convert_to_percent=True))
            for el, f in conc_funcs.items()
        )

        # identifier
        identifier = get_composition_from_string(''.join([
            '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
            for el, comp in d['composition'].items()
        ]))

        # load csv file
        try:
            csv = zip_file.read(info.filename)
        except KeyError:
            print 'ERROR: Did not find %s in zip file' % info.filename
            continue  # skip this file; `csv` would be unbound below

        # read csv to pandas DataFrame and add to MPFile
        df = read_csv(csv)
        df = df[['Energy', 'XAS', 'XMCD']]

        # min and max
        d.rec_update(RecursiveDict(
            (y, RecursiveDict([('min', df[y].min()), ('max', df[y].max())]))
            for y in ['XAS', 'XMCD']
        ))

        # add data to MPFile
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
        mpfile.add_data_table(identifier, df, name=element)
def run(mpfile):
    google_sheet = "https://docs.google.com/spreadsheets/d/1Wep4LZjehrxu3Cl5KJFvAAhKhP92o4K5aC-kZYjGz2o/export?format=xlsx"
    contcars_filepath = "bulk_CONTCARs.tar.gz"
    contcars = tarfile.open(contcars_filepath)
    df = read_excel(google_sheet)
    keys = df.iloc[[0]].to_dict(orient="records")[0]
    abbreviations = RecursiveDict()

    count, skipped, update = 0, 0, 0
    for index, row in df[1:].iterrows():
        identifier = None
        data = RecursiveDict()
        for col, value in row.iteritems():
            if col == "level_0" or col == "index":
                continue
            key = keys[col]
            if isinstance(key, str):
                key = key.strip()
                if not key in abbreviations:
                    abbreviations[key] = col
            else:
                key = col.strip().lower()
            if key == "pmgmatchid":
                identifier = value.strip()
                if identifier == "None":
                    identifier = None
                name = "_".join(data["directory"].split("/")[1:])
                contcar_path = "bulk_CONTCARs/{}_CONTCAR".format(
                    data["directory"].replace("/", "_"))
                contcar = contcars.extractfile(contcar_path)
                try:
                    if identifier == "mp-34710":
                        identifier = "mp-24878"
                    identifier_match = mpfile.add_structure(
                        contcar.read().decode("utf8"),
                        fmt="poscar", name=name, identifier=identifier,
                    )
                except Exception as ex:
                    print(ex)
                    continue
                if not identifier:
                    identifier = identifier_match
            else:
                if isinstance(value, str):
                    val = value.strip()
                else:
                    unit = units.get(key, "")
                    val = clean_value(value, unit=unit)
                if val != "None":
                    data[key] = val

        mpfile.add_hierarchical_data({"data": data}, identifier=identifier)
        doc = {"identifier": identifier, "project": project, "content": {}}
        doc["content"]["data"] = mpfile.document[identifier]["data"]
        doc["collaborators"] = [{"name": "Patrick Huck", "email": "*****@*****.**"}]
        r = db.contributions.insert_one(doc)
        cid = r.inserted_id
        print("cid:", cid)

        sdct = mpfile.document[identifier]["structures"][name]
        sdct.pop("@module")
        sdct.pop("@class")
        if sdct["charge"] is None:
            sdct.pop("charge")
        sdct["identifier"] = identifier
        sdct["project"] = project
        sdct["name"] = name
        sdct["cid"] = cid
        r = db.structures.insert_one(sdct)
        print("sid:", r.inserted_id)

        r = db.contributions.update_one(
            {"_id": cid},
            {"$set": {"content.structures": [r.inserted_id]}})
        print(r.matched_count, r.modified_count)
def from_contribution(cls, contrib):
    """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
    if "identifier" not in contrib or "content" not in contrib:
        raise ValueError("Dict not in contribution-style format")
    recdict = RecursiveDict({contrib["identifier"]: contrib["content"]})
    return cls.from_dict(recdict)
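# Usage sketch for from_contribution (import path as used in build() below; the
# example payload is made up):
from mpcontribs.io.core.mpfile import MPFileCore

contrib = {"identifier": "mp-1234", "content": {"data": {"band_gap": "1.1 eV"}}}
mpfile = MPFileCore.from_contribution(contrib)
assert mpfile.ids == ["mp-1234"]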
def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
    if len(self.ids) >= self.max_contribs:
        raise StopIteration('Reached max. number of contributions in MPFile')
    self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))
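# Usage sketch for add_hierarchical_data: the dict is nested under the given
# identifier via nest_dict/rec_update, and StopIteration guards max_contribs
# (values are illustrative):
#
# mpfile.add_hierarchical_data({'data': {'ΔH': '-1.4 eV/mol'}}, identifier='mp-1234')
# mpfile.max_contribs = 1
# mpfile.add_hierarchical_data({'data': {}}, identifier='mp-5678')  # raises StopIteration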
class MPFileCore(six.with_metaclass(ABCMeta, object)):
    """Abstract Base Class for representing a MP Contribution File"""

    def __init__(self, data=RecursiveDict()):
        if isinstance(data, dict):
            self.document = RecursiveDict(data)
        else:
            raise ValueError('Need dict (or inherited class) to init MPFile.')
        self.document.rec_update()  # convert (most) OrderedDict's to RecursiveDict's
        self.unique_mp_cat_ids = True
        self.max_contribs = 10

    def __getitem__(self, key):
        item = self.from_dict({key: self.document[key]})
        general = self.document.get(mp_level01_titles[0])
        if general:
            item.insert_general_section(self.from_dict({mp_level01_titles[0]: general}))
        return item

    @property
    def ids(self):
        return [k for k in self.document.keys() if k.lower() != mp_level01_titles[0]]

    @property
    def hdata(self):
        return HierarchicalData(self.document)

    @property
    def tdata(self):
        return TabularData(self.document)

    @property
    def gdata(self):
        return GraphicalData(self.document)

    @property
    def sdata(self):
        return StructuralData(self.document)

    @classmethod
    def from_file(cls, filename_or_file=default_mpfile_path.replace('.txt', '_in.txt')):
        """Reads a MPFile from a file.

        Args:
            filename_or_file (str or file): name of file or file containing contribution data.

        Returns:
            MPFile object.
        """
        f = open(filename_or_file) \
            if isinstance(filename_or_file, six.string_types) \
            else filename_or_file
        return cls.from_string(f.read())

    @classmethod
    def from_dict(cls, data=RecursiveDict()):
        return cls(data=data)

    @classmethod
    def from_contribution(cls, contrib):
        """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
        if 'identifier' not in contrib or 'content' not in contrib:
            raise ValueError('Dict not in contribution-style format')
        recdict = RecursiveDict({contrib['identifier']: contrib['content']})
        return cls.from_dict(recdict)

    def write_file(self, filename=default_mpfile_path.replace('.txt', '_out.txt'), **kwargs):
        """Writes MPFile to a file. The supported kwargs are the same as those for
        the MPFile.get_string method and are passed through directly."""
        with codecs.open(filename, encoding='utf-8', mode='w') as f:
            file_str = self.get_string(**kwargs) + '\n'
            f.write(file_str)
        print('{} ({:.3f}MB) written'.format(
            filename, os.path.getsize(filename) / 1024. / 1024.
        ))

    def get_number_of_lines(self, **kwargs):
        return len(self.get_string(**kwargs).split('\n'))

    def split(self):
        general_mpfile = self.pop_first_section() \
            if mp_level01_titles[0] in self.document.keys() else None
        if not self.document:
            raise ValueError('No contributions in MPFile! Either the file is'
                             ' empty or only contains shared (meta-)data not'
                             ' correlated to core identifier.')
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if '--' in mpid_orig:
                    mpid = mpid_orig.split('--')[0]
                    mpfile_single.document.rec_update(nest_dict(
                        mpfile_single.document.pop(mpid_orig), [mpid]
                    ))
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                break

    def get_identifiers(self):
        """list of materials/composition identifiers as tuples w/ contribution IDs"""
        return [
            (k, self.document[k].get('cid', None))
            for k in self.document
            if k.lower() != mp_level01_titles[0]
        ]

    def pop_first_section(self):
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))

    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        for key in reversed(general_data.keys()):
            self.document[root_key].move_to_end(key, last=False)

    def get_unique_mp_cat_id(self, mp_cat_id):
        if not self.unique_mp_cat_ids or mp_cat_id in mp_level01_titles:
            return mp_cat_id
        mp_cat_id_idx = len([i for i in self.ids if i.startswith(mp_cat_id)])
        if mp_cat_id_idx == 0:
            return mp_cat_id
        return '{}--{}'.format(mp_cat_id, mp_cat_id_idx)

    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError('concatenation only possible with single section files')
        except AttributeError:
            raise ValueError('Provide a MPFile to concatenate')
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            if general_title not in self.document:
                self.document.rec_update(nest_dict(general_data, [general_title]))
        self.document.rec_update(nest_dict(
            mpfile.document.pop(mp_cat_id),
            [self.get_unique_mp_cat_id(mp_cat_id)]
        ))

    def insert_top(self, mp_cat_id, key, value):
        """insert value for `mp_cat_id` as `key: <value>` at top"""
        self.document[mp_cat_id][key] = str(value)
        self.document[mp_cat_id].move_to_end(key, last=False)

    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for according plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = ''.join([replacements.get(c, c) for c in name])
        self.document.rec_update(nest_dict(
            Table(dataframe).to_dict(), [identifier, name]
        ))
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options
        )

    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        if len(self.ids) >= self.max_contribs:
            raise StopIteration('Reached max. number of contributions in MPFile')
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))

    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile"""
        from pymatgen import Structure, MPRester
        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError('Need fmt to get structure from string!')
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, 'not supported!')

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError('structure name needs to be a string')
            elif '.' in name:
                raise ValueError('structure name cannot contain dots (.)')

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                'API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`.'
            )
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print(
                    'Structure not found in MP! Please submit via MPComplete to '
                    'obtain mp-id or manually choose an anchor mp-id! Continuing '
                    'with {} as identifier!'.format(identifier)
                )
            else:
                print('Structure not found in MP! Forcing {} as identifier!'.format(identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print('Multiple matching structures found in MP. Using', identifier)
        elif identifier not in matched_mpids:
            msg = 'Structure does not match {} but instead {}!'.format(identifier, matched_mpids)
            raise ValueError(msg)

        idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
            sub_key += '_{}'.format(idx)
        self.document.rec_update(nest_dict(
            structure.as_dict(), [identifier, mp_level01_titles[3], sub_key]
        ))
        return identifier

    def __repr__(self):
        return self.get_string(df_head_only=True)

    def __str__(self):
        return self.get_string(df_head_only=True)

    def _ipython_display_(self):
        from IPython.display import display_html
        display_html(self.hdata)
        display_html(self.tdata)
        display_html(self.gdata)
        display_html(self.sdata)

    # ----------------------------
    # Override these in subclasses
    # ----------------------------
    @staticmethod
    def from_string(data):
        """Reads a MPFile from a string containing contribution data."""
        return MPFileCore()

    def get_string(self, df_head_only=False):
        """Returns a string to be written as a file"""
        return repr(self.document)
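# Sketch of how a concrete subclass hooks into MPFileCore: only from_string and
# get_string need overriding; split/concat/hdata/tdata come from the base class.
# `parse_doc` and `serialize_doc` are hypothetical helpers, for illustration only.
#
# class MyMPFile(MPFileCore):
#     @staticmethod
#     def from_string(data):
#         return MyMPFile.from_dict(parse_doc(data))
#
#     def get_string(self, df_head_only=False):
#         return serialize_doc(self.document, df_head_only=df_head_only)
#
# for single in MyMPFile.from_file('contributions_in.txt').split():
#     single.write_file('{}_out.txt'.format(single.ids[0]))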
def run(mpfile, include_cifs=True, nmax=None, dup_check_test_site=True):
    data_input = mpfile.document[mp_level01_titles[0]].pop('input')
    phase_names = mpfile.hdata.general['info']['phase_names']
    dir_path = os.path.dirname(os.path.realpath(__file__))
    for k in data_input.keys():
        data_input[k] = os.path.join(dir_path, data_input[k])

    doi = mpfile.hdata.general['doi']
    existing_mpids = {}
    for b in [False, True]:
        with MnO2PhaseSelectionRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria={'content.doi': doi}):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    with open(data_input['formatted_entries'], "r") as fin:
        mp_contrib_phases = json.loads(fin.read())
    with open(data_input['hull_entries'], "r") as fin:
        hull_states = json.loads(fin.read())
    with open(data_input['mpid_existing'], 'r') as fin:
        mp_dup = json.loads(fin.read())
    with open(data_input['mpid_new'], 'r') as fin:
        mp_cmp = json.loads(fin.read())

    ############################################################################
    # add unique structures first (special cases)
    ############################################################################
    if include_cifs:
        for hstate in hull_states:
            if 'other' == hstate['phase']:
                c = Composition.from_dict(hstate['c'])
                s = Structure.from_dict(hstate['s'])
                for mpid in mpfile.ids:
                    formula = mpfile.hdata[mpid]['data']['Formula']
                    if c.almost_equals(Composition(formula)):
                        if nmax is not None and mpid in existing_mpids:
                            mpfile.document.pop(mpid)  # skip duplicates
                            break
                        try:
                            mpfile.add_structure(s, identifier=mpid)
                            print formula, 'added to', mpid
                        except Exception as ex:
                            print 'tried to add structure to', mpid, 'but', str(ex)
                        if mpid in existing_mpids:
                            cid = existing_mpids[mpid]
                            mpfile.insert_id(mpid, cid)
                            print cid, 'inserted to update', mpid
                        break

    # "phase": 'postspinel-NaMn2O4', "Formula": 'Na0.5MnO2',
    # "dHf (eV/mol)": -1.415, "dHh (eV/mol)": '--', "Ground state?": 'Y',

    ############################################################################
    # Get mp-ids for all entries based on matching the VASP directory path names.
    # Paths are different in the existing and new mp-id dictionary, so processing
    # has to be independent.
    ############################################################################
    print 'get all mp-ids based on VASP directory paths ...'
    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            c = Composition(phase[0])
            for hstate in hull_states:
                if phase_names[framework] == hstate['phase'] and \
                        c.almost_equals(Composition.from_dict(hstate['c'])) and \
                        len(mp_contrib_phases[framework][i]) < 6:
                    mp_contrib_phases[framework][i].append(hstate['path'])
                    mp_contrib_phases[framework][i].append(hstate['s'])

    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            match_path = phase[4].replace('all_states/', '')
            mp_ids = []
            for path, ids in mp_dup.items():
                mp_path = path.replace(
                    '/Users/patrick/Downloads/20160710_MPContrib_MnO2_DK/', ''
                ).replace('/3.double_relax/CONTCAR', '')
                if match_path == mp_path:
                    mp_ids.extend(ids)
            for path, id_dat in mp_cmp.items():
                mp_path = path.replace(
                    '20160725_MnO2_DK_Cifs/20160710_MPContrib_MnO2_DK-', ''
                ).replace('-3.double_relax-CONTCAR.cif', '').replace('-', '/')
                if match_path == mp_path:
                    if 'mp_id' in id_dat.keys():
                        mp_ids.append(id_dat['mp_id'])
            mp_contrib_phases[framework][i].append(mp_ids)

    ############################################################################
    # For structures that have mp-ids, add them to the contribution dictionary.
    # For those that don't, run a separate dictionary to keep track of them.
    ############################################################################
    print 'add structures with mp-ids to contribution ...'
    no_id_dict = {}
    errors_file = os.path.join(os.path.dirname(__file__), 'errors.json')
    with open(errors_file, 'r') as f:
        errors = json.load(f)

    for framework, fdat in mp_contrib_phases.items():
        for phase in fdat:
            d = RecursiveDict()
            d["Phase"] = framework
            d["Formula"] = phase[0]
            try:
                float(phase[1])
                d["dHf"] = '{} eV/mol'.format(phase[1])
            except:
                d["dHf"] = '--'
            try:
                float(phase[3])
                d["dHh"] = '{} eV/mol'.format(phase[3])
            except:
                d["dHh"] = '--'
            d["GS"] = phase[2]
            if len(phase[6]) == 0:
                no_id_dict[phase[4].replace('all_states/', '')] = d
            for mpid in phase[6]:
                if nmax is not None:
                    if len(mpfile.ids) >= nmax - 1:
                        break
                    elif mpid in existing_mpids:
                        continue  # skip duplicates
                mpfile.add_hierarchical_data(RecursiveDict({'data': d}),
                                             identifier=mpid)
                print 'added', mpid
                if mpid in existing_mpids:
                    cid = existing_mpids[mpid]
                    mpfile.insert_id(mpid, cid)
                    print cid, 'inserted to update', mpid
                if include_cifs:
                    try:
                        mpfile.add_structure(phase[5], identifier=mpid)
                        print framework, phase[0], 'added to', mpid
                    except ValueError as ex:
                        print 'tried to add structure to', mpid, 'but', str(ex)
                        errors[mpid] = str(ex)

    with open(errors_file, 'w') as f:
        json.dump(errors, f)

    print """
    DONE.
    {} structures to submit.
    {} structures do not have mp-ids.
    {} structures with mp-ids have errors.
    """.format(len(mpfile.ids), len(no_id_dict), len(errors))
def run(mpfile, hosts=None, download=False):
    mpr = MPRester()
    fpath = f"{project}.xlsx"

    if download or not os.path.exists(fpath):
        figshare_id = 1546772
        url = "https://api.figshare.com/v2/articles/{}".format(figshare_id)
        print("get figshare article {}".format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        print("version =", figshare["version"])  # TODO set manually in "other"?

        print("read excel from figshare into DataFrame")
        df_dct = None
        for d in figshare["files"]:
            if "xlsx" in d["name"]:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d["download_url"], sheet_name=None)
                break
        if df_dct is None:
            print("no excel sheet found on figshare")
            return

        print("save excel to disk")
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()
    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), "sheets loaded.")

    print("looping hosts ...")
    host_info = df_dct["Host Information"]
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and not host in hosts:
                continue

        print("get mp-id for {}".format(host))
        mpid = None
        for doc in mpr.query(criteria={"pretty_formula": host},
                             properties={"task_id": 1}):
            if "decomposes_to" not in doc["sbxd"][0]:
                mpid = doc["task_id"]
                break
        if mpid is None:
            print("mp-id for {} not found".format(host))
            continue

        print("add host info for {}".format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith("[") else ""
            subkey = "_".join(ks[1:-1] if unit else ks[1:]).split(",")[0]
            if subkey == "lattice_constant":
                unit = "Å"
            try:
                hdata[ks[0]][subkey] = clean_value(v, unit.replace("angstrom", "Å"))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata["formula"] = host

        df = df_dct["{}-X".format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how="all").dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace("following", cells.iloc[1])[:-1]
            hdata["note"] = note
            df.drop(rows, inplace=True)

        mpfile.add_hierarchical_data(nest_dict(hdata, ["data"]), identifier=mpid)

        print("add table for D₀/Q data for {}".format(mpid))
        df.set_index(df["Solute element number"], inplace=True)
        df.drop("Solute element number", axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = "index"
        df.drop("Solute element name", inplace=True)
        df = df.T.reset_index()

        if str(host) == "Fe":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0, paramagnetic [cm^2/s]",
                "Solute Q, paramagnetic [eV]",
            ]]
        elif hdata["Host"]["crystal_structure"] == "HCP":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0 basal [cm^2/s]",
                "Solute Q basal [eV]",
            ]]
        else:
            df_D0_Q = df[["Solute element name", "Solute D0 [cm^2/s]", "Solute Q [eV]"]]
        df_D0_Q.columns = ["Solute", "D₀ [cm²/s]", "Q [eV]"]
        anums = [z[el] for el in df_D0_Q["Solute"]]
        df_D0_Q.insert(0, "Z", Series(anums, index=df_D0_Q.index))
        df_D0_Q.sort_values("Z", inplace=True)
        df_D0_Q.reset_index(drop=True, inplace=True)
        mpfile.add_data_table(mpid, df_D0_Q, "D₀_Q")

        if hdata["Host"]["crystal_structure"] == "BCC":
            print("add table for hop activation barriers for {} (BCC)".format(mpid))
            columns_E = (
                ["Hop activation barrier, E_{} [eV]".format(i) for i in range(2, 5)]
                + ["Hop activation barrier, E'_{} [eV]".format(i) for i in range(3, 5)]
                + ["Hop activation barrier, E''_{} [eV]".format(i) for i in range(3, 5)]
                + ["Hop activation barrier, E_{} [eV]".format(i) for i in range(5, 7)]
            )
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = (
                ["Solute"]
                + ["E{} [eV]".format(i) for i in ["₂", "₃", "₄"]]
                + ["E`{} [eV]".format(i) for i in ["₃", "₄"]]
                + ["E``{} [eV]".format(i) for i in ["₃", "₄"]]
                + ["E{} [eV]".format(i) for i in ["₅", "₆"]]
            )
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (BCC)".format(mpid))
            columns_v = (
                ["Hop attempt frequency, v_{} [THz]".format(i) for i in range(2, 5)]
                + ["Hop attempt frequency, v'_{} [THz]".format(i) for i in range(3, 5)]
                + ["Hop attempt frequency, v''_{} [THz]".format(i) for i in range(3, 5)]
                + ["Hop attempt frequency, v_{} [THz]".format(i) for i in range(5, 7)]
            )
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = (
                ["Solute"]
                + ["v{} [THz]".format(i) for i in ["₂", "₃", "₄"]]
                + ["v`{} [THz]".format(i) for i in ["₃", "₄"]]
                + ["v``{} [THz]".format(i) for i in ["₃", "₄"]]
                + ["v{} [THz]".format(i) for i in ["₅", "₆"]]
            )
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "FCC":
            print("add table for hop activation barriers for {} (FCC)".format(mpid))
            columns_E = ["Hop activation barrier, E_{} [eV]".format(i) for i in range(5)]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "E{} [eV]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (FCC)".format(mpid))
            columns_v = ["Hop attempt frequency, v_{} [THz]".format(i) for i in range(5)]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + [
                "v{} [THz]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "HCP":
            print("add table for hop activation barriers for {} (HCP)".format(mpid))
            columns_E = [
                "Hop activation barrier, E_X [eV]",
                "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]",
                "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]",
                "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]",
                "Hop activation barrier, E'_c [eV]",
            ]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "Eₓ [eV]", "E`ₓ [eV]",
                "Eₐ [eV]", "E`ₐ [eV]",
                "E_b [eV]", "E`_b [eV]",
                "Eꪱ [eV]", "E`ꪱ [eV]",
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (HCP)".format(mpid))
            columns_v = ["Hop attempt frequency, v_a [THz]"] + \
                        ["Hop attempt frequency, v_X [THz]"]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + ["vₐ [THz]"] + ["vₓ [THz]"]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

    print("DONE")
def to_backgrid_dict(self):
    """Backgrid-conform dict from DataFrame"""
    # shorten global import times by importing django here
    import numpy as np
    from mpcontribs.io.core.utils import get_composition_from_string
    from pandas import MultiIndex
    import pymatgen.util as pmg_util
    from pymatgen.core.composition import CompositionError

    table = dict()
    nrows_max = 260
    nrows = self.shape[0]
    df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
    if isinstance(df.index, MultiIndex):
        df.reset_index(inplace=True)

    table['columns'] = []
    table['rows'] = super(Table, df).to_dict(orient='records')

    for col_index, col in enumerate(list(df.columns)):
        cell_type = 'number'
        # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
        if not col.startswith('level_') and col not in numeric_columns:
            is_url_column, prev_unit, old_col = True, None, col
            for row_index in range(df.shape[0]):
                cell = str(df.iat[row_index, col_index])
                cell_split = cell.split(' ', 1)
                if not cell or len(cell_split) == 1:  # empty cell or no space
                    is_url_column = bool(
                        is_url_column and (not cell or mp_id_pattern.match(cell)))
                    if is_url_column:
                        if cell:
                            value = 'https://materialsproject.org/materials/{}'.format(cell)
                            table['rows'][row_index][col] = value
                    elif cell:
                        try:
                            composition = get_composition_from_string(cell)
                            composition = pmg_util.string.unicodeify(composition)
                            table['rows'][row_index][col] = composition
                        except (CompositionError, ValueError, OverflowError):
                            try:
                                # https://stackoverflow.com/a/38020041
                                result = urlparse(cell)
                                if not all([result.scheme, result.netloc, result.path]):
                                    break
                                is_url_column = True
                            except:
                                break
                else:
                    value, unit = cell_split  # TODO convert cell_split[0] to float?
                    is_url_column = False
                    try:
                        float(value)  # unit is only a unit if value is number
                    except ValueError:
                        continue
                    table['rows'][row_index].pop(old_col)
                    if prev_unit is None:
                        prev_unit = unit
                        col = '{} [{}]'.format(col, unit)
                    table['rows'][row_index][col] = cell if prev_unit != unit else value
            cell_type = 'uri' if is_url_column else 'string'

        col_split = col.split('##')
        nesting = [col_split[0]] if len(col_split) > 1 else []
        table['columns'].append({
            'name': col, 'cell': cell_type, 'nesting': nesting, 'editable': 0
        })
        if len(col_split) > 1:
            table['columns'][-1].update({'label': '##'.join(col_split[1:])})
        if len(table['columns']) > 12:
            table['columns'][-1]['renderable'] = 0

    header = RecursiveDict()
    for idx, col in enumerate(table['columns']):
        if 'label' in col:
            k, sk = col['name'].split('##')
            sk_split = sk.split()
            if len(sk_split) == 2:
                d = {'name': sk_split[0], 'unit': sk_split[1], 'idx': idx}
                if k not in header:
                    header[k] = [d]
                else:
                    header[k].append(d)
            elif k in header:
                header.pop(k)

    for k, skl in header.items():
        units = [sk['unit'] for sk in skl]
        if units.count(units[0]) == len(units):
            for sk in skl:
                table['columns'][sk['idx']]['label'] = sk['name']
                table['columns'][sk['idx']]['nesting'][0] = '{} {}'.format(k, sk['unit'])

    return table
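# Usage sketch for to_backgrid_dict (illustrative; exact cell handling depends on
# mp_id_pattern and the composition parser): a cell like '2.2 eV' is split into
# value and unit, the unit is folded into the column header, and '##' in column
# names becomes backgrid nesting.
#
# t = Table(DataFrame({'data##band_gap': ['2.2 eV', '1.1 eV']}))
# grid = t.to_backgrid_dict()
# # grid['columns'][-1]['name']  -> 'data##band_gap [eV]'
# # grid['rows'][0]              -> {'data##band_gap [eV]': '2.2'}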
def __init__(self, content=RecursiveDict()):
    super(Tables, self).__init__(
        (key, value) for key, value in content.items()
        if isinstance(value, Table)
    )
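# Usage sketch for Tables: filter a document down to its Table-valued entries
# (illustrative values):
#
# content = RecursiveDict([('notes', 'free text'), ('xas', Table(df))])
# tables = Tables(content)  # keeps only the 'xas' entry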
def run(mpfile, **kwargs):
    # extract data from json files
    keys = ['pretty_formula', 'volume']
    input_dir = mpfile.hdata.general['input_dir']

    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            # TODO: extreme values for power factor, zT, effective mass
            # TODO: add a text for the description of each table
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = u'{:g} ų'.format(hdata['volume'])
            cond_eff_mass = u'mₑᶜᵒⁿᵈ'
            hdata[cond_eff_mass] = RecursiveDict()
            names = [u'e₁', u'e₂', u'e₃', u'<m>']
            if 'GGA' not in data:
                print('no GGA key for', mpid)
                continue
            for dt, d in data['GGA']['cond_eff_mass'].items():
                eff_mass = d['300']['1e+18']
                eff_mass.append(np.mean(eff_mass))
                hdata[cond_eff_mass][dt] = RecursiveDict(
                    (names[idx], u'{:.2f} mₑ'.format(x))
                    for idx, x in enumerate(eff_mass))

            seebeck_fix_dop_temp = "Seebeck"
            hdata[seebeck_fix_dop_temp] = RecursiveDict()
            cols = [u'e₁', u'e₂', u'e₃', 'temperature', 'doping']
            for doping_type in ['p', 'n']:
                sbk = [
                    float(i) for i in
                    data['GGA']['seebeck_doping'][doping_type]['300']['1e+18']['eigs']
                ]
                vals = [u'{:.2e} μV/K'.format(s) for s in sbk] + \
                       [u'{} K'.format('300'), u'{} cm⁻³'.format('1e+18')]
                hdata[seebeck_fix_dop_temp][doping_type] = RecursiveDict(
                    (k, v) for k, v in zip(cols, vals))

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            cols = ['value', 'temperature', 'doping']
            for prop_name in ['seebeck_doping', 'cond_doping', 'kappa_doping']:
                # TODO install Symbola font if you see squares here
                # (https://fonts2u.com/symbola.font) and select it as standard font
                # in your browser (leave other fonts as is, esp. fixed width)
                if prop_name[0] == 's':
                    lbl, unit = u"Sₘₐₓ", u"μV/K"
                elif prop_name[0] == 'c':
                    lbl, unit = u"σₘₐₓ", u"(Ωms)⁻¹"
                elif prop_name[0] == 'k':
                    lbl, unit = u"κₑ₋ₘᵢₙ", u"W/(mKs)"
                hdata[lbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T (K)']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float, prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append(doping_str + u' cm⁻³')
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append(row)

                    arr_prop_avg = np.array(prop_averages)[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]
                    vals = [
                        u'{:.2e} {}'.format(max_v, unit),
                        u'{:.2e} K'.format(temps[arg_max[0]]),
                        u'{:.2e} cm⁻³'.format(dopings[arg_max[1]])
                    ]
                    hdata[lbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile.add_hierarchical_data(nest_dict(hdata, ['data']),
                                         identifier=data['mp_id'])
        finally:
            input_file.close()
def build(self, contributor_email, cid, api_key=None, endpoint=None):
    """update materials/compositions collections with contributed data"""
    cid_short, cid_str = get_short_object_id(cid), str(cid)
    contrib = self.find_contribution(cid)
    if not contrib:
        raise Exception('Contribution {} not found!'.format(cid))
    if contributor_email not in contrib['collaborators']:
        raise ValueError(
            "Build stopped: building contribution {} not "
            "allowed due to insufficient permissions of {}! Ask "
            "someone of {} to make you a collaborator on {}.".format(
                cid_short, contributor_email, contrib['collaborators'], cid_short))

    from pymatgen.util.provenance import Author
    mpfile = MPFileCore.from_contribution(contrib)
    mp_cat_id = mpfile.ids[0]
    is_mp_id = mp_id_pattern.match(mp_cat_id)
    self.curr_coll = self.materials if is_mp_id else self.compositions
    author = Author.parse_author(contributor_email)
    project = str(author.name).translate(None, '.') \
        if 'project' not in contrib else contrib['project']

    nb = nbf.new_notebook()
    if isinstance(self.db, dict):
        contrib.pop('_id')
        if 'cid' in contrib['content']:
            contrib['content'].pop('cid')
        nb['cells'].append(nbf.new_code_cell(
            "from mpcontribs.io.core.mpfile import MPFileCore\n"
            "from mpcontribs.io.core.recdict import RecursiveDict\n"
            "mpfile = MPFileCore.from_contribution({})\n"
            "identifier = '{}'".format(contrib, mp_cat_id)))
    else:
        nb['cells'].append(nbf.new_code_cell(
            "from mpcontribs.rest.rester import MPContribsRester"))
        os.environ['PMG_MAPI_KEY'] = api_key
        os.environ['PMG_MAPI_ENDPOINT'] = endpoint
        nb['cells'].append(nbf.new_code_cell(
            "with MPContribsRester() as mpr:\n"
            "    mpfile = mpr.find_contribution('{}')\n"
            "    identifier = mpfile.ids[0]".format(cid)))

    nb['cells'].append(nbf.new_markdown_cell(
        "## Contribution #{} for {}".format(cid_short, mp_cat_id)))
    nb['cells'].append(nbf.new_markdown_cell("### Hierarchical Data"))
    nb['cells'].append(nbf.new_code_cell("mpfile.hdata[identifier]"))

    if mpfile.tdata[mp_cat_id]:
        nb['cells'].append(nbf.new_markdown_cell("### Tabular Data"))
        for table_name, table in mpfile.tdata[mp_cat_id].iteritems():
            nb['cells'].append(nbf.new_markdown_cell("#### {}".format(table_name)))
            nb['cells'].append(nbf.new_code_cell(
                "mpfile.tdata[identifier]['{}']".format(table_name)))

    if mpfile.gdata[mp_cat_id]:
        nb['cells'].append(nbf.new_markdown_cell("### Graphical Data"))
        for plot_name, plot in mpfile.gdata[mp_cat_id].iteritems():
            nb['cells'].append(nbf.new_markdown_cell("#### {}".format(plot_name)))
            nb['cells'].append(nbf.new_code_cell(
                "mpfile.gdata[identifier]['{}']".format(plot_name)))

    if mpfile.sdata[mp_cat_id]:
        nb['cells'].append(nbf.new_markdown_cell("### Structural Data"))
        for structure_name, structure in mpfile.sdata[mp_cat_id].iteritems():
            nb['cells'].append(nbf.new_markdown_cell("#### {}".format(structure_name)))
            nb['cells'].append(nbf.new_code_cell(
                "mpfile.sdata[identifier]['{}']".format(structure_name)))

    self.ep.preprocess(nb, {'metadata': {'path': self.nbdir}})

    if isinstance(self.db, dict):
        return [mp_cat_id, project, cid_short, export_notebook(nb, cid)]
    else:
        build_doc = RecursiveDict()
        build_doc['mp_cat_id'] = mp_cat_id
        build_doc['project'] = project
        build_doc['nb'] = nb
        self.curr_coll.update({'_id': cid}, {'$set': build_doc}, upsert=True)
        return '{}/{}'.format(  # return URL for contribution page
            ('materials' if is_mp_id else 'compositions'), cid_str)
def run(mpfile, hosts=None, download=False, **kwargs):
    #mpfile.unique_mp_cat_ids = False
    from pymatgen import MPRester
    mpr = MPRester()
    fpath = os.path.join(os.environ['HOME'], 'work', 'dilute_solute_diffusion.xlsx')

    if download or not os.path.exists(fpath):
        figshare_id = mpfile.hdata.general['info']['figshare_id']
        url = 'https://api.figshare.com/v2/articles/{}'.format(figshare_id)
        print 'get figshare article {}'.format(figshare_id)
        r = requests.get(url)
        figshare = json.loads(r.content)
        mpfile.document['_hdata']['version'] = figshare['version']

        print 'read excel from figshare into DataFrame'
        df_dct = None
        for d in figshare['files']:
            if 'xlsx' in d['name']:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d['download_url'], sheet_name=None)
                break
        if df_dct is None:
            print 'no excel sheet found on figshare'
            return

        print 'save excel to disk'
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()
    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print len(df_dct), 'sheets loaded.'

    print 'looping hosts ...'
    host_info = df_dct['Host Information']
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and not host in hosts:
                continue

        print 'get mp-id for {}'.format(host)
        mpid = None
        for doc in mpr.query(criteria={'pretty_formula': host},
                             properties={'task_id': 1}):
            if doc['sbxd'][0]['decomposes_to'] is None:
                mpid = doc['task_id']
                break
        if mpid is None:
            print 'mp-id for {} not found'.format(host)
            continue

        print 'add host info for {}'.format(mpid)
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in hdata.keys():
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith('[') else ''
            subkey = '_'.join(ks[1:-1] if unit else ks[1:]).split(',')[0]
            if subkey == "lattice_constant":
                unit = u'Å'
            try:
                hdata[ks[0]][subkey] = clean_value(v, unit.replace('angstrom', u'Å'))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata['formula'] = host

        df = df_dct['{}-X'.format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.ix[rows].dropna(how='all').dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace('following', cells.iloc[1])[:-1]
            hdata['note'] = note
            df.drop(rows, inplace=True)

        mpfile.add_hierarchical_data(nest_dict(hdata, ['data']), identifier=mpid)

        print 'add table for D₀/Q data for {}'.format(mpid)
        df.set_index(df['Solute element number'], inplace=True)
        df.drop('Solute element number', axis=1, inplace=True)
        df.columns = df.ix[0]
        df.index.name = 'index'
        df.drop('Solute element name', inplace=True)
        df = df.T.reset_index()

        if str(host) == 'Fe':
            df_D0_Q = df[[
                'Solute element name',
                'Solute D0, paramagnetic [cm^2/s]',
                'Solute Q, paramagnetic [eV]'
            ]]
        elif hdata['Host']['crystal_structure'] == 'HCP':
            df_D0_Q = df[[
                'Solute element name',
                'Solute D0 basal [cm^2/s]',
                'Solute Q basal [eV]'
            ]]
        else:
            df_D0_Q = df[['Solute element name', 'Solute D0 [cm^2/s]', 'Solute Q [eV]']]
        df_D0_Q.columns = ['El.', 'D₀ [cm²/s]', 'Q [eV]']
        mpfile.add_data_table(mpid, df_D0_Q, 'D₀_Q')

        if hdata['Host']['crystal_structure'] == 'BCC':
            print 'add table for hop activation barriers for {} (BCC)'.format(mpid)
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i) for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i) for i in range(3, 5)
            ] + [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(5, 7)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + \
                ['E{} [eV]'.format(i) for i in ['₂', '₃', '₄']] + \
                ['E`{} [eV]'.format(i) for i in ['₃', '₄']] + \
                ['E``{} [eV]'.format(i) for i in ['₃', '₄']] + \
                ['E{} [eV]'.format(i) for i in ['₅', '₆']]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (BCC)'.format(mpid)
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i) for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i) for i in range(3, 5)
            ] + [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5, 7)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + \
                ['v{} [THz]'.format(i) for i in ['₂', '₃', '₄']] + \
                ['v`{} [THz]'.format(i) for i in ['₃', '₄']] + \
                ['v``{} [THz]'.format(i) for i in ['₃', '₄']] + \
                ['v{} [THz]'.format(i) for i in ['₅', '₆']]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'FCC':
            print 'add table for hop activation barriers for {} (FCC)'.format(mpid)
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(5)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (FCC)'.format(mpid)
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'HCP':
            print 'add table for hop activation barriers for {} (HCP)'.format(mpid)
            columns_E = [
                "Hop activation barrier, E_X [eV]",
                "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]",
                "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]",
                "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]",
                "Hop activation barrier, E'_c [eV]"
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'Eₓ [eV]', 'E`ₓ [eV]', 'Eₐ [eV]', 'E`ₐ [eV]',
                'E_b [eV]', 'E`_b [eV]', 'Eꪱ [eV]', 'E`ꪱ [eV]'
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (HCP)'.format(mpid)
            columns_v = ['Hop attempt frequency, v_a [THz]'] + \
                        ['Hop attempt frequency, v_X [THz]']
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + ['vₐ [THz]'] + ['vₓ [THz]']
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

    print mpfile
    print 'DONE'
def run(mpfile, nmax=None, dup_check_test_site=True):
    existing_mpids = {}
    for b in [False, True]:
        with DibbsRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria=mpr.dibbs_query):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    general = mpfile.document[mp_level01_titles[0]]
    input_file = general.pop('input_file')
    df = read_excel(input_file)
    columns_map = RecursiveDict([
        (v, k) for k, v in general.pop('columns_map').items()
    ])
    columns = columns_map.keys()
    df = df[columns]
    df = df[notnull(df[columns[-1]]) & notnull(df[columns[1]])]

    mpfile.add_hierarchical_data({'title': 'DIBBS - 27Al NMR'})

    count, skipped, update = 0, 0, 0
    for idx, row in df.iterrows():
        url = row[columns[-1]]
        if not url.startswith('http'):
            continue

        # hierarchical data
        d = RecursiveDict()
        for col in columns[:4]:
            d[columns_map[col]] = unidecode(row[col]) \
                if isinstance(row[col], six.string_types) else row[col]
        if d['name'] in [
            'AlPO4 Tridymite (AlPO4-t)', 'Basic aluminum sulfate',
            'Cryolite', 'berlinite(AlPO4-Q)'
        ]:
            continue
        d['data'] = RecursiveDict()
        for col in columns[4:8]:
            if notnull(row[col]):
                value = unicode('{}'.format(row[col]), 'utf-8')
                if col == columns[4]:
                    value += ' ppm'
                elif col == columns[6]:
                    value += ' MHz'
                elif col == columns[7]:
                    value = ' '.join([value[:-1], value[-1]])
            else:
                value = u''
            d['data'][columns_map[col]] = value

        # structure
        if url.startswith('https://materialsproject.org'):
            mpid = url.split('/')[-2]
        else:
            #print 'retrieve cif and match to MP structure ...'
            d[columns_map[columns[-1]]] = url
            f = requests.get(url)
            try:
                mpid = mpfile.add_structure(f.text, name=d['name'], fmt='cif')
            except ValueError as ex:
                print d['name'], str(ex)
                continue
            if nmax is not None and mpid in existing_mpids:
                item = mpfile.document.popitem(last=True)
                print 'removed duplicate', mpid
        if nmax is not None and mpid in existing_mpids:
            print 'skipping', mpid
            skipped += 1
            continue  # skip duplicates

        mpfile.add_hierarchical_data(d, identifier=mpid)
        print 'added {} ({})'.format(d['name'], mpid)
        if mpid in existing_mpids:
            cid = existing_mpids[mpid]
            mpfile.insert_id(mpid, cid)
            update += 1
        if nmax is not None and count >= nmax - 1:
            break
        count += 1

    print len(mpfile.ids), 'mp-ids to submit.'
    if nmax is None and update > 0:
        print update, 'mp-ids to update.'
    if nmax is not None and skipped > 0:
        print skipped, 'duplicates to skip.'
class DataGenerator(object):
    """generate MP-like data from baseball database

    database: http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip
    """

    def __init__(self):
        try:
            from faker import Faker
            self.fake = Faker()
        except:
            self.fake = None
        self.master = os.path.join(csv_database, 'Master.csv')
        self.player = None
        self.player_id = None
        self.player_info = None
        self.player_data = None

    def set_player(self):
        """retrieve player from master file as pandas.Series"""
        df = read_csv(self.master, index_col=0)
        self.player_id = self.fake.random_element(elements=df.index)
        self.player = df.xs(self.player_id).dropna()

    def _split_string_at_caps(self, string):
        return re.split(r'([A-Z][a-z]*)', string)[:-1]

    def organize_player_info(self):
        """organize player info into nested dict"""
        splits = map(self._split_string_at_caps, self.player.index)
        counter = Counter([el[0] for el in splits if el])
        subsecs = [key for key, cnt in counter.iteritems() if cnt > 1]
        self.player_info = RecursiveDict({})
        for k, v in self.player.iteritems():
            keys = self._split_string_at_caps(k)
            nested = {keys[0]: {keys[1]: v}} if (
                keys and keys[0] in subsecs
            ) else {'other': {k: v}}
            self.player_info.rec_update(nested)

    def generate_dataset_for_player(self):
        """generate a dataset for a player"""
        for file_name in os.listdir(csv_database):
            if file_name == 'Master.csv':
                continue
            try:
                df = read_csv(os.path.join(csv_database, file_name))
            except:
                continue
            if 'playerID' not in df.columns:
                continue
            dataset = df[df['playerID'] == self.player_id].dropna()
            if dataset.empty or dataset.shape[0] < 2:
                continue
            cols = [col for col in dataset.columns if not dataset[col].sum()]
            self.player_data = dataset.drop(cols + ['playerID'], axis=1)
            if self.player_data is not None:
                break

    def init(self, keep_dataset=False):
        """call all setters for a player"""
        if not keep_dataset:
            self.set_player()
            self.organize_player_info()
        self.generate_dataset_for_player()
        if self.player_data is None:
            # try different player if no dataset found
            self.init(keep_dataset=keep_dataset)
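# Usage sketch for DataGenerator (assumes the Lahman CSVs are unpacked into
# `csv_database` and faker is installed):
#
# gen = DataGenerator()
# gen.init()                       # retries until a player with >= 2 rows is found
# print(gen.player_info['other'])  # ungrouped master-file fields
# print(gen.player_data.head())    # per-player stats with all-zero columns dropped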
def run(mpfile, **kwargs):
    indir = "/Users/patrick/Downloads/ThinFilmPV"
    summary_data = json.load(open(os.path.join(indir, "SUMMARY.json"), "r"))
    absorption_data = json.load(open(os.path.join(indir, "ABSORPTION-CLIPPED.json"), "r"))
    dos_data = json.load(open(os.path.join(indir, "DOS.json"), "r"))
    formulae_data = json.load(open(os.path.join(indir, "FORMATTED-FORMULAE.json"), "r"))

    config = RecursiveDict([
        ("SLME_500_nm", ["SLME|500nm", "%"]),
        ("SLME_1000_nm", ["SLME|1000nm", "%"]),
        ("E_g", ["ΔE.corrected", "eV"]),
        ("E_g_d", ["ΔE.direct", "eV"]),
        ("E_g_da", ["ΔE.dipole-allowed", "eV"]),
        ("m_e", ["mᵉ", "mₑ"]),
        ("m_h", ["mʰ", "mₑ"]),
    ])

    print(len(summary_data.keys()))
    for mp_id, d in summary_data.items():
        print(mp_id)
        formula = formulae_data[mp_id].replace("<sub>", "").replace("</sub>", "")
        query = {"identifier": mp_id, "project": "screening_inorganic_pv"}
        # r = db.contributions.update_one(query, {'$set': {'content.data.formula': formula}})
        # print(r.modified_count)
        # continue

        rd = RecursiveDict({"formula": formula})
        for k, v in config.items():
            value = clean_value(d[k], v[1], max_dgts=4)
            if not "." in v[0]:
                rd[v[0]] = value
            else:
                keys = v[0].split(".")
                if not keys[0] in rd:
                    rd[keys[0]] = RecursiveDict({keys[1]: value})
                else:
                    rd[keys[0]][keys[1]] = value

        mpfile.add_hierarchical_data({"data": rd}, identifier=mp_id)
        doc = query.copy()
        doc["content.data"] = mpfile.document[mp_id]["data"]
        doc["collaborators"] = [{"name": "Patrick Huck", "email": "*****@*****.**"}]
        # r = db.contributions.update_one(query, {'$set': doc}, upsert=True)
        # cid = r.upserted_id
        cid = db.contributions.find_one(query, {"_id": 1})["_id"]

        df = DataFrame(data=absorption_data[mp_id])
        df.columns = ["hν [eV]", "α [cm⁻¹]"]
        mpfile.add_data_table(mp_id, df, "absorption")
        table = mpfile.document[mp_id]["absorption"]
        table.pop("@module")
        table.pop("@class")
        table["identifier"] = mp_id
        table["project"] = "screening_inorganic_pv"
        table["name"] = "absorption"
        table["cid"] = cid
        # r = db.tables.insert_one(table)
        # tids = [r.inserted_id]
        r = db.tables.update_one(
            {
                "identifier": mp_id,
                "project": "screening_inorganic_pv",
                "name": "absorption",
                "cid": cid,
            },
            {"$set": table},
        )
        print(len(table["data"]), r.modified_count)
def from_items(cls, rdct, **kwargs):
    return super(Table, cls).from_dict(RecursiveDict(rdct), **kwargs)
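# Usage sketch for Table.from_items, mirroring its use in get_contributions at
# the top of this file (row values are made up):
#
# rows = [('mp-1', ['Fe2O3', '<cid-url>']), ('mp-2', ['MnO2', '<cid-url>'])]
# table = Table.from_items(rows, orient='index', columns=['formula', 'cid'])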
def run(mpfile, include_cifs=True, **kwargs):
    from pymatgen.core.composition import Composition
    from pymatgen.core.structure import Structure

    data_input = mpfile.document[mp_level01_titles[0]].pop('input')
    phase_names = mpfile.hdata.general['phase_names']
    dir_path = os.path.dirname(os.path.realpath(__file__))
    for k in data_input.keys():
        data_input[k] = os.path.join(dir_path, data_input[k])

    with open(data_input['formatted_entries'], "r") as fin:
        mp_contrib_phases = json.loads(fin.read())
    with open(data_input['hull_entries'], "r") as fin:
        hull_states = json.loads(fin.read())
    with open(data_input['mpid_existing'], 'r') as fin:
        mp_dup = json.loads(fin.read())
    with open(data_input['mpid_new'], 'r') as fin:
        mp_cmp = json.loads(fin.read())

    ############################################################################
    # add unique structures first (special cases)
    ############################################################################
    if include_cifs:
        for hstate in hull_states:
            if 'other' == hstate['phase']:
                c = Composition.from_dict(hstate['c'])
                s = Structure.from_dict(hstate['s'])
                for mpid in mpfile.ids:
                    formula = mpfile.hdata[mpid]['data']['Formula']
                    if c.almost_equals(Composition(formula)):
                        try:
                            mpfile.add_structure(s, identifier=mpid)
                            print formula, 'added to', mpid
                        except Exception as ex:
                            print 'tried to add structure to', mpid, 'but', str(ex)
                        break

    # "phase": 'postspinel-NaMn2O4', "Formula": 'Na0.5MnO2',
    # "ΔH (eV/mol)": -1.415, "ΔHₕ (eV/mol)": '', "Ground state?": 'Y',

    ############################################################################
    # Get mp-ids for all entries based on matching the VASP directory path names.
    # Paths are different in the existing and new mp-id dictionary, so processing
    # has to be independent.
    ############################################################################
    print 'get all mp-ids based on VASP directory paths ...'
    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            c = Composition(phase[0])
            for hstate in hull_states:
                if phase_names[framework] == hstate['phase'] and \
                        c.almost_equals(Composition.from_dict(hstate['c'])) and \
                        len(mp_contrib_phases[framework][i]) < 6:
                    mp_contrib_phases[framework][i].append(hstate['path'])
                    mp_contrib_phases[framework][i].append(hstate['s'])

    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            match_path = phase[4].replace('all_states/', '')
            mp_ids = []
            for path, ids in mp_dup.items():
                mp_path = path.replace(
                    '/Users/patrick/Downloads/20160710_MPContrib_MnO2_DK/', ''
                ).replace('/3.double_relax/CONTCAR', '')
                if match_path == mp_path:
                    mp_ids.extend(ids)
            for path, id_dat in mp_cmp.items():
                mp_path = path.replace(
                    '20160725_MnO2_DK_Cifs/20160710_MPContrib_MnO2_DK-', ''
                ).replace('-3.double_relax-CONTCAR.cif', '').replace('-', '/')
                if match_path == mp_path:
                    if 'mp_id' in id_dat.keys():
                        mp_ids.append(id_dat['mp_id'])
            mp_contrib_phases[framework][i].append(mp_ids)

    ############################################################################
    # For structures that have mp-ids, add them to the contribution dictionary.
    # For those that don't, run a separate dictionary to keep track of them.
    ############################################################################
    print 'add structures with mp-ids to contribution ...'
    no_id_dict = {}
    for framework, fdat in mp_contrib_phases.items():
        for phase in fdat:
            d = RecursiveDict()
            d["Phase"] = framework
            d["Formula"] = phase[0]
            try:
                float(phase[1])
                d["ΔH"] = clean_value(phase[1], 'eV/mol')
            except:
                d["ΔH"] = 'N/A eV/mol'
            try:
                float(phase[3])
                d["ΔHₕ"] = clean_value(phase[3], 'eV/mol')
            except:
                d["ΔHₕ"] = 'N/A eV/mol'
            d["GS"] = 'Yes' if phase[2] == 'Y' else 'No'
            if len(phase[6]) == 0:
                print 'no id for', d['Formula'], d['Phase']
                no_id_dict[phase[4].replace('all_states/', '')] = d
            for mpid in phase[6]:
                if include_cifs:
                    try:
                        mpfile.add_structure(phase[5], identifier=mpid)
                        print framework, phase[0], 'added to', mpid
                    except ValueError as ex:
                        print 'tried to add structure to', mpid, 'but', str(ex)
                mpfile.add_hierarchical_data(RecursiveDict({'data': d}),
                                             identifier=mpid)
                print 'added', mpid
def from_string(data):
    # use archieml-python parse to import data
    rdct = RecursiveDict(loads(data))
    rdct.rec_update()

    # post-process internal representation of file contents
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)

        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct[mp_level01_titles[0]] = RecursiveDict()
                rdct.move_to_end(mp_level01_titles[0], last=False)

        # normalize identifier key (pop & insert)
        # using rec_update since we're looping over all entries
        # also: support data in bare tables (marked-up only by
        # root-level identifier) by nesting under 'data'
        value = rdct.pop(key)
        keys = [mp_level01_titles[0]] if is_general else []
        keys.append(root_key)
        if isinstance(value, list):
            keys.append("table")
        rdct.rec_update(nest_dict(value, keys))

        # reference to section to iterate or parse as CIF
        section = (rdct[mp_level01_titles[0]][root_key]
                   if is_general else rdct[root_key])

        # iterate to find CSV sections to parse
        # also parse propnet quantities
        if isinstance(section, dict):
            scope = []
            for k, v in section.iterate():
                level, key = k
                key = "".join([replacements.get(c, c) for c in key])
                level_reduction = bool(level < len(scope))
                if level_reduction:
                    del scope[level:]
                if v is None:
                    scope.append(key)
                elif isinstance(v, list) and isinstance(v[0], dict):
                    table = ""
                    for row_dct in v:
                        table = "\n".join([table, row_dct["value"]])
                    pd_obj = read_csv(table)
                    d = nest_dict(pd_obj.to_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)
                    if not is_general and level == 0:
                        section.insert_default_plot_options(pd_obj, key)
                elif (Quantity is not None
                      and isinstance(v, six.string_types) and " " in v):
                    quantity = Quantity.from_key_value(key, v)
                    # TODO quantity.symbol.name
                    d = nest_dict(quantity.as_dict(), scope + [key])
                    section.rec_update(d, overwrite=True)

        # convert CIF strings into pymatgen structures
        if mp_level01_titles[3] in section:
            from pymatgen.io.cif import CifParser
            for name in section[mp_level01_titles[3]].keys():
                cif = section[mp_level01_titles[3]].pop(name)
                parser = CifParser.from_string(cif)
                structure = parser.get_structures(primitive=False)[0]
                section[mp_level01_titles[3]].rec_update(
                    nest_dict(structure.as_dict(), [name]))

    return MPFile.from_dict(rdct)
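# Usage sketch for from_string: a root-level identifier section in ArchieML
# markup (toy input; exact markup conventions follow archieml-python and
# normalize_root_level):
#
# mpfile = MPFile.from_string(u'''
# {mp-1234}
# formula: Fe2O3
# ''')
# assert 'mp-1234' in mpfile.ids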
def run(mpfile, nmax=None, dup_check_test_site=True): existing_mpids = {} for b in [False, True]: with PerovskitesDiffusionRester(test_site=b) as mpr: for doc in mpr.query_contributions(projection={ 'content.data.directory': 1, 'mp_cat_id': 1 }): key = '_'.join( [doc['mp_cat_id'], doc['content']['data']['directory']]) existing_mpids[key] = doc['_id'] if not dup_check_test_site: break general = mpfile.document[mp_level01_titles[0]] google_sheet = general.pop('google_sheet') + '/export?format=xlsx' contcars_filepath = general.pop('contcars_filepath') contcars = tarfile.open(contcars_filepath) df = read_excel(google_sheet) keys = df.iloc[[0]].to_dict(orient='records')[0] abbreviations = RecursiveDict() count, skipped, update = 0, 0, 0 for index, row in df[1:].iterrows(): mpid = None data = RecursiveDict() mpfile_single = MPFile() for col, value in row.iteritems(): if col == 'level_0' or col == 'index': continue key = keys[col] if isinstance(key, string_types): key = key.strip() if not key in abbreviations: abbreviations[key] = col else: key = col.strip().lower() if key == 'pmgmatchid': mpid = value.strip() if mpid == 'None': mpid = None name = '_'.join(data['directory'].split('/')[1:]) contcar_path = 'bulk_CONTCARs/{}_CONTCAR'.format( data['directory'].replace('/', '_')) contcar = contcars.extractfile(contcar_path) mpid_match = mpfile_single.add_structure(contcar.read(), fmt='poscar', name=name, identifier=mpid) if not mp_id_pattern.match(mpid_match): print 'skipping', name continue mpid = mpid_match else: data[key] = value if mpid is None: continue mpid_mod = '_'.join([mpid, data['directory']]) if nmax is not None and mpid_mod in existing_mpids: print 'skipping', mpid_mod skipped += 1 continue # skip duplicates mpfile_single.add_hierarchical_data({'data': data}, identifier=mpid) if mpid_mod in existing_mpids: cid = existing_mpids[mpid_mod] mpfile_single.insert_id(mpid, cid) update += 1 mpfile.concat(mpfile_single) if nmax is not None and count >= nmax - 1: break count += 1 mpfile.add_hierarchical_data({'abbreviations': abbreviations}) print len(mpfile.ids), 'mp-ids to submit.' if nmax is None and update > 0: print update, 'mp-ids to update.' if nmax is not None and skipped > 0: print skipped, 'duplicates to skip.'
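The two run() loaders above share the same duplicate-handling idiom; a condensed sketch, with placeholder names (rester_cls, identifier) standing in for the project-specific pieces:

# sketch of the dedup/update pattern, not a drop-in implementation
existing = {}  # identifier -> contribution ObjectId
for test_site in (False, True):
    with rester_cls(test_site=test_site) as mpr:
        for doc in mpr.query_contributions(projection={'mp_cat_id': 1}):
            existing[doc['mp_cat_id']] = doc['_id']
    if not dup_check_test_site:
        break

if nmax is not None and identifier in existing:
    pass  # trial run (nmax set): skip known contributions
elif identifier in existing:
    mpfile.insert_id(identifier, existing[identifier])  # resubmit -> in-place update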
def get_card(request, cid, db_type=None, mdb=None):
    """
    @api {post} /card/:cid?API_KEY=:api_key Contribution Card/Preview
    @apiVersion 0.2.0
    @apiName PostGetCard
    @apiGroup Contribution
    @apiDescription Either returns a string containing html for hierarchical
                    data, or if existent, a list of URLs for static versions
                    of embedded graphs.
    @apiParam {String} api_key User's unique API_KEY
    @apiParam {json} provenance_keys List of provenance keys
    @apiSuccess {String} created_at Response timestamp
    @apiSuccess {Bool} valid_response Response is valid
    @apiSuccess {String} response Response preview of h- or t-data/graphs ("card")
    @apiSuccessExample Success-Response:
        HTTP/1.1 200 OK
        {
            "created_at": "2017-08-09T19:59:59.936618",
            "valid_response": true,
            "response": ["<graph-url>"]
        }
    """
    from mpcontribs.io.core.components import Tree, Plots, render_plot
    from mpcontribs.io.core.utils import nested_dict_iter
    from mpcontribs.io.core.recdict import RecursiveDict, render_dict
    from django.template import Template, Context
    from django.core.urlresolvers import reverse
    from mpcontribs.config import mp_id_pattern

    prov_keys = loads(request.POST.get('provenance_keys', '["title"]'))
    contrib = mdb.contrib_ad.query_contributions(
        {'_id': ObjectId(cid)},
        projection={'_id': 0, 'mp_cat_id': 1, 'content': 1, 'collaborators': 1}
    )[0]
    mpid = contrib['mp_cat_id']
    hdata = Tree(contrib['content'])
    plots = Plots(contrib['content'])
    title = hdata.get('title', 'No title available.')
    descriptions = hdata.get(
        'description', 'No description available.').strip().split('.', 1)
    description = '{}.'.format(descriptions[0])
    if len(descriptions) > 1 and descriptions[1]:
        description += '''<a href="#" class="read_more">More »</a><span
        class="more_text" hidden>{}</span>'''.format(descriptions[1])

    authors = hdata.get('authors', 'No authors available.').split(',', 1)
    provenance = '<h5>{}'.format(authors[0])
    if len(authors) > 1:
        provenance += '''<button class="btn-sm btn-link" type=button
        data-toggle="tooltip" data-placement="bottom" data-container="body"
        title="{}" style="padding: 0px 0px 0px 3px;"
        >et al.</button>'''.format(authors[1].strip())
    provenance += '</h5>'

    # render DOIs (or bare URLs) as book-icon links, skipping empty entries
    dois = [x for x in hdata.get('dois', hdata.get('urls', '')).split() if x]
    doi_urls = [
        x if x.startswith('http') else 'https://doi.org/{}'.format(x)
        for x in dois
    ]
    provenance += ''.join(['''<a href={} class="btn btn-link" role=button
        style="padding: 0" target="_blank"><i class="fa fa-book
        fa-border fa-lg"></i></a>'''.format(x) for x in doi_urls])

    #if plots:
    #    card = []
    #    for name, plot in plots.items():
    #        filename = '{}_{}.png'.format(mpid, name)
    #        cwd = os.path.dirname(__file__)
    #        filepath = os.path.abspath(os.path.join(
    #            cwd, '..', '..', 'webtzite', 'static', 'img', filename
    #        ))
    #        if not os.path.exists(filepath):
    #            render_plot(plot, filename=filepath)
    #        index = request.build_absolute_uri(reverse('webtzite_index')[:-1])
    #        imgdir = '/'.join([index.rsplit('/', 1)[0], 'static', 'img'])
    #        fileurl = '/'.join([imgdir, filename])
    #        card.append(fileurl)
    #else:

    data = RecursiveDict()
    for idx, (k, v) in enumerate(hdata.get('data', {}).items()):
        data[k] = v
        if idx >= 6:
            break  # humans can grasp 7 items quickly
    data = render_dict(data, webapp=True)
    is_mp_id = mp_id_pattern.match(mpid)
    collection = 'materials' if is_mp_id else 'compositions'
    more = reverse('mpcontribs_explorer_contribution', args=[collection, cid])
    card = '''
    <div class="panel panel-default">
        <div class="panel-heading">
            <h4 class="panel-title">
                {}
                <a class="btn-sm btn-default pull-right" role="button"
                   style="margin-top:-6px;" href="{}" target="_blank">More Info</a>
            </h4>
        </div>
        <div class="panel-body" style="padding-left: 0px">
            <div class="col-md-8" style="padding-top: 0px">
                <blockquote class="blockquote" style="font-size: 13px;">{}</blockquote>
            </div>
            <div class="col-md-4 well" style="padding: 0px 0px 5px 5px;">{}</div>
            <div class="col-md-12" style="padding-right: 0px;">{}</div>
        </div>
    </div>
    <script>
    requirejs(['main'], function() {{
        require(['jquery'], function() {{
            $(function(){{
                $("a.read_more").click(function(event){{
                    event.preventDefault();
                    $(this).parents(".blockquote").find(".more_text").show();
                    $(this).parents(".blockquote").find(".read_more").hide();
                }});
            }});
        }});
    }});
    </script>
    '''.format(title, more, description, provenance, data)
    return {"valid_response": True, "response": card}
def run(mpfile, **kwargs):
    # extract data from json files
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data or 'GGA' not in data['gap'] \
                    or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            hdata = RecursiveDict()
            T, lvl, S2 = '300', '1e+18', None
            pf_key = 'S²σ'
            hdata['temperature'] = T + ' K'
            hdata['doping_level'] = lvl + ' cm⁻³'
            variables = [
                {'key': 'cond_eff_mass', 'name': 'mₑᶜᵒⁿᵈ', 'unit': 'mₑ'},
                {'key': 'seebeck_doping', 'name': 'S', 'unit': 'μV/K'},
                {'key': 'cond_doping', 'name': 'σ', 'unit': '(Ωms)⁻¹'},
            ]

            eigs_keys = ['ε₁', 'ε₂', 'ε₃', '<ε>']
            for v in variables:
                hdata[v['name']] = RecursiveDict()
                for doping_type in ['p', 'n']:
                    if doping_type in data['GGA'][v['key']]:
                        d = data['GGA'][v['key']][doping_type][T][lvl]
                        eigs = map(float, d if isinstance(d, list) else d['eigs'])
                        hdata[v['name']][doping_type] = RecursiveDict(
                            (eigs_keys[neig], clean_value(eig, v['unit']))
                            for neig, eig in enumerate(eigs)
                        )
                        hdata[v['name']][doping_type][eigs_keys[-1]] = \
                            clean_value(np.mean(eigs), v['unit'])
                        if v['key'] == 'seebeck_doping':
                            S2 = np.dot(d['tensor'], d['tensor'])
                        elif v['key'] == 'cond_doping':
                            pf = np.mean(
                                np.linalg.eigh(np.dot(S2, d['tensor']))[0]) * 1e-8
                            if pf_key not in hdata:
                                hdata[pf_key] = RecursiveDict()
                            hdata[pf_key][doping_type] = {
                                eigs_keys[-1]: clean_value(pf, 'μW/(cmK²s)')
                            }

            mpfile_data = nest_dict(hdata, ['data'])

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it
            # in pure python.
            keys = ['pretty_formula', 'volume']
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = clean_value(hdata['volume'], 'ų')
            hdata['bandgap'] = clean_value(data['gap']['GGA'], 'eV')
            cols = ['value', 'temperature', 'doping']
            tables = RecursiveDict()
            props = RecursiveDict()
            props['seebeck_doping'] = ['S', 'μV/K']
            props['cond_doping'] = ['σ', '(Ωms)⁻¹']
            props['kappa_doping'] = ['κₑ', 'W/(mKs)']

            for prop_name, (lbl, unit) in props.iteritems():
                # TODO install Symbola font if you see squares here
                # (https://fonts2u.com/symbola.font) and select it as standard
                # font in your browser (leave other fonts as is, esp. fixed width)
                tables[lbl] = RecursiveDict()
                hlbl = lbl + '₋' if len(lbl) > 1 else lbl
                hlbl += 'ₑₓₜᵣ'
                hdata[hlbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T [K]']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float, prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append('{} cm⁻³ [{}]'.format(doping_str, unit))
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append((temp, row))

                    tables[lbl][doping_type] = Table.from_items(
                        prop_averages, orient='index', columns=columns
                    )

                    arr_prop_avg = np.array(
                        [item[1] for item in prop_averages])[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]

                    vals = [
                        clean_value(max_v, unit),
                        clean_value(temps[arg_max[0]], 'K'),
                        clean_value(dopings[arg_max[1]], 'cm⁻³')
                    ]
                    hdata[hlbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals)
                    )

            mpfile_data.rec_update(nest_dict(hdata, ['extra_data']))
            mpfile.add_hierarchical_data(mpfile_data, identifier=data['mp_id'])
            for lbl, dct in tables.iteritems():
                for doping_type, table in dct.iteritems():
                    mpfile.add_data_table(
                        data['mp_id'], table,
                        name='{}({})'.format(lbl, doping_type)
                    )
        finally:
            input_file.close()
def run(mpfile, **kwargs):
    # TODO clone solar_perovskite if needed, abort if insufficient permissions
    try:
        import solar_perovskite
        from solar_perovskite.core import GetExpThermo
        from solar_perovskite.init.find_structures import FindStructures
        from solar_perovskite.init.import_data import Importdata
        from solar_perovskite.modelling.from_theo import EnthTheo
    except ImportError:
        print("could not import solar_perovskite, clone github repo")
        sys.exit(1)

    input_files = mpfile.hdata.general["input_files"]
    input_dir = os.path.dirname(solar_perovskite.__file__)
    input_file = os.path.join(input_dir, input_files["exp"])
    exp_table = read_csv(open(input_file, "r").read().replace(";", ","))
    print("exp data loaded.")
    with open(os.path.join(input_dir, input_files["theo"]), "r") as f:
        theo_data = json.loads(f.read()).pop("collection")
    print("theo data loaded.")
    with open(input_files["energy"], "r") as f:
        data = json.load(f).pop("collection")
    print("energy data loaded.")

    l = [
        dict(sdoc, parameters=doc["_id"])
        for doc in data
        for sdoc in doc["energy_analysis"]
    ]
    frame = pd.DataFrame(l)
    parameters = frame["parameters"]
    frame.drop(labels=["parameters"], axis=1, inplace=True)
    frame.insert(0, "parameters", parameters)
    print("energy dataframe:", frame.shape)

    mpfile_singles = [m for m in mpfile.split()]
    for mpfile_single in mpfile_singles:
        identifier = mpfile_single.ids[0]
        # if identifier in run.existing_identifiers:
        #     print('not updating', identifier)
        #     continue
        if identifier != "mp-1076585":
            continue
        hdata = mpfile_single.hdata[identifier]
        print(identifier)

        print("add hdata ...")
        d = RecursiveDict()
        d["data"] = RecursiveDict()
        compstr = hdata["pars"]["theo_compstr"]
        row = exp_table.loc[exp_table["theo_compstr"] == compstr]
        if not row.empty:
            sample_number = int(row.iloc[0]["sample_number"])
            d["pars"] = get_fit_pars(sample_number)
            d["data"]["availability"] = "Exp+Theo"
        else:
            d["pars"] = RecursiveDict()
            d["data"]["availability"] = "Theo"
        # print('dh_min, dh_max ...')
        # _, dh_min, dh_max, _ = redenth_act(compstr)
        # d['pars']['dh_min'] = clean_value(dh_min, max_dgts=4)
        # d['pars']['dh_max'] = clean_value(dh_max, max_dgts=4)
        # d['pars']['elastic'] = RecursiveDict()
        # print('debye temps ...')
        # d['pars']['elastic']['debye_temp'] = RecursiveDict()
        # try:
        #     t_d_perov = get_debye_temp(identifier)
        #     t_d_brownm = get_debye_temp(hdata['data']['reduced_phase']['closest-MP'])
        #     tensors_available = 'True'
        # except TypeError:
        #     t_d_perov = get_debye_temp("mp-510624")
        #     t_d_brownm = get_debye_temp("mp-561589")
        #     tensors_available = 'False'
        # d['pars']['elastic']['debye_temp']['perovskite'] = clean_value(t_d_perov, max_dgts=6)
        # d['pars']['elastic']['debye_temp']['brownmillerite'] = clean_value(t_d_brownm, max_dgts=6)
        # d['pars']['elastic']['tensors_available'] = tensors_available
        d["pars"]["last_updated"] = str(datetime.now())
        mpfile_single.add_hierarchical_data(d, identifier=identifier)

        # for process in processes:
        #     if process != "AS":
        #         t_ox_l = t_ox_ws_cs
        #         t_red_l = t_red_ws_cs
        #         p_ox_l = p_ox_ws_cs
        #         p_red_l = p_red_ws_cs
        #         data_source = ["Theo"]
        #     else:
        #         t_ox_l = t_ox_airsep
        #         t_red_l = t_red_airsep
        #         p_ox_l = p_ox_airsep
        #         p_red_l = p_red_airsep
        #         data_source = ["Theo", "Exp"]
        #     for red_temp in t_red_l:
        #         for ox_temp in t_ox_l:
        #             for ox_pr in p_ox_l:
        #                 for red_pr in p_red_l:
        #                     for data_sources in data_source:
        #                         db_id = process + "_" + str(float(ox_temp)) + "_" \
        #                             + str(float(red_temp)) + "_" + str(float(ox_pr)) \
        #                             + "_" + str(float(red_pr)) + "_" + data_sources + \
        #                             "_" + str(float(enth_steps))

        print("add energy analysis ...")
        group = frame.query('compstr.str.contains("{}")'.format(compstr[:-1]))
        group.drop(labels="compstr", axis=1, inplace=True)
        for prodstr, subgroup in group.groupby(["prodstr", "prodstr_alt"], sort=False):
            subgroup.drop(labels=["prodstr", "prodstr_alt"], axis=1, inplace=True)
            for unstable, subsubgroup in subgroup.groupby("unstable", sort=False):
                subsubgroup.drop(labels="unstable", axis=1, inplace=True)
                name = "energy-analysis_{}_{}".format(
                    "unstable" if unstable else "stable", "-".join(prodstr))
                print(name)
                mpfile_single.add_data_table(identifier, subsubgroup, name)

        print(mpfile_single)
        mpfile.concat(mpfile_single)
        break

        # NOTE the block below is unreachable while the debug `break` above
        # is in place
        if not row.empty:
            print("add ΔH ...")
            exp_thermo = GetExpThermo(sample_number, plotting=False)
            enthalpy = exp_thermo.exp_dh()
            table = get_table(enthalpy, "H")
            mpfile_single.add_data_table(identifier, table, name="enthalpy")

            print("add ΔS ...")
            entropy = exp_thermo.exp_ds()
            table = get_table(entropy, "S")
            mpfile_single.add_data_table(identifier, table, name="entropy")

            print("add raw data ...")
            tga_results = os.path.join(
                os.path.dirname(solar_perovskite.__file__), "tga_results")
            for path in glob(
                    os.path.join(
                        tga_results,
                        "ExpDat_JV_P_{}_*.csv".format(sample_number))):
                print(path.split("_{}_".format(sample_number))[-1].split(".")[0],
                      "...")
                body = open(path, "r").read()
                cols = ["Time [min]", "Temperature [C]", "dm [%]", "pO2"]
                table = read_csv(body, lineterminator=os.linesep,
                                 usecols=cols, skiprows=5)
                table = table[cols].iloc[::100, :]
                # scale/shift for better graphs
                T, dm, p = [pd.to_numeric(table[col]) for col in cols[1:]]
                T_min, T_max, dm_min, dm_max, p_max = (
                    T.min(), T.max(), dm.min(), dm.max(), p.max(),
                )
                rT, rdm = abs(T_max - T_min), abs(dm_max - dm_min)
                table[cols[2]] = (dm - dm_min) * rT / rdm
                table[cols[3]] = p * rT / p_max
                table.rename(
                    columns={
                        "dm [%]": "(dm [%] + {:.4g}) * {:.4g}".format(-dm_min, rT / rdm),
                        "pO2": "pO₂ * {:.4g}".format(rT / p_max),
                    },
                    inplace=True,
                )
                mpfile_single.add_data_table(identifier, table, name="raw")
def pop_first_section(self): item = self.document.popitem(last=False) return self.from_dict(RecursiveDict([item]))
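A hypothetical driver loop for pop_first_section, draining a multi-section MPFile one contribution at a time; process() is a placeholder, not part of the source:

# assumes `mpfile` holds several root-level sections
while mpfile.document:
    single = mpfile.pop_first_section()
    identifier = single.ids[0]
    process(single, identifier)  # placeholder for per-contribution handling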
def from_dict(cls, data=None):
    # avoid a shared mutable default argument (was: data=RecursiveDict())
    if data is None:
        data = RecursiveDict()
    return cls(data=data)
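The original signature defaulted to a shared RecursiveDict instance, the classic mutable-default pitfall; a standalone illustration with a plain dict standing in:

def counter(data={}):  # anti-pattern: the default object is created once
    data['x'] = data.get('x', 0) + 1
    return data

print(counter())  # {'x': 1}
print(counter())  # {'x': 2} -- the same dict is reused across calls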
def from_string(data): # use archieml-python parse to import data rdct = RecursiveDict(archieml.loads(data)) rdct.rec_update() # post-process internal representation of file contents for key in rdct.keys(): is_general, root_key = normalize_root_level(key) if is_general: # make part of shared (meta-)data, i.e. nest under `general` at # the beginning of the MPFile if mp_level01_titles[0] not in rdct: rdct.insert_before(rdct.keys()[0], (mp_level01_titles[0], RecursiveDict())) rdct.rec_update( nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key])) else: # normalize identifier key (pop & insert) # using rec_update since we're looping over all entries # also: support data in bare tables (marked-up only by # root-level identifier) by nesting under 'data' value = rdct.pop(key) keys = [root_key] if isinstance(value, list): keys.append('table') rdct.rec_update(nest_dict(value, keys)) # Note: CSV section is marked with 'data ' prefix during iterate() for k, v in rdct[root_key].iterate(): if isinstance(k, six.string_types) and \ k.startswith(mp_level01_titles[1]): # k = table name (incl. data prefix) # v = csv string from ArchieML free-form arrays table_name = k[len(mp_level01_titles[1] + '_'):] pd_obj = read_csv(v) rdct[root_key].pop(table_name) rdct[root_key].rec_update( nest_dict(pd_obj.to_dict(), [k])) rdct[root_key].insert_default_plot_options(pd_obj, k) # convert CIF strings into pymatgen structures if mp_level01_titles[3] in rdct[root_key]: from pymatgen.io.cif import CifParser for name in rdct[root_key][mp_level01_titles[3]].keys(): cif = rdct[root_key][mp_level01_titles[3]].pop(name) parser = CifParser.from_string(cif) structure = parser.get_structures(primitive=False)[0] rdct[root_key][mp_level01_titles[3]].rec_update( nest_dict(structure.as_dict(), [name])) return MPFile.from_dict(rdct)
def run(mpfile, **kwargs): from pymatgen import Structure reference_project = None input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict() input_urls = mpfile.document['_hdata'].pop('input_urls') for project in input_urls: input_url = input_urls[project]['file'] if '{}' in input_url: input_url = input_url.format('2d') # TODO 3d for Jarvis dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1]) if not os.path.exists(dbfile): print 'downloading', dbfile, '...' urllib.urlretrieve(input_url, dbfile) ext = os.path.splitext(dbfile)[1] is_nus = bool(ext == '.gz') id_key = 'parent_id' if is_nus else 'mpid' if not is_nus: with tarfile.open(dbfile, "r:gz") as tar: member = tar.getmembers()[0] raw_data = json.load(tar.extractfile(member), cls=MontyDecoder) else: reference_project = project raw_data = [] with gzip.open(dbfile, 'rb') as f: for line in f: raw_data.append(json.loads(line, cls=MontyDecoder)) input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data) input_keys[project] = [ 'material_id', 'exfoliation_energy_per_atom', 'structure' ] if is_nus else ['jid', 'exfoliation_en', 'final_str'] extra[project] = [ ('fin_en', ('E', 'eV')), ('op_gap', ('ΔE|optB88vdW', 'eV')), ('mbj_gap', ('ΔE|mbj', 'eV')), #('kv', ('Kᵥ', 'GPa')), #('gv', ('Gᵥ', 'GPa')) ] if not is_nus else [] print len(input_data[project]), 'materials loaded for', project projects = input_data.keys() identifiers = [] for d in input_data.values(): identifiers += list(d.keys()) for identifier in identifiers: data, structures = RecursiveDict(), RecursiveDict() for project in projects: if project not in data: data[project] = RecursiveDict() if identifier in input_data[project]: d = input_data[project][identifier] structures[project] = d[input_keys[project][-1]] if data.get('formula') is None: data['formula'] = get_composition_from_string( structures[project].composition.reduced_formula ) data[project]['id'] = input_urls[project]['detail'].format(d[input_keys[project][0]]) Ex = d[input_keys[project][1]] if project == reference_project: Ex *= 1000. data[project]['Eₓ'] = clean_value(Ex, 'eV') for k, (sym, unit) in extra[project]: if d[k] != 'na': data[project][sym] = clean_value(d[k], unit) mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier) for project, structure in structures.items(): name = '{}_{}'.format(data['formula'], project) try: mpfile.add_structure(structure, name=name, identifier=identifier) except Exception as ex: print str(ex)