def get_contributions(self):
    """Assemble the overview table for the ALS Beamline Explorer from
    contributions projected onto `_id`, `mp_cat_id`, and `content`."""
    projection = {'_id': 1, 'mp_cat_id': 1, 'content': 1}
    docs = self.query_contributions(projection=projection)
    if not docs:
        raise Exception('No contributions found for ALS Beamline Explorer!')

    data = []
    columns = ['formula', 'cid']
    keys = RecursiveDict([
        ('composition', ['Co', 'Cu', 'Ce']),
        #('position', ['x', 'y']),
        ('XAS', ['min', 'max']),
        ('XMCD', ['min', 'max'])
    ])
    # flat column names encode the key/subkey nesting via '##'
    columns += ['##'.join([k, sk]) for k, subkeys in keys.items() for sk in subkeys]

    for doc in docs:
        mpfile = MPFile.from_contribution(doc)
        identifier = mpfile.ids[0]
        contrib = mpfile.hdata[identifier]['data']
        cid_url = self.get_cid_url(doc)
        row = [identifier, cid_url]
        row += [contrib[k][sk] for k, subkeys in keys.items() for sk in subkeys]
        data.append((identifier, row))

    return Table.from_items(data, orient='index', columns=columns)
def get_contributions(self):
    """Same as the variant above, but contributions are projected onto the
    `identifier` field instead of `mp_cat_id`."""
    projection = {'_id': 1, 'identifier': 1, 'content': 1}
    docs = self.query_contributions(projection=projection)
    if not docs:
        raise Exception('No contributions found for ALS Beamline Explorer!')

    data = []
    columns = ['formula', 'cid']
    keys = RecursiveDict([
        ('composition', ['Co', 'Cu', 'Ce']),
        #('position', ['x', 'y']),
        ('XAS', ['min', 'max']),
        ('XMCD', ['min', 'max'])
    ])
    columns += ['##'.join([k, sk]) for k, subkeys in keys.items() for sk in subkeys]

    for doc in docs:
        mpfile = MPFile.from_contribution(doc)
        identifier = mpfile.ids[0]
        contrib = mpfile.hdata[identifier]['data']
        cid_url = self.get_cid_url(doc)
        row = [identifier, cid_url]
        row += [contrib[k][sk] for k, subkeys in keys.items() for sk in subkeys]
        data.append((identifier, row))

    return Table.from_items(data, orient='index', columns=columns)
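# Illustrative sketch (not part of the source): how the '##'-joined column
# names in `get_contributions` expand for the key tree above. An OrderedDict
# stands in for RecursiveDict, which is assumed to preserve insertion order.
from collections import OrderedDict

keys = OrderedDict([
    ('composition', ['Co', 'Cu', 'Ce']),
    ('XAS', ['min', 'max']),
    ('XMCD', ['min', 'max']),
])
columns = ['formula', 'cid']
columns += ['##'.join([k, sk]) for k, subkeys in keys.items() for sk in subkeys]
print(columns)
# ['formula', 'cid', 'composition##Co', 'composition##Cu', 'composition##Ce',
#  'XAS##min', 'XAS##max', 'XMCD##min', 'XMCD##max']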
def to_backgrid_dict(self):
    """Backgrid-conform dict from DataFrame"""
    # defer heavy imports to this scope to shorten global import times
    import numpy as np
    from urllib.parse import urlparse
    from mpcontribs.io.core.utils import get_composition_from_string
    from pandas import MultiIndex
    import pymatgen.util as pmg_util
    from pymatgen.core.composition import CompositionError

    table = dict()
    nrows_max = 260
    nrows = self.shape[0]
    df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self
    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

    if isinstance(df.index, MultiIndex):
        df.reset_index(inplace=True)

    table['columns'] = []
    table['rows'] = super(Table, df).to_dict(orient='records')

    for col_index, col in enumerate(list(df.columns)):
        cell_type = 'number'
        # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
        if not col.startswith('level_') and col not in numeric_columns:
            is_url_column, prev_unit, old_col = True, None, col
            for row_index in range(df.shape[0]):
                cell = str(df.iat[row_index, col_index])
                cell_split = cell.split(' ', 1)

                if not cell or len(cell_split) == 1:  # empty cell or no space
                    # `mp_id_pattern` is expected at module scope (compiled
                    # regex matching Materials Project identifiers)
                    is_url_column = bool(
                        is_url_column and (not cell or mp_id_pattern.match(cell)))
                    if is_url_column:
                        if cell:
                            value = 'https://materialsproject.org/materials/{}'.format(cell)
                            table['rows'][row_index][col] = value
                    elif cell:
                        try:
                            composition = get_composition_from_string(cell)
                            composition = pmg_util.string.unicodeify(composition)
                            table['rows'][row_index][col] = composition
                        except (CompositionError, ValueError, OverflowError):
                            try:
                                # https://stackoverflow.com/a/38020041
                                result = urlparse(cell)
                                if not all([result.scheme, result.netloc, result.path]):
                                    break
                                is_url_column = True
                            except Exception:
                                break
                else:
                    value, unit = cell_split  # TODO convert cell_split[0] to float?
                    is_url_column = False
                    try:
                        float(value)  # unit is only a unit if value is a number
                    except ValueError:
                        continue
                    table['rows'][row_index].pop(old_col)
                    if prev_unit is None:
                        prev_unit = unit
                        col = '{} [{}]'.format(col, unit)
                    table['rows'][row_index][col] = cell if prev_unit != unit else value

            cell_type = 'uri' if is_url_column else 'string'

        col_split = col.split('##')
        nesting = [col_split[0]] if len(col_split) > 1 else []
        table['columns'].append({
            'name': col, 'cell': cell_type, 'nesting': nesting, 'editable': 0
        })
        if len(col_split) > 1:
            table['columns'][-1].update({'label': '##'.join(col_split[1:])})
        if len(table['columns']) > 12:
            table['columns'][-1]['renderable'] = 0

    # group '##'-nested columns that share a unit so the unit can be lifted
    # into the nesting label
    header = RecursiveDict()
    for idx, col in enumerate(table['columns']):
        if 'label' in col:
            k, sk = col['name'].split('##')
            sk_split = sk.split()
            if len(sk_split) == 2:
                d = {'name': sk_split[0], 'unit': sk_split[1], 'idx': idx}
                if k not in header:
                    header[k] = [d]
                else:
                    header[k].append(d)
            elif k in header:
                header.pop(k)

    for k, skl in header.items():
        units = [sk['unit'] for sk in skl]
        if units.count(units[0]) == len(units):
            for sk in skl:
                table['columns'][sk['idx']]['label'] = sk['name']
                table['columns'][sk['idx']]['nesting'][0] = '{} {}'.format(k, sk['unit'])

    return table
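# Illustrative sketch (standalone, hypothetical data): the unit-extraction step
# in `to_backgrid_dict` reduced to its core. Cells like '1.23 eV' are split into
# value and unit; as long as the unit is uniform, it moves into the column
# header and the cells keep only the numeric part.
cells = ['1.23 eV', '2.05 eV', '0.98 eV']
prev_unit, col = None, 'E_g'
rows = []
for cell in cells:
    value, unit = cell.split(' ', 1)
    float(value)  # a non-numeric prefix would raise and skip unit handling
    if prev_unit is None:
        prev_unit = unit
        col = '{} [{}]'.format(col, unit)
    rows.append({col: cell if prev_unit != unit else value})
print(col, rows)
# E_g [eV] [{'E_g [eV]': '1.23'}, {'E_g [eV]': '2.05'}, {'E_g [eV]': '0.98'}]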
def run(mpfile, **kwargs):
    indir = "/Users/patrick/Downloads/ThinFilmPV"
    summary_data = json.load(open(os.path.join(indir, "SUMMARY.json"), "r"))
    absorption_data = json.load(
        open(os.path.join(indir, "ABSORPTION-CLIPPED.json"), "r"))
    dos_data = json.load(open(os.path.join(indir, "DOS.json"), "r"))
    formulae_data = json.load(
        open(os.path.join(indir, "FORMATTED-FORMULAE.json"), "r"))

    config = RecursiveDict([
        ("SLME_500_nm", ["SLME|500nm", "%"]),
        ("SLME_1000_nm", ["SLME|1000nm", "%"]),
        ("E_g", ["ΔE.corrected", "eV"]),
        ("E_g_d", ["ΔE.direct", "eV"]),
        ("E_g_da", ["ΔE.dipole-allowed", "eV"]),
        ("m_e", ["mᵉ", "mₑ"]),
        ("m_h", ["mʰ", "mₑ"]),
    ])

    print(len(summary_data.keys()))

    for mp_id, d in summary_data.items():
        print(mp_id)
        formula = formulae_data[mp_id].replace("<sub>", "").replace("</sub>", "")
        query = {"identifier": mp_id, "project": "screening_inorganic_pv"}
        # r = db.contributions.update_one(query, {'$set': {'content.data.formula': formula}})
        # print(r.modified_count)
        # continue
        rd = RecursiveDict({"formula": formula})
        for k, v in config.items():
            value = clean_value(d[k], v[1], max_dgts=4)
            if "." not in v[0]:
                rd[v[0]] = value
            else:
                keys = v[0].split(".")
                if keys[0] not in rd:
                    rd[keys[0]] = RecursiveDict({keys[1]: value})
                else:
                    rd[keys[0]][keys[1]] = value

        mpfile.add_hierarchical_data({"data": rd}, identifier=mp_id)
        doc = query.copy()
        doc["content.data"] = mpfile.document[mp_id]["data"]
        doc["collaborators"] = [{
            "name": "Patrick Huck",
            "email": "*****@*****.**"
        }]
        # r = db.contributions.update_one(query, {'$set': doc}, upsert=True)
        # cid = r.upserted_id
        cid = db.contributions.find_one(query, {"_id": 1})["_id"]

        df = DataFrame(data=absorption_data[mp_id])
        df.columns = ["hν [eV]", "α [cm⁻¹]"]
        mpfile.add_data_table(mp_id, df, "absorption")
        table = mpfile.document[mp_id]["absorption"]
        table.pop("@module")
        table.pop("@class")
        table["identifier"] = mp_id
        table["project"] = "screening_inorganic_pv"
        table["name"] = "absorption"
        table["cid"] = cid
        # r = db.tables.insert_one(table)
        # tids = [r.inserted_id]
        r = db.tables.update_one(
            {
                "identifier": mp_id,
                "project": "screening_inorganic_pv",
                "name": "absorption",
                "cid": cid,
            },
            {"$set": table},
        )
        print(len(table["data"]), r.modified_count)
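# Illustrative sketch (hypothetical values): how the dotted names in `config`
# above fan out into nested entries. Plain dicts stand in for RecursiveDict.
config = {"E_g": "ΔE.corrected", "E_g_d": "ΔE.direct", "SLME_500_nm": "SLME|500nm"}
values = {"E_g": "1.1 eV", "E_g_d": "1.3 eV", "SLME_500_nm": "12 %"}
rd = {}
for k, name in config.items():
    value = values[k]
    if "." not in name:
        rd[name] = value
    else:
        outer, inner = name.split(".")
        rd.setdefault(outer, {})[inner] = value
print(rd)
# {'ΔE': {'corrected': '1.1 eV', 'direct': '1.3 eV'}, 'SLME|500nm': '12 %'}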