Example #1
    def get_contributions(self):
        projection = {'_id': 1, 'mp_cat_id': 1, 'content': 1}
        docs = self.query_contributions(projection=projection)
        if not docs:
            raise Exception('No contributions found for ALS Beamline Explorer!')

        data = []
        columns = ['formula', 'cid']
        keys = RecursiveDict([
            ('composition', ['Co', 'Cu', 'Ce']),
            #('position', ['x', 'y']),
            ('XAS', ['min', 'max']),
            ('XMCD', ['min', 'max'])
        ])
        columns += ['##'.join([k, sk]) for k, subkeys in keys.items() for sk in subkeys]

        for doc in docs:
            mpfile = MPFile.from_contribution(doc)
            identifier = mpfile.ids[0]
            contrib = mpfile.hdata[identifier]['data']
            cid_url = self.get_cid_url(doc)
            row = [identifier, cid_url]
            row += [contrib[k][sk] for k, subkeys in keys.items() for sk in subkeys]
            data.append((identifier, row))
        return Table.from_items(data, orient='index', columns=columns)
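Note: the '##' separator encodes one level of nesting in the flat column names; to_backgrid_dict (Example #11) later splits on it to rebuild the hierarchy. A minimal, standalone sketch of the column expansion (plain OrderedDict standing in for RecursiveDict):

from collections import OrderedDict

keys = OrderedDict([
    ('composition', ['Co', 'Cu', 'Ce']),
    ('XAS', ['min', 'max']),
    ('XMCD', ['min', 'max']),
])
columns = ['formula', 'cid']
columns += ['##'.join([k, sk]) for k, subkeys in keys.items() for sk in subkeys]
print(columns)
# ['formula', 'cid', 'composition##Co', 'composition##Cu', 'composition##Ce',
#  'XAS##min', 'XAS##max', 'XMCD##min', 'XMCD##max']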
Example #2
def run(mpfile, **kwargs):

    input_file = mpfile.document['_hdata'].pop('input_file')
    zip_path = os.path.join(os.environ['HOME'], 'work', input_file)
    if not os.path.exists(zip_path):
        return 'Please upload', zip_path
    zip_file = ZipFile(zip_path, 'r')

    composition_table_dict = mpfile.document['_hdata']['composition_table']
    conc_funcs = get_concentration_functions(composition_table_dict)

    for info in zip_file.infolist():
        print(info.filename)
        d = RecursiveDict()

        # positions.x/y from filename, <scan-id>_<meas-element>_<X>_<Y>.csv
        scan_id, element, x, y = os.path.splitext(info.filename)[0].rsplit('_', 3)
        d['position'] = RecursiveDict(
            (k, clean_value(v, 'mm'))
            for k, v in zip(['x', 'y'], [x, y])
        )

        # composition
        d['composition'] = RecursiveDict(
            (el, clean_value(f(x, y), convert_to_percent=True))
            for el, f in conc_funcs.items()
        )

        # identifier
        identifier = get_composition_from_string(''.join([
            '{}{}'.format(el, int(round(Decimal(comp.split()[0]))))
            for el, comp in d['composition'].items()
        ]))

        # load csv file
        try:
            csv = zip_file.read(info.filename)
        except KeyError:
            print('ERROR: Did not find %s in zip file' % info.filename)
            continue  # csv is undefined beyond this point

        # read csv to pandas DataFrame and add to MPFile
        df = read_csv(csv)
        df = df[['Energy', 'XAS', 'XMCD']]

        # min and max
        d.rec_update(RecursiveDict(
            (y, RecursiveDict([
                ('min', df[y].min()), ('max', df[y].max())
            ])) for y in ['XAS', 'XMCD']
        ))

        # add data to MPFile
        mpfile.add_hierarchical_data(nest_dict(d, ['data']), identifier=identifier)
        mpfile.add_data_table(identifier, df, name=element)
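The unpacking above relies on the filename convention from the comment, <scan-id>_<meas-element>_<X>_<Y>.csv; rsplit from the right keeps any underscores inside the scan id intact. A quick standalone check with a hypothetical filename:

import os

filename = 'scan_01_Co_1.5_2.0.csv'  # hypothetical <scan-id>_<meas-element>_<X>_<Y>.csv
scan_id, element, x, y = os.path.splitext(filename)[0].rsplit('_', 3)
print(scan_id, element, x, y)  # scan_01 Co 1.5 2.0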
Example #3
 def organize_player_info(self):
     """organize player info into nested dict"""
     splits = map(self._split_string_at_caps, self.player.index)
     counter = Counter([ el[0] for el in splits if el ])
     subsecs = [key for key, cnt in counter.items() if cnt > 1]
     self.player_info = RecursiveDict({})
     for k, v in self.player.items():
         keys = self._split_string_at_caps(k)
         nested = {keys[0]: {keys[1]: v}} if (
             keys and keys[0] in subsecs
         ) else {'other': {k: v}}
         self.player_info.rec_update(nested)
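_split_string_at_caps is not shown in this snippet; a hypothetical stand-in that matches how its result is used above (a list of CamelCase segments, empty when nothing matches) could look like:

import re

def _split_string_at_caps(s):
    # hypothetical helper: 'FieldGoalsMade' -> ['Field', 'Goals', 'Made']
    return re.findall(r'[A-Z][a-z0-9]*', s)

print(_split_string_at_caps('FieldGoalsMade'))  # ['Field', 'Goals', 'Made']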
Example #4
def run(mpfile):

    google_sheet = "https://docs.google.com/spreadsheets/d/1Wep4LZjehrxu3Cl5KJFvAAhKhP92o4K5aC-kZYjGz2o/export?format=xlsx"
    contcars_filepath = "bulk_CONTCARs.tar.gz"
    contcars = tarfile.open(contcars_filepath)

    df = read_excel(google_sheet)
    keys = df.iloc[[0]].to_dict(orient="records")[0]
    abbreviations = RecursiveDict()

    count, skipped, update = 0, 0, 0
    for index, row in df[1:].iterrows():
        identifier = None
        data = RecursiveDict()

        for col, value in row.items():
            if col == "level_0" or col == "index":
                continue
            key = keys[col]
            if isinstance(key, str):
                key = key.strip()
                if key not in abbreviations:
                    abbreviations[key] = col
            else:
                key = col.strip().lower()

            if key == "pmgmatchid":
                identifier = value.strip()
                if identifier == "None":
                    identifier = None
                name = "_".join(data["directory"].split("/")[1:])
                contcar_path = "bulk_CONTCARs/{}_CONTCAR".format(
                    data["directory"].replace("/", "_"))
                contcar = contcars.extractfile(contcar_path)
                try:
                    if identifier == "mp-34710":
                        identifier = "mp-24878"
                    identifier_match = mpfile.add_structure(
                        contcar.read().decode("utf8"),
                        fmt="poscar",
                        name=name,
                        identifier=identifier,
                    )
                except Exception as ex:
                    print(ex)
                    continue
                if not identifier:
                    identifier = identifier_match
            else:
                if isinstance(value, str):
                    val = value.strip()
                else:
                    unit = units.get(key, "")
                    val = clean_value(value, unit=unit)
                if val != "None":
                    data[key] = val

        mpfile.add_hierarchical_data({"data": data}, identifier=identifier)
        doc = {"identifier": identifier, "project": project, "content": {}}
        doc["content"]["data"] = mpfile.document[identifier]["data"]
        doc["collaborators"] = [{
            "name": "Patrick Huck",
            "email": "*****@*****.**"
        }]
        r = db.contributions.insert_one(doc)
        cid = r.inserted_id
        print("cid:", cid)

        sdct = mpfile.document[identifier]["structures"][name]
        sdct.pop("@module")
        sdct.pop("@class")
        if sdct["charge"] is None:
            sdct.pop("charge")
        sdct["identifier"] = identifier
        sdct["project"] = project
        sdct["name"] = name
        sdct["cid"] = cid
        r = db.structures.insert_one(sdct)
        print("sid:", r.inserted_id)

        r = db.contributions.update_one(
            {"_id": cid}, {"$set": {
                "content.structures": [r.inserted_id]
            }})
        print(r.matched_count, r.modified_count)
Example #5
 def from_contribution(cls, contrib):
     """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
     if "identifier" not in contrib or "content" not in contrib:
         raise ValueError("Dict not in contribution-style format")
     recdict = RecursiveDict({contrib["identifier"]: contrib["content"]})
     return cls.from_dict(recdict)
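Minimal usage sketch, assuming an MPFile subclass is importable and the contribution dict carries the two required keys (values hypothetical):

contrib = {
    'identifier': 'mp-1234',
    'content': {'data': {'formula': 'GaAs'}},
}
mpfile = MPFile.from_contribution(contrib)
# mpfile.document == {'mp-1234': {'data': {'formula': 'GaAs'}}}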
Example #6
 def from_contribution(cls, contrib):
     """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
     if 'identifier' not in contrib or 'content' not in contrib:
         raise ValueError('Dict not in contribution-style format')
     recdict = RecursiveDict({contrib['identifier']: contrib['content']})
     return cls.from_dict(recdict)
Example #7
 def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
     if len(self.ids) >= self.max_contribs:
         raise StopIteration('Reached max. number of contributions in MPFile')
     self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))
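nest_dict wraps the payload under the identifier before the recursive merge into self.document. A simplified stand-in (not the mpcontribs implementation) showing the intended behavior:

def nest_dict(dct, keys):
    # nest dct under the given key path, innermost key last
    for key in reversed(keys):
        dct = {key: dct}
    return dct

print(nest_dict({'data': {'a': 1}}, ['mp-1234']))
# {'mp-1234': {'data': {'a': 1}}}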
Example #8
class MPFileCore(six.with_metaclass(ABCMeta, object)):
    """Abstract Base Class for representing a MP Contribution File"""
    def __init__(self, data=RecursiveDict()):
        if isinstance(data, dict):
            self.document = RecursiveDict(data)
        else:
            raise ValueError('Need dict (or inherited class) to init MPFile.')
        self.document.rec_update() # convert (most) OrderedDict's to RecursiveDict's
        self.unique_mp_cat_ids = True
        self.max_contribs = 10

    def __getitem__(self, key):
        item = self.from_dict({key: self.document[key]})
        general = self.document.get(mp_level01_titles[0])
        if general:
            item.insert_general_section(self.from_dict({mp_level01_titles[0]: general}))
        return item

    @property
    def ids(self):
        return [
            k for k in self.document.keys()
            if k.lower() != mp_level01_titles[0]
        ]

    @property
    def hdata(self):
        return HierarchicalData(self.document)

    @property
    def tdata(self):
        return TabularData(self.document)

    @property
    def gdata(self):
        return GraphicalData(self.document)

    @property
    def sdata(self):
        return StructuralData(self.document)

    @classmethod
    def from_file(cls, filename_or_file=default_mpfile_path.replace('.txt', '_in.txt')):
        """Reads a MPFile from a file.

        Args:
            filename_or_file (str or file): name of file or file containing contribution data.

        Returns:
            MPFile object.
        """
        f = open(filename_or_file) \
            if isinstance(filename_or_file, six.string_types) \
            else filename_or_file
        return cls.from_string(f.read())

    @classmethod
    def from_dict(cls, data=RecursiveDict()):
        return cls(data=data)

    @classmethod
    def from_contribution(cls, contrib):
        """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
        if 'identifier' not in contrib or 'content' not in contrib:
            raise ValueError('Dict not in contribution-style format')
        recdict = RecursiveDict({contrib['identifier']: contrib['content']})
        return cls.from_dict(recdict)

    def write_file(self, filename=default_mpfile_path.replace('.txt', '_out.txt'), **kwargs):
        """Writes MPFile to a file. The supported kwargs are the same as those
        for the MPFile.get_string method and are passed through directly."""
        with codecs.open(filename, encoding='utf-8', mode='w') as f:
            file_str = self.get_string(**kwargs) + '\n'
            f.write(file_str)
            print('{} ({:.3f}MB) written'.format(
                filename, os.path.getsize(filename) / 1024. / 1024.
            ))

    def get_number_of_lines(self, **kwargs):
        return len(self.get_string(**kwargs).split('\n'))

    def split(self):
        general_mpfile = self.pop_first_section() \
                if mp_level01_titles[0] in self.document.keys() else None
        if not self.document:
            raise ValueError('No contributions in MPFile! Either the file is'
                             ' empty or only contains shared (meta-)data not'
                             ' correlated to core identifier.')
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if '--' in mpid_orig:
                    mpid = mpid_orig.split('--')[0]
                    mpfile_single.document.rec_update(nest_dict(
                        mpfile_single.document.pop(mpid_orig), [mpid]
                    ))
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                break

    def get_identifiers(self):
        """list of materials/composition identifiers as tuples w/ contribution IDs"""
        return [
            (k, self.document[k].get('cid', None))
            for k in self.document
            if k.lower() != mp_level01_titles[0]
        ]

    def pop_first_section(self):
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))

    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        for key in reversed(general_data.keys()):
            self.document[root_key].move_to_end(key, last=False)

    def get_unique_mp_cat_id(self, mp_cat_id):
        if not self.unique_mp_cat_ids or mp_cat_id in mp_level01_titles:
            return mp_cat_id
        mp_cat_id_idx = len([i for i in self.ids if i.startswith(mp_cat_id)])
        if mp_cat_id_idx == 0:
            return mp_cat_id
        return '{}--{}'.format(mp_cat_id, mp_cat_id_idx)

    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError('concatenation only possible with single section files')
        except AttributeError:
            raise ValueError('Provide a MPFile to concatenate')
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            if general_title not in self.document:
                self.document.rec_update(nest_dict(general_data, [general_title]))
        self.document.rec_update(nest_dict(
            mpfile.document.pop(mp_cat_id),
            [self.get_unique_mp_cat_id(mp_cat_id)]
        ))

    def insert_top(self, mp_cat_id, key, value):
        """insert value for `mp_cat_id` as `key: <value>` at top"""
        self.document[mp_cat_id][key] = str(value)
        self.document[mp_cat_id].move_to_end(key, last=False)

    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for according plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = ''.join([replacements.get(c, c) for c in name])
        self.document.rec_update(nest_dict(
            Table(dataframe).to_dict(), [identifier, name]
        ))
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options
        )

    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        if len(self.ids) >= self.max_contribs:
            raise StopIteration('Reached max. number of contributions in MPFile')
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))

    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile"""
        from pymatgen import Structure, MPRester
        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError('Need fmt to get structure from string!')
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, 'not supported!')

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError('structure name needs to be a string')
            elif '.' in name:
                raise ValueError('structure name cannot contain dots (.)')

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                'API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`.'
            )
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print(
                    'Structure not found in MP! Please submit via MPComplete to '
                    'obtain mp-id or manually choose an anchor mp-id! Continuing '
                    'with {} as identifier!'.format(identifier)
                )
            else:
                print('Structure not found in MP! Forcing {} as identifier!'.format(identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print('Multiple matching structures found in MP. Using', identifier)
        elif identifier not in matched_mpids:
            msg = 'Structure does not match {} but instead {}!'.format(identifier, matched_mpids)
            raise ValueError(msg)

        idx = len(self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier, {}).get(mp_level01_titles[3], {}):
            sub_key += '_{}'.format(idx)
        self.document.rec_update(nest_dict(
            structure.as_dict(), [identifier, mp_level01_titles[3], sub_key]
        ))
        return identifier

    def __repr__(self):
        return self.get_string(df_head_only=True)

    def __str__(self):
        return self.get_string(df_head_only=True)

    def _ipython_display_(self):
        from IPython.display import display_html
        display_html(self.hdata)
        display_html(self.tdata)
        display_html(self.gdata)
        display_html(self.sdata)

    # ----------------------------
    # Override these in subclasses
    # ----------------------------

    @staticmethod
    def from_string(data):
        """Reads a MPFile from a string containing contribution data."""
        return MPFileCore()

    def get_string(self, df_head_only=False):
        """Returns a string to be written as a file"""
        return repr(self.document)
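Concrete subclasses only have to override the two hooks at the bottom. A toy sketch, assuming a plain-JSON serialization (not the real MPFile text format):

import json

class MPFileJson(MPFileCore):
    """toy subclass: (de)serialize the document as JSON"""
    @staticmethod
    def from_string(data):
        return MPFileJson.from_dict(json.loads(data))

    def get_string(self, df_head_only=False):
        return json.dumps(self.document)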
Example #9
def run(mpfile, include_cifs=True, nmax=None, dup_check_test_site=True):

    data_input = mpfile.document[mp_level01_titles[0]].pop('input')
    phase_names = mpfile.hdata.general['info']['phase_names']
    dir_path = os.path.dirname(os.path.realpath(__file__))
    for k in data_input.keys():
        data_input[k] = os.path.join(dir_path, data_input[k])

    doi = mpfile.hdata.general['doi']
    existing_mpids = {}
    for b in [False, True]:
        with MnO2PhaseSelectionRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria={'content.doi': doi}):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    with open(data_input['formatted_entries'], "r") as fin:
        mp_contrib_phases = json.loads(fin.read())
    with open(data_input['hull_entries'], "r") as fin:
        hull_states = json.loads(fin.read())
    with open(data_input['mpid_existing'], 'r') as fin:
        mp_dup = json.loads(fin.read())
    with open(data_input['mpid_new'], 'r') as fin:
        mp_cmp = json.loads(fin.read())

    ################################################################################################################
    # add unique structures first (special cases)
    ################################################################################################################

    if include_cifs:
        for hstate in hull_states:
            if 'other' == hstate['phase']:
                c = Composition.from_dict(hstate['c'])
                s = Structure.from_dict(hstate['s'])
                for mpid in mpfile.ids:
                    formula = mpfile.hdata[mpid]['data']['Formula']
                    if c.almost_equals(Composition(formula)):
                        if nmax is not None and mpid in existing_mpids:
                            mpfile.document.pop(mpid)  # skip duplicates
                            break
                        try:
                            mpfile.add_structure(s, identifier=mpid)
                            print(formula, 'added to', mpid)
                        except Exception as ex:
                            print('tried to add structure to', mpid, 'but', str(ex))
                        if mpid in existing_mpids:
                            cid = existing_mpids[mpid]
                            mpfile.insert_id(mpid, cid)
                            print(cid, 'inserted to update', mpid)
                        break

    # "phase": 'postspinel-NaMn2O4', "Formula": 'Na0.5MnO2',
    # "dHf (eV/mol)": -1.415, "dHh (eV/mol)": '--', "Ground state?": 'Y',

    ################################################################################################################
    # Get mp-ids for all entries based on matching the VASP directory path names
    # Paths are different in the existing and new mp-id dictionary, so processing has to be independent
    ################################################################################################################

    print('get all mp-ids based on VASP directory paths ...')

    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            c = Composition(phase[0])
            for hstate in hull_states:
                if phase_names[framework] == hstate['phase'] and \
                        c.almost_equals(Composition.from_dict(hstate['c'])) and \
                        len(mp_contrib_phases[framework][i]) < 6:
                    mp_contrib_phases[framework][i].append(hstate['path'])
                    mp_contrib_phases[framework][i].append(hstate['s'])

    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            match_path = phase[4].replace('all_states/', '')
            mp_ids = []
            for path, ids in mp_dup.items():
                mp_path = path.replace(
                    '/Users/patrick/Downloads/20160710_MPContrib_MnO2_DK/',
                    '').replace('/3.double_relax/CONTCAR', '')
                if match_path == mp_path:
                    mp_ids.extend(ids)
            for path, id_dat in mp_cmp.items():
                mp_path = path.replace(
                    '20160725_MnO2_DK_Cifs/20160710_MPContrib_MnO2_DK-',
                    '').replace('-3.double_relax-CONTCAR.cif',
                                '').replace('-', '/')
                if match_path == mp_path:
                    if 'mp_id' in id_dat.keys():
                        mp_ids.append(id_dat['mp_id'])

            mp_contrib_phases[framework][i].append(mp_ids)

    ################################################################################################################
    # For structures that have mp-ids, add them to the contribution dictionary.
    # For those that don't, run a separate dictionary to keep track of them
    ################################################################################################################

    print('add structures with mp-ids to contribution ...')

    no_id_dict = {}
    errors_file = os.path.join(os.path.dirname(__file__), 'errors.json')
    with open(errors_file, 'r') as f:
        errors = json.load(f)

    for framework, fdat in mp_contrib_phases.items():
        for phase in fdat:
            d = RecursiveDict()
            d["Phase"] = framework
            d["Formula"] = phase[0]
            try:
                float(phase[1])
                d["dHf"] = '{} eV/mol'.format(phase[1])
            except (TypeError, ValueError):
                d["dHf"] = '--'
            try:
                float(phase[3])
                d["dHh"] = '{} eV/mol'.format(phase[3])
            except (TypeError, ValueError):
                d["dHh"] = '--'
            d["GS"] = phase[2]
            if len(phase[6]) == 0:
                no_id_dict[phase[4].replace('all_states/', '')] = d
            for mpid in phase[6]:
                if nmax is not None:
                    if len(mpfile.ids) >= nmax - 1:
                        break
                    elif mpid in existing_mpids:
                        continue  # skip duplicates
                mpfile.add_hierarchical_data(RecursiveDict({'data': d}),
                                             identifier=mpid)
                print('added', mpid)
                if mpid in existing_mpids:
                    cid = existing_mpids[mpid]
                    mpfile.insert_id(mpid, cid)
                    print(cid, 'inserted to update', mpid)
                if include_cifs:
                    try:
                        mpfile.add_structure(phase[5], identifier=mpid)
                        print(framework, phase[0], 'added to', mpid)
                    except ValueError as ex:
                        print('tried to add structure to', mpid, 'but', str(ex))
                        errors[mpid] = str(ex)

    with open(errors_file, 'w') as f:
        json.dump(errors, f)

    print """
    DONE.
    {} structures to submit.
    {} structures do not have mp-ids.
    {} structures with mp-ids have errors.
    """.format(len(mpfile.ids), len(no_id_dict), len(errors))
Example #10
def run(mpfile, hosts=None, download=False):
    mpr = MPRester()
    fpath = f"{project}.xlsx"

    if download or not os.path.exists(fpath):

        figshare_id = 1546772
        url = "https://api.figshare.com/v2/articles/{}".format(figshare_id)
        print("get figshare article {}".format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        print("version =",
              figshare["version"])  # TODO set manually in "other"?

        print("read excel from figshare into DataFrame")
        df_dct = None
        for d in figshare["files"]:
            if "xlsx" in d["name"]:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d["download_url"], sheet_name=None)
                break
        if df_dct is None:
            print("no excel sheet found on figshare")
            return

        print("save excel to disk")
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()

    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), "sheets loaded.")

    print("looping hosts ...")
    host_info = df_dct["Host Information"]
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and host not in hosts:
                continue

        print("get mp-id for {}".format(host))
        mpid = None
        for doc in mpr.query(criteria={"pretty_formula": host},
                             properties={"task_id": 1}):
            if "decomposes_to" not in doc["sbxd"][0]:
                mpid = doc["task_id"]
                break
        if mpid is None:
            print("mp-id for {} not found".format(host))
            continue

        print("add host info for {}".format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith("[") else ""
            subkey = "_".join(ks[1:-1] if unit else ks[1:]).split(",")[0]
            if subkey == "lattice_constant":
                unit = "Å"
            try:
                hdata[ks[0]][subkey] = clean_value(
                    v, unit.replace("angstrom", "Å"))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata["formula"] = host
        df = df_dct["{}-X".format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how="all").dropna(
                axis=1)[df.columns[0]]
            note = cells.iloc[0].replace("following", cells.iloc[1])[:-1]
            hdata["note"] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ["data"]),
                                     identifier=mpid)

        print("add table for D₀/Q data for {}".format(mpid))
        df.set_index(df["Solute element number"], inplace=True)
        df.drop("Solute element number", axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = "index"
        df.drop("Solute element name", inplace=True)
        df = df.T.reset_index()
        if str(host) == "Fe":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0, paramagnetic [cm^2/s]",
                "Solute Q, paramagnetic [eV]",
            ]]
        elif hdata["Host"]["crystal_structure"] == "HCP":
            df_D0_Q = df[[
                "Solute element name",
                "Solute D0 basal [cm^2/s]",
                "Solute Q basal [eV]",
            ]]
        else:
            df_D0_Q = df[[
                "Solute element name", "Solute D0 [cm^2/s]", "Solute Q [eV]"
            ]]
        df_D0_Q.columns = ["Solute", "D₀ [cm²/s]", "Q [eV]"]
        anums = [z[el] for el in df_D0_Q["Solute"]]
        df_D0_Q.insert(0, "Z", Series(anums, index=df_D0_Q.index))
        df_D0_Q.sort_values("Z", inplace=True)
        df_D0_Q.reset_index(drop=True, inplace=True)
        mpfile.add_data_table(mpid, df_D0_Q, "D₀_Q")

        if hdata["Host"]["crystal_structure"] == "BCC":

            print("add table for hop activation barriers for {} (BCC)".format(
                mpid))
            columns_E = ([
                "Hop activation barrier, E_{} [eV]".format(i)
                for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop activation barrier, E_{} [eV]".format(i)
                for i in range(5, 7)
            ])
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = (["Solute"] +
                            ["E{} [eV]".format(i) for i in ["₂", "₃", "₄"]] +
                            ["E`{} [eV]".format(i) for i in ["₃", "₄"]] +
                            ["E``{} [eV]".format(i) for i in ["₃", "₄"]] +
                            ["E{} [eV]".format(i) for i in ["₅", "₆"]])
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (BCC)".format(
                mpid))
            columns_v = ([
                "Hop attempt frequency, v_{} [THz]".format(i)
                for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v_{} [THz]".format(i)
                for i in range(5, 7)
            ])
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = (["Solute"] +
                            ["v{} [THz]".format(i) for i in ["₂", "₃", "₄"]] +
                            ["v`{} [THz]".format(i) for i in ["₃", "₄"]] +
                            ["v``{} [THz]".format(i) for i in ["₃", "₄"]] +
                            ["v{} [THz]".format(i) for i in ["₅", "₆"]])
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "FCC":

            print("add table for hop activation barriers for {} (FCC)".format(
                mpid))
            columns_E = [
                "Hop activation barrier, E_{} [eV]".format(i) for i in range(5)
            ]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "E{} [eV]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (FCC)".format(
                mpid))
            columns_v = [
                "Hop attempt frequency, v_{} [THz]".format(i) for i in range(5)
            ]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + [
                "v{} [THz]".format(i) for i in ["₀", "₁", "₂", "₃", "₄"]
            ]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

        elif hdata["Host"]["crystal_structure"] == "HCP":

            print("add table for hop activation barriers for {} (HCP)".format(
                mpid))
            columns_E = [
                "Hop activation barrier, E_X [eV]",
                "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]",
                "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]",
                "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]",
                "Hop activation barrier, E'_c [eV]",
            ]
            df_E = df[["Solute element name"] + columns_E]
            df_E.columns = ["Solute"] + [
                "Eₓ [eV]",
                "E`ₓ [eV]",
                "Eₐ [eV]",
                "E`ₐ [eV]",
                "E_b [eV]",
                "E`_b [eV]",
                "Eꪱ [eV]",
                "E`ꪱ [eV]",
            ]
            mpfile.add_data_table(mpid, df_E, "hop_activation_barriers")

            print("add table for hop attempt frequencies for {} (HCP)".format(
                mpid))
            columns_v = ["Hop attempt frequency, v_a [THz]"
                         ] + ["Hop attempt frequency, v_X [THz]"]
            df_v = df[["Solute element name"] + columns_v]
            df_v.columns = ["Solute"] + ["vₐ [THz]"] + ["vₓ [THz]"]
            mpfile.add_data_table(mpid, df_v, "hop_attempt_frequencies")

    print("DONE")
Example #11
    def to_backgrid_dict(self):
        """Backgrid-conform dict from DataFrame"""
        # shorten global import times by deferring heavy imports to here
        import numpy as np
        from mpcontribs.io.core.utils import get_composition_from_string
        from pandas import MultiIndex
        import pymatgen.util as pmg_util
        from pymatgen.core.composition import CompositionError

        table = dict()
        nrows_max = 260
        nrows = self.shape[0]
        df = Table(self.head(n=nrows_max)) if nrows > nrows_max else self
        numeric_columns = df.select_dtypes(
            include=[np.number]).columns.tolist()

        if isinstance(df.index, MultiIndex):
            df.reset_index(inplace=True)

        table['columns'] = []
        table['rows'] = super(Table, df).to_dict(orient='records')

        for col_index, col in enumerate(list(df.columns)):
            cell_type = 'number'

            # avoid looping rows to minimize use of `df.iat` (time-consuming in 3d)
            if not col.startswith('level_') and col not in numeric_columns:
                is_url_column, prev_unit, old_col = True, None, col

                for row_index in range(df.shape[0]):
                    cell = str(df.iat[row_index, col_index])
                    cell_split = cell.split(' ', 1)

                    if not cell or len(
                            cell_split) == 1:  # empty cell or no space
                        is_url_column = bool(
                            is_url_column
                            and (not cell or mp_id_pattern.match(cell)))
                        if is_url_column:
                            if cell:
                                value = 'https://materialsproject.org/materials/{}'.format(
                                    cell)
                                table['rows'][row_index][col] = value
                        elif cell:
                            try:
                                composition = get_composition_from_string(cell)
                                composition = pmg_util.string.unicodeify(
                                    composition)
                                table['rows'][row_index][col] = composition
                            except (CompositionError, ValueError,
                                    OverflowError):
                                try:
                                    # https://stackoverflow.com/a/38020041
                                    result = urlparse(cell)
                                    if not all([
                                            result.scheme, result.netloc,
                                            result.path
                                    ]):
                                        break
                                    is_url_column = True
                                except:
                                    break

                    else:
                        value, unit = cell_split  # TODO convert cell_split[0] to float?
                        is_url_column = False
                        try:
                            float(value
                                  )  # unit is only a unit if value is number
                        except ValueError:
                            continue
                        table['rows'][row_index].pop(old_col)
                        if prev_unit is None:
                            prev_unit = unit
                            col = '{} [{}]'.format(col, unit)
                        table['rows'][row_index][
                            col] = cell if prev_unit != unit else value

                cell_type = 'uri' if is_url_column else 'string'

            col_split = col.split('##')
            nesting = [col_split[0]] if len(col_split) > 1 else []
            table['columns'].append({
                'name': col,
                'cell': cell_type,
                'nesting': nesting,
                'editable': 0
            })
            if len(col_split) > 1:
                table['columns'][-1].update(
                    {'label': '##'.join(col_split[1:])})
            if len(table['columns']) > 12:
                table['columns'][-1]['renderable'] = 0

        header = RecursiveDict()
        for idx, col in enumerate(table['columns']):
            if 'label' in col:
                k, sk = col['name'].split('##')
                sk_split = sk.split()
                if len(sk_split) == 2:
                    d = {'name': sk_split[0], 'unit': sk_split[1], 'idx': idx}
                    if k not in header:
                        header[k] = [d]
                    else:
                        header[k].append(d)
                elif k in header:
                    header.pop(k)

        for k, skl in header.items():
            units = [sk['unit'] for sk in skl]
            if units.count(units[0]) == len(units):
                for sk in skl:
                    table['columns'][sk['idx']]['label'] = sk['name']
                    table['columns'][sk['idx']]['nesting'][0] = '{} {}'.format(
                        k, sk['unit'])

        return table
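For orientation, the returned dict has the shape Backgrid expects: a 'columns' list of cell descriptors and a 'rows' list of records. Roughly (values hypothetical):

table = {
    'columns': [
        {'name': 'formula', 'cell': 'string', 'nesting': [], 'editable': 0},
        {'name': 'XAS##min', 'cell': 'number', 'nesting': ['XAS'],
         'editable': 0, 'label': 'min'},
    ],
    'rows': [{'formula': 'CoCuCe', 'XAS##min': -0.1}],
}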
Example #12
 def __init__(self, content=RecursiveDict()):
     super(Tables, self).__init__((key, value)
                                  for key, value in content.items()
                                  if isinstance(value, Table))
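Usage sketch, assuming a section document that maps table names to Table instances (identifier hypothetical):

tables = Tables(mpfile.document['mp-1234'])
for name, table in tables.items():
    print(name, table.shape)  # Table behaves like a DataFrame (see Example #11)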
Example #13
def run(mpfile, **kwargs):

    # extract data from json files
    keys = ['pretty_formula', 'volume']
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            # TODO: extreme values for power factor, zT, effective mass
            # TODO: add a text for the description of each table
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = u'{:g} ų'.format(hdata['volume'])
            cond_eff_mass = u'mₑᶜᵒⁿᵈ'
            hdata[cond_eff_mass] = RecursiveDict()
            names = [u'e₁', u'e₂', u'e₃', u'<m>']
            if 'GGA' not in data:
                print('no GGA key for', mpid)
                continue
            for dt, d in data['GGA']['cond_eff_mass'].items():
                eff_mass = d['300']['1e+18']
                eff_mass.append(np.mean(eff_mass))
                hdata[cond_eff_mass][dt] = RecursiveDict(
                    (names[idx], u'{:.2f} mₑ'.format(x))
                    for idx, x in enumerate(eff_mass))
            seebeck_fix_dop_temp = "Seebeck"
            hdata[seebeck_fix_dop_temp] = RecursiveDict()
            cols = [u'e₁', u'e₂', u'e₃', 'temperature', 'doping']
            for doping_type in ['p', 'n']:
                sbk = [
                    float(i) for i in data['GGA']['seebeck_doping']
                    [doping_type]['300']['1e+18']['eigs']
                ]
                vals = [u'{:.2e} μV/K'.format(s) for s in sbk] + [
                    u'{} K'.format('300'), u'{} cm⁻³'.format('1e+18')
                ]
                hdata[seebeck_fix_dop_temp][doping_type] = RecursiveDict(
                    (k, v) for k, v in zip(cols, vals))

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            cols = ['value', 'temperature', 'doping']
            for prop_name in ['seebeck_doping', 'cond_doping', 'kappa_doping']:
                # TODO install Symbola font if you see squares here (https://fonts2u.com/symbola.font)
                # and select it as standard font in your browser (leave other fonts as is, esp. fixed width)
                if prop_name[0] == 's':
                    lbl, unit = u"Sₘₐₓ", u"μV/K"
                elif prop_name[0] == 'c':
                    lbl, unit = u"σₘₐₓ", u"(Ωms)⁻¹"
                elif prop_name[0] == 'k':
                    lbl, unit = u"κₑ₋ₘᵢₙ", u"W/(mKs)"
                hdata[lbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T (K)']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float,
                                                 prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append(doping_str + u' cm⁻³')
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append(row)

                    arr_prop_avg = np.array(prop_averages)[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]

                    vals = [
                        u'{:.2e} {}'.format(max_v, unit),
                        u'{:.2e} K'.format(temps[arg_max[0]]),
                        u'{:.2e} cm⁻³'.format(dopings[arg_max[1]])
                    ]
                    hdata[lbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile.add_hierarchical_data(nest_dict(hdata, ['data']),
                                         identifier=data['mp_id'])

        finally:
            input_file.close()
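The extremum bookkeeping above uses np.argwhere to recover the (temperature, doping) index of the max/min value. Standalone check:

import numpy as np

arr = np.array([[1.0, 5.0], [3.0, 2.0]])  # rows: temperatures, columns: dopings
max_v = np.max(arr)
arg_max = np.argwhere(arr == max_v)[0]
print(max_v, arg_max)  # 5.0 [0 1]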
Example #14
    def build(self, contributor_email, cid, api_key=None, endpoint=None):
        """update materials/compositions collections with contributed data"""
        cid_short, cid_str = get_short_object_id(cid), str(cid)
        contrib = self.find_contribution(cid)
        if not contrib:
            raise Exception('Contribution {} not found!'.format(cid))
        if contributor_email not in contrib['collaborators']:
            raise ValueError(
                "Build stopped: building contribution {} not "
                "allowed due to insufficient permissions of {}! Ask "
                "someone of {} to make you a collaborator on {}.".format(
                    cid_short, contributor_email, contrib['collaborators'],
                    cid_short))
        from pymatgen.util.provenance import Author
        mpfile = MPFileCore.from_contribution(contrib)
        mp_cat_id = mpfile.ids[0]
        is_mp_id = mp_id_pattern.match(mp_cat_id)
        self.curr_coll = self.materials if is_mp_id else self.compositions
        author = Author.parse_author(contributor_email)
        project = str(author.name).replace('.', '') \
                if 'project' not in contrib else contrib['project']

        nb = nbf.new_notebook()
        if isinstance(self.db, dict):
            contrib.pop('_id')
            if 'cid' in contrib['content']:
                contrib['content'].pop('cid')
            nb['cells'].append(
                nbf.new_code_cell(
                    "from mpcontribs.io.core.mpfile import MPFileCore\n"
                    "from mpcontribs.io.core.recdict import RecursiveDict\n"
                    "mpfile = MPFileCore.from_contribution({})\n"
                    "identifier = '{}'".format(contrib, mp_cat_id)))
        else:
            nb['cells'].append(
                nbf.new_code_cell(
                    "from mpcontribs.rest.rester import MPContribsRester"))
            os.environ['PMG_MAPI_KEY'] = api_key
            os.environ['PMG_MAPI_ENDPOINT'] = endpoint
            nb['cells'].append(
                nbf.new_code_cell(
                    "with MPContribsRester() as mpr:\n"
                    "    mpfile = mpr.find_contribution('{}')\n"
                    "    identifier = mpfile.ids[0]".format(cid)))
        nb['cells'].append(
            nbf.new_markdown_cell("## Contribution #{} for {}".format(
                cid_short, mp_cat_id)))
        nb['cells'].append(nbf.new_markdown_cell("### Hierarchical Data"))
        nb['cells'].append(nbf.new_code_cell("mpfile.hdata[identifier]"))
        if mpfile.tdata[mp_cat_id]:
            nb['cells'].append(nbf.new_markdown_cell("### Tabular Data"))
        for table_name, table in mpfile.tdata[mp_cat_id].items():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(table_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.tdata[identifier]['{}']".format(table_name)))
        if mpfile.gdata[mp_cat_id]:
            nb['cells'].append(nbf.new_markdown_cell("### Graphical Data"))
        for plot_name, plot in mpfile.gdata[mp_cat_id].items():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(plot_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.gdata[identifier]['{}']".format(plot_name)))

        if mpfile.sdata[mp_cat_id]:
            nb['cells'].append(nbf.new_markdown_cell("### Structural Data"))
        for structure_name, structure in mpfile.sdata[mp_cat_id].items():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(structure_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.sdata[identifier]['{}']".format(structure_name)))

        self.ep.preprocess(nb, {'metadata': {'path': self.nbdir}})

        if isinstance(self.db, dict):
            return [mp_cat_id, project, cid_short, export_notebook(nb, cid)]
        else:
            build_doc = RecursiveDict()
            build_doc['mp_cat_id'] = mp_cat_id
            build_doc['project'] = project
            build_doc['nb'] = nb
            self.curr_coll.update({'_id': cid}, {'$set': build_doc},
                                  upsert=True)
            return '{}/{}'.format(  # return URL for contribution page
                ('materials' if is_mp_id else 'compositions'), cid_str)
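The notebook assembly above follows the standard nbformat v4 builder API; the minimal pattern in isolation:

import nbformat.v4 as nbf

nb = nbf.new_notebook()
nb['cells'].append(nbf.new_markdown_cell('## Contribution'))
nb['cells'].append(nbf.new_code_cell("print('hello')"))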
Example #15
def run(mpfile, hosts=None, download=False, **kwargs):
    #mpfile.unique_mp_cat_ids = False
    from pymatgen import MPRester
    mpr = MPRester()

    fpath = os.path.join(os.environ['HOME'], 'work',
                         'dilute_solute_diffusion.xlsx')

    if download or not os.path.exists(fpath):

        figshare_id = mpfile.hdata.general['info']['figshare_id']
        url = 'https://api.figshare.com/v2/articles/{}'.format(figshare_id)
        print('get figshare article {}'.format(figshare_id))
        r = requests.get(url)
        figshare = json.loads(r.content)
        mpfile.document['_hdata']['version'] = figshare['version']

        print('read excel from figshare into DataFrame')
        df_dct = None
        for d in figshare['files']:
            if 'xlsx' in d['name']:
                # Dict of DataFrames is returned, with keys representing sheets
                df_dct = read_excel(d['download_url'], sheet_name=None)
                break
        if df_dct is None:
            print('no excel sheet found on figshare')
            return

        print('save excel to disk')
        writer = ExcelWriter(fpath)
        for sheet, df in df_dct.items():
            df.to_excel(writer, sheet)
        writer.save()

    else:
        df_dct = read_excel(fpath, sheet_name=None)

    print(len(df_dct), 'sheets loaded.')

    print('looping hosts ...')
    host_info = df_dct['Host Information']
    host_info.set_index(host_info.columns[0], inplace=True)
    host_info.dropna(inplace=True)

    for idx, host in enumerate(host_info):
        if hosts is not None:
            if isinstance(hosts, int) and idx + 1 > hosts:
                break
            elif isinstance(hosts, list) and host not in hosts:
                continue

        print('get mp-id for {}'.format(host))
        mpid = None
        for doc in mpr.query(criteria={'pretty_formula': host},
                             properties={'task_id': 1}):
            if doc['sbxd'][0]['decomposes_to'] is None:
                mpid = doc['task_id']
                break
        if mpid is None:
            print('mp-id for {} not found'.format(host))
            continue

        print('add host info for {}'.format(mpid))
        hdata = host_info[host].to_dict(into=RecursiveDict)
        for k in list(hdata.keys()):
            v = hdata.pop(k)
            ks = k.split()
            if ks[0] not in hdata:
                hdata[ks[0]] = RecursiveDict()
            unit = ks[-1][1:-1] if ks[-1].startswith('[') else ''
            subkey = '_'.join(ks[1:-1] if unit else ks[1:]).split(',')[0]
            if subkey == "lattice_constant":
                unit = u'Å'
            try:
                hdata[ks[0]][subkey] = clean_value(
                    v, unit.replace('angstrom', u'Å'))
            except ValueError:
                hdata[ks[0]][subkey] = v
        hdata['formula'] = host
        df = df_dct['{}-X'.format(host)]
        rows = list(isnull(df).any(1).nonzero()[0])
        if rows:
            cells = df.iloc[rows].dropna(how='all').dropna(axis=1)[df.columns[0]]
            note = cells.iloc[0].replace('following', cells.iloc[1])[:-1]
            hdata['note'] = note
            df.drop(rows, inplace=True)
        mpfile.add_hierarchical_data(nest_dict(hdata, ['data']),
                                     identifier=mpid)

        print('add table for D₀/Q data for {}'.format(mpid))
        df.set_index(df['Solute element number'], inplace=True)
        df.drop('Solute element number', axis=1, inplace=True)
        df.columns = df.iloc[0]
        df.index.name = 'index'
        df.drop('Solute element name', inplace=True)
        df = df.T.reset_index()
        if str(host) == 'Fe':
            df_D0_Q = df[[
                'Solute element name', 'Solute D0, paramagnetic [cm^2/s]',
                'Solute Q, paramagnetic [eV]'
            ]]
        elif hdata['Host']['crystal_structure'] == 'HCP':
            df_D0_Q = df[[
                'Solute element name', 'Solute D0 basal [cm^2/s]',
                'Solute Q basal [eV]'
            ]]
        else:
            df_D0_Q = df[[
                'Solute element name', 'Solute D0 [cm^2/s]', 'Solute Q [eV]'
            ]]
        df_D0_Q.columns = ['El.', 'D₀ [cm²/s]', 'Q [eV]']
        mpfile.add_data_table(mpid, df_D0_Q, 'D₀_Q')

        if hdata['Host']['crystal_structure'] == 'BCC':

            print('add table for hop activation barriers for {} (BCC)'.format(
                mpid))
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i)
                for i in range(2, 5)
            ] + [
                "Hop activation barrier, E'_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop activation barrier, E''_{} [eV]".format(i)
                for i in range(3, 5)
            ] + [
                'Hop activation barrier, E_{} [eV]'.format(i)
                for i in range(5, 7)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₂', '₃', '₄']
            ] + ['E`{} [eV]'.format(i) for i in ['₃', '₄']] + [
                'E``{} [eV]'.format(i) for i in ['₃', '₄']
            ] + ['E{} [eV]'.format(i) for i in ['₅', '₆']]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (BCC)'.format(
                mpid))
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i)
                for i in range(2, 5)
            ] + [
                "Hop attempt frequency, v'_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                "Hop attempt frequency, v''_{} [THz]".format(i)
                for i in range(3, 5)
            ] + [
                'Hop attempt frequency, v_{} [THz]'.format(i)
                for i in range(5, 7)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₂', '₃', '₄']
            ] + ['v`{} [THz]'.format(i) for i in ['₃', '₄']] + [
                'v``{} [THz]'.format(i) for i in ['₃', '₄']
            ] + ['v{} [THz]'.format(i) for i in ['₅', '₆']]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'FCC':

            print('add table for hop activation barriers for {} (FCC)'.format(
                mpid))
            columns_E = [
                'Hop activation barrier, E_{} [eV]'.format(i) for i in range(5)
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'E{} [eV]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print('add table for hop attempt frequencies for {} (FCC)'.format(
                mpid))
            columns_v = [
                'Hop attempt frequency, v_{} [THz]'.format(i) for i in range(5)
            ]
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + [
                'v{} [THz]'.format(i) for i in ['₀', '₁', '₂', '₃', '₄']
            ]
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

        elif hdata['Host']['crystal_structure'] == 'HCP':

            print 'add table for hop activation barriers for {} (HCP)'.format(
                mpid)
            columns_E = [
                "Hop activation barrier, E_X [eV]",
                "Hop activation barrier, E'_X [eV]",
                "Hop activation barrier, E_a [eV]",
                "Hop activation barrier, E'_a [eV]",
                "Hop activation barrier, E_b [eV]",
                "Hop activation barrier, E'_b [eV]",
                "Hop activation barrier, E_c [eV]",
                "Hop activation barrier, E'_c [eV]"
            ]
            df_E = df[['Solute element name'] + columns_E]
            df_E.columns = ['El.'] + [
                'Eₓ [eV]', 'E`ₓ [eV]', 'Eₐ [eV]', 'E`ₐ [eV]', 'E_b [eV]',
                'E`_b [eV]', 'E_c [eV]', 'E`_c [eV]'
            ]
            mpfile.add_data_table(mpid, df_E, 'hop_activation_barriers')

            print 'add table for hop attempt frequencies for {} (HCP)'.format(
                mpid)
            columns_v = ['Hop attempt frequency, v_a [THz]',
                         'Hop attempt frequency, v_X [THz]']
            df_v = df[['Solute element name'] + columns_v]
            df_v.columns = ['El.'] + ['vₐ [THz]'] + ['vₓ [THz]']
            mpfile.add_data_table(mpid, df_v, 'hop_attempt_frequencies')

            print mpfile
    print 'DONE'
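The renaming blocks above all repeat one pattern: verbose spreadsheet headers such as 'Hop activation barrier, E_2 [eV]' are collapsed into compact labels with Unicode subscripts (a backtick stands in for the prime in E'/E''). A minimal sketch of a generic helper capturing that pattern — hypothetical, not part of the original script:

SUBSCRIPTS = {'0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄', '5': '₅', '6': '₆'}

def short_label(header):
    """Sketch: 'Hop activation barrier, E_2 [eV]' -> 'E₂ [eV]'."""
    symbol, unit = header.split(', ', 1)[1].split(' ', 1)  # 'E_2', '[eV]'
    base, _, index = symbol.partition('_')                 # 'E', '2'
    return '{}{} {}'.format(base, SUBSCRIPTS.get(index, index), unit)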
Example #16
def run(mpfile, nmax=None, dup_check_test_site=True):

    existing_mpids = {}
    for b in [False, True]:
        with DibbsRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria=mpr.dibbs_query):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    general = mpfile.document[mp_level01_titles[0]]
    input_file = general.pop('input_file')
    df = read_excel(input_file)
    columns_map = RecursiveDict([
        (v, k) for k, v in general.pop('columns_map').items()
    ])
    columns = columns_map.keys()
    df = df[columns]
    df = df[notnull(df[columns[-1]]) & notnull(df[columns[1]])]
    mpfile.add_hierarchical_data({'title': 'DIBBS - 27Al NMR'})

    count, skipped, update = 0, 0, 0
    for idx, row in df.iterrows():
        url = row[columns[-1]]
        if not url.startswith('http'):
            continue

        # hierarchical data
        d = RecursiveDict()
        for col in columns[:4]:
            d[columns_map[col]] = unidecode(row[col]) \
                    if isinstance(row[col], six.string_types) else row[col]

        if d['name'] in [
                'AlPO4 Tridymite (AlPO4-t)', 'Basic aluminum sulfate',
                'Cryolite', 'berlinite(AlPO4-Q)'
        ]:
            continue

        d['data'] = RecursiveDict()
        for col in columns[4:8]:
            if notnull(row[col]):
                value = unicode('{}'.format(row[col]), 'utf-8')
                if col == columns[4]:
                    value += ' ppm'
                elif col == columns[6]:
                    value += ' MHz'
                elif col == columns[7]:
                    value = ' '.join([value[:-1], value[-1]])
            else:
                value = u''
            d['data'][columns_map[col]] = value

        # structure
        if url.startswith('https://materialsproject.org'):
            mpid = url.split('/')[-2]
        else:
            #print 'retrieve cif and match to MP structure ...'
            d[columns_map[columns[-1]]] = url
            f = requests.get(url)

            try:
                mpid = mpfile.add_structure(f.text, name=d['name'], fmt='cif')
            except ValueError as ex:
                print d['name'], str(ex)
                continue

            if nmax is not None and mpid in existing_mpids:
                item = mpfile.document.popitem(last=True)
                print 'removed duplicate', mpid

        if nmax is not None and mpid in existing_mpids:
            print 'skipping', mpid
            skipped += 1
            continue  # skip duplicates

        mpfile.add_hierarchical_data(d, identifier=mpid)
        print 'added {} ({})'.format(d['name'], mpid)

        if mpid in existing_mpids:
            cid = existing_mpids[mpid]
            mpfile.insert_id(mpid, cid)
            update += 1
        if nmax is not None and count >= nmax - 1:
            break
        count += 1

    print len(mpfile.ids), 'mp-ids to submit.'
    if nmax is None and update > 0:
        print update, 'mp-ids to update.'
    if nmax is not None and skipped > 0:
        print skipped, 'duplicates to skip.'
Example #17
class DataGenerator(object):
    """generate MP-like data from baseball database

    database: http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip
    """
    def __init__(self):
        try:
            from faker import Faker
            self.fake = Faker()
        except ImportError:
            self.fake = None
        self.master = os.path.join(csv_database, 'Master.csv')
        self.player = None
        self.player_id = None
        self.player_info = None
        self.player_data = None

    def set_player(self):
        """retrieve player from master file as pandas.Series"""
        df = read_csv(self.master, index_col=0)
        self.player_id = self.fake.random_element(elements=df.index)
        self.player = df.xs(self.player_id).dropna()

    def _split_string_at_caps(self, string):
        return re.split(r'([A-Z][a-z]*)', string)[:-1]

    def organize_player_info(self):
        """organize player info into nested dict"""
        splits = map(self._split_string_at_caps, self.player.index)
        counter = Counter([ el[0] for el in splits if el ])
        subsecs = [key for key,cnt in counter.iteritems() if cnt > 1]
        self.player_info = RecursiveDict({})
        for k,v in self.player.iteritems():
            keys = self._split_string_at_caps(k)
            nested = {keys[0]: {keys[1]: v}} if (
                keys and keys[0] in subsecs
            ) else {'other': {k: v}}
            self.player_info.rec_update(nested)

    def generate_dataset_for_player(self):
        """generate a dataset for a player"""
        for file_name in os.listdir(csv_database):
            if file_name == 'Master.csv': continue
            try:
                df = read_csv(os.path.join(csv_database, file_name))
            except Exception:
                continue
            if 'playerID' not in df.columns: continue
            dataset = df[df['playerID']==self.player_id].dropna()
            if dataset.empty or dataset.shape[0] < 2: continue
            cols = [
                col for col in dataset.columns
                if not dataset[col].sum()
            ]
            self.player_data = dataset.drop(cols+['playerID'], axis=1)
            if self.player_data is not None:
                break

    def init(self, keep_dataset=False):
        """call all setters for a player"""
        if not keep_dataset:
            self.set_player()
            self.organize_player_info()
        self.generate_dataset_for_player()
        if self.player_data is None:
            # try different player if no dataset found
            self.init(keep_dataset=keep_dataset)
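A minimal usage sketch for the class above; it assumes the Lahman CSVs are unpacked at csv_database and the faker package is installed (nothing here comes from the original source):

dg = DataGenerator()
dg.init()                     # pick a random player, retry until a dataset is found
print(dg.player_id)           # Lahman playerID of the chosen player
print(dg.player_data.head())  # per-season stats as a pandas DataFrame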
Example #18
def run(mpfile, **kwargs):

    indir = "/Users/patrick/Downloads/ThinFilmPV"
    summary_data = json.load(open(os.path.join(indir, "SUMMARY.json"), "r"))
    absorption_data = json.load(
        open(os.path.join(indir, "ABSORPTION-CLIPPED.json"), "r"))
    dos_data = json.load(open(os.path.join(indir, "DOS.json"), "r"))
    formulae_data = json.load(
        open(os.path.join(indir, "FORMATTED-FORMULAE.json"), "r"))
    config = RecursiveDict([
        ("SLME_500_nm", ["SLME|500nm", "%"]),
        ("SLME_1000_nm", ["SLME|1000nm", "%"]),
        ("E_g", ["ΔE.corrected", "eV"]),
        ("E_g_d", ["ΔE.direct", "eV"]),
        ("E_g_da", ["ΔE.dipole-allowed", "eV"]),
        ("m_e", ["mᵉ", "mₑ"]),
        ("m_h", ["mʰ", "mₑ"]),
    ])

    print(len(summary_data.keys()))
    for mp_id, d in summary_data.items():
        print(mp_id)
        formula = formulae_data[mp_id].replace("<sub>",
                                               "").replace("</sub>", "")
        query = {"identifier": mp_id, "project": "screening_inorganic_pv"}
        # r = db.contributions.update_one(query, {'$set': {'content.data.formula': formula}})
        # print(r.modified_count)
        # continue

        rd = RecursiveDict({"formula": formula})
        for k, v in config.items():
            value = clean_value(d[k], v[1], max_dgts=4)
            if not "." in v[0]:
                rd[v[0]] = value
            else:
                keys = v[0].split(".")
                if not keys[0] in rd:
                    rd[keys[0]] = RecursiveDict({keys[1]: value})
                else:
                    rd[keys[0]][keys[1]] = value

        mpfile.add_hierarchical_data({"data": rd}, identifier=mp_id)

        doc = query.copy()
        doc["content.data"] = mpfile.document[mp_id]["data"]
        doc["collaborators"] = [{
            "name": "Patrick Huck",
            "email": "*****@*****.**"
        }]
        # r = db.contributions.update_one(query, {'$set': doc}, upsert=True)
        # cid = r.upserted_id
        cid = db.contributions.find_one(query, {"_id": 1})["_id"]

        df = DataFrame(data=absorption_data[mp_id])
        df.columns = ["hν [eV]", "α [cm⁻¹]"]
        mpfile.add_data_table(mp_id, df, "absorption")
        table = mpfile.document[mp_id]["absorption"]
        table.pop("@module")
        table.pop("@class")
        table["identifier"] = mp_id
        table["project"] = "screening_inorganic_pv"
        table["name"] = "absorption"
        table["cid"] = cid
        # r = db.tables.insert_one(table)
        # tids = [r.inserted_id]
        r = db.tables.update_one(
            {
                "identifier": mp_id,
                "project": "screening_inorganic_pv",
                "name": "absorption",
                "cid": cid,
            },
            {"$set": table},
        )
        print(len(table["data"]), r.modified_count)
Example #19
    def from_items(cls, rdct, **kwargs):
        return super(Table, cls).from_dict(RecursiveDict(rdct), **kwargs)
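A hedged usage sketch for the one-liner above, assuming Table wraps a pandas DataFrame; the (identifier, row) item shape mirrors how it is called elsewhere in these examples:

items = [('mp-1', ['Fe2O3', '<cid-url>']), ('mp-2', ['TiO2', '<cid-url>'])]
table = Table.from_items(items, orient='index', columns=['formula', 'cid'])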
Example #20
def run(mpfile, include_cifs=True, **kwargs):

    from pymatgen.core.composition import Composition
    from pymatgen.core.structure import Structure

    data_input = mpfile.document[mp_level01_titles[0]].pop('input')
    phase_names = mpfile.hdata.general['phase_names']
    dir_path = os.path.dirname(os.path.realpath(__file__))
    for k in data_input.keys():
        data_input[k] = os.path.join(dir_path, data_input[k])

    with open(data_input['formatted_entries'], "r") as fin:
        mp_contrib_phases = json.loads(fin.read())
    with open(data_input['hull_entries'], "r") as fin:
        hull_states = json.loads(fin.read())
    with open(data_input['mpid_existing'], 'r') as fin:
        mp_dup = json.loads(fin.read())
    with open(data_input['mpid_new'], 'r') as fin:
        mp_cmp = json.loads(fin.read())

    ################################################################################################################
    # add unique structures first (special cases)
    ################################################################################################################

    if include_cifs:
        for hstate in hull_states:
            if 'other' == hstate['phase']:
                c = Composition.from_dict(hstate['c'])
                s = Structure.from_dict(hstate['s'])
                for mpid in mpfile.ids:
                    formula = mpfile.hdata[mpid]['data']['Formula']
                    if c.almost_equals(Composition(formula)):
                        try:
                            mpfile.add_structure(s, identifier=mpid)
                            print formula, 'added to', mpid
                        except Exception as ex:
                            print 'tried to add structure to', mpid, 'but', str(ex)
                        break

    # "phase": 'postspinel-NaMn2O4', "Formula": 'Na0.5MnO2',
    # "ΔH (eV/mol)": -1.415, "ΔHₕ (eV/mol)": '', "Ground state?": 'Y',

    ################################################################################################################
    # Get mp-ids for all entries based on matching the VASP directory path names
    # Paths are different in the existing and new mp-id dictionary, so processing has to be independent
    ################################################################################################################

    print 'get all mp-ids based on VASP directory paths ...'

    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            c = Composition(phase[0])
            for hstate in hull_states:
                if phase_names[framework] == hstate['phase'] and \
                        c.almost_equals(Composition.from_dict(hstate['c'])) and \
                        len(mp_contrib_phases[framework][i]) < 6:
                    mp_contrib_phases[framework][i].append(hstate['path'])
                    mp_contrib_phases[framework][i].append(hstate['s'])

    for framework, fdat in mp_contrib_phases.items():
        for i, phase in enumerate(fdat):
            match_path = phase[4].replace('all_states/', '')
            mp_ids = []
            for path, ids in mp_dup.items():
                mp_path = path.replace(
                    '/Users/patrick/Downloads/20160710_MPContrib_MnO2_DK/',
                    '').replace('/3.double_relax/CONTCAR', '')
                if match_path == mp_path:
                    mp_ids.extend(ids)
            for path, id_dat in mp_cmp.items():
                mp_path = path.replace(
                    '20160725_MnO2_DK_Cifs/20160710_MPContrib_MnO2_DK-',
                    '').replace('-3.double_relax-CONTCAR.cif',
                                '').replace('-', '/')
                if match_path == mp_path:
                    if 'mp_id' in id_dat.keys():
                        mp_ids.append(id_dat['mp_id'])

            mp_contrib_phases[framework][i].append(mp_ids)

    ################################################################################################################
    # For structures that have mp-ids, add them to the contribution dictionary.
    # For those that don't, run a separate dictionary to keep track of them
    ################################################################################################################

    print 'add structures with mp-ids to contribution ...'

    no_id_dict = {}

    for framework, fdat in mp_contrib_phases.items():
        for phase in fdat:
            d = RecursiveDict()
            d["Phase"] = framework
            d["Formula"] = phase[0]
            try:
                float(phase[1])
                d["ΔH"] = clean_value(phase[1], 'eV/mol')
            except (TypeError, ValueError):
                d["ΔH"] = 'N/A eV/mol'
            try:
                float(phase[3])
                d["ΔHₕ"] = clean_value(phase[3], 'eV/mol')
            except (TypeError, ValueError):
                d["ΔHₕ"] = 'N/A eV/mol'
            d["GS"] = 'Yes' if phase[2] == 'Y' else 'No'
            if len(phase[6]) == 0:
                print 'no id for', d['Formula'], d['Phase']
                no_id_dict[phase[4].replace('all_states/', '')] = d
            for mpid in phase[6]:
                if include_cifs:
                    try:
                        mpfile.add_structure(phase[5], identifier=mpid)
                        print framework, phase[0], 'added to', mpid
                    except ValueError as ex:
                        print 'tried to add structure to', mpid, 'but', str(ex)
                mpfile.add_hierarchical_data(RecursiveDict({'data': d}),
                                             identifier=mpid)
                print 'added', mpid
Example #21
    def from_string(data):
        # use archieml-python parse to import data
        rdct = RecursiveDict(loads(data))
        rdct.rec_update()

        # post-process internal representation of file contents
        for key in list(rdct.keys()):
            is_general, root_key = normalize_root_level(key)

            if is_general:
                # make part of shared (meta-)data, i.e. nest under `general` at
                # the beginning of the MPFile
                if mp_level01_titles[0] not in rdct:
                    rdct[mp_level01_titles[0]] = RecursiveDict()
                    rdct.move_to_end(mp_level01_titles[0], last=False)

            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [mp_level01_titles[0]] if is_general else []
            keys.append(root_key)
            if isinstance(value, list):
                keys.append("table")
            rdct.rec_update(nest_dict(value, keys))

            # reference to section to iterate or parse as CIF
            section = (rdct[mp_level01_titles[0]][root_key]
                       if is_general else rdct[root_key])

            # iterate to find CSV sections to parse
            # also parse propnet quantities
            if isinstance(section, dict):
                scope = []
                for k, v in section.iterate():
                    level, key = k
                    key = "".join([replacements.get(c, c) for c in key])
                    level_reduction = bool(level < len(scope))
                    if level_reduction:
                        del scope[level:]
                    if v is None:
                        scope.append(key)
                    elif isinstance(v, list) and isinstance(v[0], dict):
                        table = ""
                        for row_dct in v:
                            table = "\n".join([table, row_dct["value"]])
                        pd_obj = read_csv(table)
                        d = nest_dict(pd_obj.to_dict(), scope + [key])
                        section.rec_update(d, overwrite=True)
                        if not is_general and level == 0:
                            section.insert_default_plot_options(pd_obj, key)
                    elif (Quantity is not None
                          and isinstance(v, six.string_types) and " " in v):
                        quantity = Quantity.from_key_value(key, v)
                        d = nest_dict(quantity.as_dict(), scope +
                                      [key])  # TODO quantity.symbol.name
                        section.rec_update(d, overwrite=True)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in section:
                from pymatgen.io.cif import CifParser

                for name in section[mp_level01_titles[3]].keys():
                    cif = section[mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    section[mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))

        return MPFile.from_dict(rdct)
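nest_dict, used throughout these snippets, wraps a value under a path of keys before rec_update merges it into the document. A minimal sketch of the assumed behavior (not the library's actual implementation):

def nest_dict_sketch(dct, keys):
    """nest_dict_sketch({'a': 1}, ['data', 'x']) -> {'data': {'x': {'a': 1}}}"""
    nested = dct
    for key in reversed(keys):
        nested = {key: nested}
    return nested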
Example #22
def run(mpfile, nmax=None, dup_check_test_site=True):

    existing_mpids = {}
    for b in [False, True]:
        with DibbsRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria=mpr.dibbs_query):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    general = mpfile.document[mp_level01_titles[0]]
    input_file = general.pop('input_file')
    df = read_excel(input_file)
    columns_map = RecursiveDict([
        (v, k) for k, v in general.pop('columns_map').items()
    ])
    columns = columns_map.keys()
    df = df[columns]
    df = df[notnull(df[columns[-1]]) & notnull(df[columns[1]])]
    mpfile.add_hierarchical_data({'title': 'DIBBS - 27Al NMR'})

    count, skipped, update = 0, 0, 0
    for idx, row in df.iterrows():
        url = row[columns[-1]]
        if not url.startswith('http'):
            continue

        # hierarchical data
        d = RecursiveDict()
        for col in columns[:4]:
            d[columns_map[col]] = unidecode(row[col]) \
                    if isinstance(row[col], six.string_types) else row[col]

        if d['name'] in [
            'AlPO4 Tridymite (AlPO4-t)', 'Basic aluminum sulfate', 'Cryolite',
            'berlinite(AlPO4-Q)'
        ]:
            continue

        d['data'] = RecursiveDict()
        for col in columns[4:8]:
            if notnull(row[col]):
                value = unicode('{}'.format(row[col]), 'utf-8')
                if col == columns[4]:
                    value += ' ppm'
                elif col == columns[6]:
                    value += ' MHz'
                elif col == columns[7]:
                    value = ' '.join([value[:-1], value[-1]])
            else:
                value = u''
            d['data'][columns_map[col]] = value

        # structure
        if url.startswith('https://materialsproject.org'):
            mpid = url.split('/')[-2]
        else:
            #print 'retrieve cif and match to MP structure ...'
            d[columns_map[columns[-1]]] = url
            f = requests.get(url)

            try:
                mpid = mpfile.add_structure(f.text, name=d['name'], fmt='cif')
            except ValueError as ex:
                print d['name'], str(ex)
                continue

            if nmax is not None and mpid in existing_mpids:
                item = mpfile.document.popitem(last=True)
                print 'removed duplicate', mpid

        if nmax is not None and mpid in existing_mpids:
            print 'skipping', mpid
            skipped += 1
            continue # skip duplicates

        mpfile.add_hierarchical_data(d, identifier=mpid)
        print 'added {} ({})'.format(d['name'], mpid)

        if mpid in existing_mpids:
            cid = existing_mpids[mpid]
            mpfile.insert_id(mpid, cid)
            update += 1
        if nmax is not None and count >= nmax-1:
            break
        count += 1

    print len(mpfile.ids), 'mp-ids to submit.'
    if nmax is None and update > 0:
        print update, 'mp-ids to update.'
    if nmax is not None and skipped > 0:
        print skipped, 'duplicates to skip.'
Example #23
def run(mpfile, nmax=None, dup_check_test_site=True):

    existing_mpids = {}
    for b in [False, True]:
        with PerovskitesDiffusionRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(projection={
                    'content.data.directory': 1,
                    'mp_cat_id': 1
            }):
                key = '_'.join(
                    [doc['mp_cat_id'], doc['content']['data']['directory']])
                existing_mpids[key] = doc['_id']
        if not dup_check_test_site:
            break

    general = mpfile.document[mp_level01_titles[0]]
    google_sheet = general.pop('google_sheet') + '/export?format=xlsx'
    contcars_filepath = general.pop('contcars_filepath')
    contcars = tarfile.open(contcars_filepath)

    df = read_excel(google_sheet)
    keys = df.iloc[[0]].to_dict(orient='records')[0]
    abbreviations = RecursiveDict()

    count, skipped, update = 0, 0, 0
    for index, row in df[1:].iterrows():
        mpid = None
        data = RecursiveDict()
        mpfile_single = MPFile()

        for col, value in row.iteritems():
            if col == 'level_0' or col == 'index':
                continue
            key = keys[col]
            if isinstance(key, string_types):
                key = key.strip()
                if key not in abbreviations:
                    abbreviations[key] = col
            else:
                key = col.strip().lower()

            if key == 'pmgmatchid':
                mpid = value.strip()
                if mpid == 'None':
                    mpid = None
                name = '_'.join(data['directory'].split('/')[1:])
                contcar_path = 'bulk_CONTCARs/{}_CONTCAR'.format(
                    data['directory'].replace('/', '_'))
                contcar = contcars.extractfile(contcar_path)
                mpid_match = mpfile_single.add_structure(contcar.read(),
                                                         fmt='poscar',
                                                         name=name,
                                                         identifier=mpid)
                if not mp_id_pattern.match(mpid_match):
                    print 'skipping', name
                    continue
                mpid = mpid_match
            else:
                data[key] = value

        if mpid is None:
            continue

        mpid_mod = '_'.join([mpid, data['directory']])
        if nmax is not None and mpid_mod in existing_mpids:
            print 'skipping', mpid_mod
            skipped += 1
            continue  # skip duplicates

        mpfile_single.add_hierarchical_data({'data': data}, identifier=mpid)

        if mpid_mod in existing_mpids:
            cid = existing_mpids[mpid_mod]
            mpfile_single.insert_id(mpid, cid)
            update += 1

        mpfile.concat(mpfile_single)

        if nmax is not None and count >= nmax - 1:
            break
        count += 1

    mpfile.add_hierarchical_data({'abbreviations': abbreviations})

    print len(mpfile.ids), 'mp-ids to submit.'
    if nmax is None and update > 0:
        print update, 'mp-ids to update.'
    if nmax is not None and skipped > 0:
        print skipped, 'duplicates to skip.'
Example #24
def get_card(request, cid, db_type=None, mdb=None):
    """
    @api {post} /card/:cid?API_KEY=:api_key Contribution Card/Preview
    @apiVersion 0.2.0
    @apiName PostGetCard
    @apiGroup Contribution

    @apiDescription Either returns a string containing html for hierarchical
    data, or if existent, a list of URLs for static versions of embedded graphs.

    @apiParam {String} api_key User's unique API_KEY
    @apiParam {json} provenance_keys List of provenance keys

    @apiSuccess {String} created_at Response timestamp
    @apiSuccess {Bool} valid_response Response is valid
    @apiSuccess {String} response Response preview of h- or t-data/graphs ("card")

    @apiSuccessExample Success-Response:
        HTTP/1.1 200 OK
        {
            "created_at": "2017-08-09T19:59:59.936618",
            "valid_response": true,
            "response": ["<graph-url>"]
        }
    """
    from mpcontribs.io.core.components import Tree, Plots, render_plot
    from mpcontribs.io.core.utils import nested_dict_iter
    from mpcontribs.io.core.recdict import RecursiveDict, render_dict
    from django.template import Template, Context
    from django.core.urlresolvers import reverse
    from mpcontribs.config import mp_id_pattern
    prov_keys = loads(request.POST.get('provenance_keys', '["title"]'))
    contrib = mdb.contrib_ad.query_contributions(
        {'_id': ObjectId(cid)},
        projection={'_id': 0, 'mp_cat_id': 1, 'content': 1, 'collaborators': 1}
    )[0]
    mpid = contrib['mp_cat_id']
    hdata = Tree(contrib['content'])
    plots = Plots(contrib['content'])
    title = hdata.get('title', 'No title available.')
    descriptions = hdata.get('description', 'No description available.').strip().split('.', 1)
    description = '{}.'.format(descriptions[0])
    if len(descriptions) > 1 and descriptions[1]:
        description += '''<a href="#"
        class="read_more">More &raquo;</a><span class="more_text"
        hidden>{}</span>'''.format(descriptions[1])
    authors = hdata.get('authors', 'No authors available.').split(',', 1)
    provenance = '<h5>{}'.format(authors[0])
    if len(authors) > 1:
        provenance += '''<button class="btn-sm btn-link" type=button
        data-toggle="tooltip" data-placement="bottom"
        data-container="body" title="{}" style="padding: 0px 0px 0px 3px;"
        >et al.</button>'''.format(authors[1].strip())
    provenance += '</h5>'
    dois = hdata.get('dois', hdata.get('urls', '')).split(' ')
    doi_urls = []
    for x in dois:
        if x.startswith('http'):
            doi_urls.append(x)
        else:
            doi_urls.append('https://doi.org/{}'.format(x))
    provenance += ''.join(['''<a href={}
        class="btn btn-link" role=button style="padding: 0"
        target="_blank"><i class="fa fa-book fa-border fa-lg"></i></a>'''.format(x)
        for x in doi_urls if x
    ])
    #if plots:
    #    card = []
    #    for name, plot in plots.items():
    #        filename = '{}_{}.png'.format(mpid, name)
    #        cwd = os.path.dirname(__file__)
    #        filepath = os.path.abspath(os.path.join(
    #            cwd, '..', '..', 'webtzite', 'static', 'img', filename
    #        ))
    #        if not os.path.exists(filepath):
    #            render_plot(plot, filename=filepath)
    #        index = request.build_absolute_uri(reverse('webtzite_index')[:-1])
    #        imgdir = '/'.join([index.rsplit('/', 1)[0], 'static', 'img'])
    #        fileurl = '/'.join([imgdir, filename])
    #        card.append(fileurl)
    #else:
    data = RecursiveDict()
    for idx, (k,v) in enumerate(hdata.get('data', {}).items()):
        data[k] = v
        if idx >= 6:
            break # humans can grasp 7 items quickly
    data = render_dict(data, webapp=True)
    is_mp_id = mp_id_pattern.match(mpid)
    collection = 'materials' if is_mp_id else 'compositions'
    more = reverse('mpcontribs_explorer_contribution', args=[collection, cid])
    card = '''
    <div class="panel panel-default">
        <div class="panel-heading">
            <h4 class="panel-title">
                {}
                <a class="btn-sm btn-default pull-right" role="button"
                   style=" margin-top:-6px;"
                   href="{}" target="_blank">More Info</a>
            </h4>
        </div>
        <div class="panel-body" style="padding-left: 0px">
            <div class="col-md-8" style="padding-top: 0px">
                <blockquote class="blockquote" style="font-size: 13px;">{}</blockquote>
            </div>
            <div class="col-md-4 well" style="padding: 0px 0px 5px 5px;">{}</div>
            <div class="col-md-12" style="padding-right: 0px;">{}</div>
        </div>
    </div>
    <script>
    requirejs(['main'], function() {{
        require(['jquery'], function() {{
            $(function(){{
                $("a.read_more").click(function(event){{
                    event.preventDefault();
                    $(this).parents(".blockquote").find(".more_text").show();
                    $(this).parents(".blockquote").find(".read_more").hide();
                }});
            }});
        }});
    }});
    </script>
    '''.format(
            title, more, description, provenance, data
    )
    return {"valid_response": True, "response": card}
Example #25
def run(mpfile, **kwargs):

    # extract data from json files
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data or 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            hdata = RecursiveDict()
            T, lvl, S2 = '300', '1e+18', None
            pf_key = 'S²σ'
            hdata['temperature'] = T + ' K'
            hdata['doping_level'] = lvl + ' cm⁻³'
            variables = [
                {'key': 'cond_eff_mass', 'name': 'mₑᶜᵒⁿᵈ', 'unit': 'mₑ'},
                {'key': 'seebeck_doping', 'name': 'S', 'unit': 'μV/K'},
                {'key': 'cond_doping', 'name': 'σ', 'unit': '(Ωms)⁻¹'},
            ]
            eigs_keys = ['ε₁', 'ε₂', 'ε₃', '<ε>']

            for v in variables:
                hdata[v['name']] = RecursiveDict()
                for doping_type in ['p', 'n']:
                    if doping_type in data['GGA'][v['key']]:
                        d = data['GGA'][v['key']][doping_type][T][lvl]
                        eigs = map(float, d if isinstance(d, list) else d['eigs'])
                        hdata[v['name']][doping_type] = RecursiveDict(
                            (eigs_keys[neig], clean_value(eig, v['unit']))
                            for neig, eig in enumerate(eigs)
                        )
                        hdata[v['name']][doping_type][eigs_keys[-1]] = clean_value(np.mean(eigs), v['unit'])
                        if v['key'] == 'seebeck_doping':
                            S2 = np.dot(d['tensor'], d['tensor'])
                        elif v['key'] == 'cond_doping':
                            pf = np.mean(np.linalg.eigh(np.dot(S2, d['tensor']))[0]) * 1e-8
                            if pf_key not in hdata:
                                hdata[pf_key] = RecursiveDict()
                            hdata[pf_key][doping_type] = {eigs_keys[-1]: clean_value(pf, 'μW/(cmK²s)')}


            mpfile_data = nest_dict(hdata, ['data'])

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            keys = ['pretty_formula', 'volume']
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = clean_value(hdata['volume'], 'Å³')
            hdata['bandgap'] = clean_value(data['gap']['GGA'], 'eV')
            cols = ['value', 'temperature', 'doping']
            tables = RecursiveDict()
            props = RecursiveDict()
            props['seebeck_doping'] = ['S', 'μV/K']
            props['cond_doping'] = ['σ', '(Ωms)⁻¹']
            props['kappa_doping'] = ['κₑ', 'W/(mKs)']

            for prop_name, (lbl, unit) in props.iteritems():
                # TODO install Symbola font if you see squares here (https://fonts2u.com/symbola.font)
                # and select it as standard font in your browser (leave other fonts as is, esp. fixed width)
                tables[lbl] = RecursiveDict()
                hlbl = lbl+'₋' if len(lbl) > 1 else lbl
                hlbl += 'ₑₓₜᵣ'
                hdata[hlbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T [K]']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float, prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append('{} cm⁻³ [{}]'.format(doping_str, unit))
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append((temp, row))

                    tables[lbl][doping_type] = Table.from_items(
                        prop_averages, orient='index', columns=columns
                    )

                    arr_prop_avg = np.array([item[1] for item in prop_averages])[:,1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg==max_v)[0]

                    vals = [
                        clean_value(max_v, unit),
                        clean_value(temps[arg_max[0]], 'K'),
                        clean_value(dopings[arg_max[1]], 'cm⁻³')
                    ]
                    hdata[hlbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals)
                    )

            mpfile_data.rec_update(nest_dict(hdata, ['extra_data']))
            mpfile.add_hierarchical_data(mpfile_data, identifier=data['mp_id'])
            for lbl, dct in tables.iteritems():
                for doping_type, table in dct.iteritems():
                    mpfile.add_data_table(
                        data['mp_id'], table, name='{}({})'.format(lbl, doping_type)
                    )

        finally:
            input_file.close()
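The S²σ block above computes a tensor power factor: it squares the Seebeck tensor, multiplies by the conductivity tensor, and averages the eigenvalues. A hedged numeric sketch of just that step (tensor values are made up; the 1e-8 scaling is copied from the snippet):

import numpy as np

S = np.diag([100.0, 110.0, 120.0])  # Seebeck tensor (made-up values)
sigma = np.diag([1e5, 1e5, 9e4])    # conductivity tensor (made-up values)
S2 = np.dot(S, S)
pf = np.mean(np.linalg.eigh(np.dot(S2, sigma))[0]) * 1e-8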
Example #26
def run(mpfile, **kwargs):
    # TODO clone solar_perovskite if needed, abort if insufficient permissions
    try:
        import solar_perovskite
        from solar_perovskite.core import GetExpThermo
        from solar_perovskite.init.find_structures import FindStructures
        from solar_perovskite.init.import_data import Importdata
        from solar_perovskite.modelling.from_theo import EnthTheo
    except ImportError:
        print("could not import solar_perovskite, clone github repo")
        sys.exit(1)

    input_files = mpfile.hdata.general["input_files"]
    input_dir = os.path.dirname(solar_perovskite.__file__)
    input_file = os.path.join(input_dir, input_files["exp"])
    exp_table = read_csv(open(input_file, "r").read().replace(";", ","))
    print("exp data loaded.")
    with open(os.path.join(input_dir, input_files["theo"]), "r") as f:
        theo_data = json.loads(f.read()).pop("collection")
    print("theo data loaded.")
    with open(input_files["energy"], "r") as f:
        data = json.load(f).pop("collection")
    print("energy data loaded.")
    l = [
        dict(sdoc, parameters=doc["_id"]) for doc in data
        for sdoc in doc["energy_analysis"]
    ]
    frame = pd.DataFrame(l)
    parameters = frame["parameters"]
    frame.drop(labels=["parameters"], axis=1, inplace=True)
    frame.insert(0, "parameters", parameters)
    print("energy dataframe:", frame.shape)

    mpfile_singles = [m for m in mpfile.split()]
    for mpfile_single in mpfile_singles:
        identifier = mpfile_single.ids[0]
        # if identifier in run.existing_identifiers:
        #    print (not updating', identifier)
        #    continue
        if identifier != "mp-1076585":
            continue
        hdata = mpfile_single.hdata[identifier]
        print(identifier)

        print("add hdata ...")
        d = RecursiveDict()
        d["data"] = RecursiveDict()
        compstr = hdata["pars"]["theo_compstr"]
        row = exp_table.loc[exp_table["theo_compstr"] == compstr]
        if not row.empty:
            sample_number = int(row.iloc[0]["sample_number"])
            d["pars"] = get_fit_pars(sample_number)
            d["data"]["availability"] = "Exp+Theo"
        else:
            d["pars"] = RecursiveDict()
            d["data"]["availability"] = "Theo"
        # print('dh_min, dh_max ...')
        # _, dh_min, dh_max, _ = redenth_act(compstr)
        # d['pars']['dh_min'] = clean_value(dh_min, max_dgts=4)
        # d['pars']['dh_max'] = clean_value(dh_max, max_dgts=4)
        # d['pars']['elastic'] = RecursiveDict()
        # print('debye temps ...')
        # d['pars']['elastic']['debye_temp'] = RecursiveDict()
        # try:
        #    t_d_perov = get_debye_temp(identifier)
        #    t_d_brownm = get_debye_temp(hdata['data']['reduced_phase']['closest-MP'])
        #    tensors_available = 'True'
        # except TypeError:
        #    t_d_perov = get_debye_temp("mp-510624")
        #    t_d_brownm = get_debye_temp("mp-561589")
        #    tensors_available = 'False'
        # d['pars']['elastic']['debye_temp']['perovskite'] = clean_value(t_d_perov, max_dgts=6)
        # d['pars']['elastic']['debye_temp']['brownmillerite'] = clean_value(t_d_brownm, max_dgts=6)
        # d['pars']['elastic']['tensors_available'] = tensors_available
        d["pars"]["last_updated"] = str(datetime.now())
        mpfile_single.add_hierarchical_data(d, identifier=identifier)

        # for process in processes:
        #    if process != "AS":
        #        t_ox_l = t_ox_ws_cs
        #        t_red_l = t_red_ws_cs
        #        p_ox_l = p_ox_ws_cs
        #        p_red_l = p_red_ws_cs
        #        data_source = ["Theo"]
        #    else:
        #        t_ox_l = t_ox_airsep
        #        t_red_l = t_red_airsep
        #        p_ox_l = p_ox_airsep
        #        p_red_l = p_red_airsep
        #        data_source = ["Theo", "Exp"]

        #    for red_temp in t_red_l:
        #        for ox_temp in t_ox_l:
        #            for ox_pr in p_ox_l:
        #                for red_pr in p_red_l:
        #                    for data_sources in data_source:
        #                        db_id = process + "_" + str(float(ox_temp)) + "_" \
        #                                + str(float(red_temp)) + "_" + str(float(ox_pr)) \
        #                                + "_" + str(float(red_pr)) + "_" + data_sources + \
        #                                "_" + str(float(enth_steps))

        print("add energy analysis ...")
        group = frame.query('compstr.str.contains("{}")'.format(compstr[:-1]))
        group.drop(labels="compstr", axis=1, inplace=True)
        for prodstr, subgroup in group.groupby(["prodstr", "prodstr_alt"],
                                               sort=False):
            subgroup.drop(labels=["prodstr", "prodstr_alt"],
                          axis=1,
                          inplace=True)
            for unstable, subsubgroup in subgroup.groupby("unstable",
                                                          sort=False):
                subsubgroup.drop(labels="unstable", axis=1, inplace=True)
                name = "energy-analysis_{}_{}".format(
                    "unstable" if unstable else "stable", "-".join(prodstr))
                print(name)
                mpfile_single.add_data_table(identifier, subsubgroup, name)

        print(mpfile_single)
        mpfile.concat(mpfile_single)
        break

        if not row.empty:
            print("add ΔH ...")
            exp_thermo = GetExpThermo(sample_number, plotting=False)
            enthalpy = exp_thermo.exp_dh()
            table = get_table(enthalpy, "H")
            mpfile_single.add_data_table(identifier, table, name="enthalpy")

            print("add ΔS ...")
            entropy = exp_thermo.exp_ds()
            table = get_table(entropy, "S")
            mpfile_single.add_data_table(identifier, table, name="entropy")

            print("add raw data ...")
            tga_results = os.path.join(
                os.path.dirname(solar_perovskite.__file__), "tga_results")
            for path in glob(
                    os.path.join(
                        tga_results,
                        "ExpDat_JV_P_{}_*.csv".format(sample_number))):
                print(
                    path.split("_{}_".format(sample_number))[-1].split(".")[0],
                    "...")
                body = open(path, "r").read()
                cols = ["Time [min]", "Temperature [C]", "dm [%]", "pO2"]
                table = read_csv(body,
                                 lineterminator=os.linesep,
                                 usecols=cols,
                                 skiprows=5)
                table = table[cols].iloc[::100, :]
                # scale/shift for better graphs
                T, dm, p = [pd.to_numeric(table[col]) for col in cols[1:]]
                T_min, T_max, dm_min, dm_max, p_max = (
                    T.min(),
                    T.max(),
                    dm.min(),
                    dm.max(),
                    p.max(),
                )
                rT, rdm = abs(T_max - T_min), abs(dm_max - dm_min)
                table[cols[2]] = (dm - dm_min) * rT / rdm
                table[cols[3]] = p * rT / p_max
                table.rename(
                    columns={
                        "dm [%]":
                        "(dm [%] + {:.4g}) * {:.4g}".format(-dm_min, rT / rdm),
                        "pO2":
                        "pO₂ * {:.4g}".format(rT / p_max),
                    },
                    inplace=True,
                )
                mpfile_single.add_data_table(identifier, table, name="raw")
Example #27
    def pop_first_section(self):
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))
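pop_first_section hinges on OrderedDict.popitem(last=False), which removes and returns the first (identifier, section) pair; a minimal illustration:

from collections import OrderedDict

doc = OrderedDict([('mp-1', {'data': {}}), ('mp-2', {'data': {}})])
first = doc.popitem(last=False)  # ('mp-1', {'data': {}}); 'mp-2' stays in doc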
def run(mpfile, **kwargs):

    # extract data from json files
    input_dir = mpfile.hdata.general['input_dir']
    for idx, obj in enumerate(scandir(input_dir)):
        mpid = obj.name.split('.', 1)[0].rsplit('_', 1)[1]
        print(mpid)
        input_file = gzip.open(obj.path, 'rb')
        try:
            data = json.loads(input_file.read())

            # filter out metals
            if 'GGA' not in data or 'GGA' not in data['gap'] or data['gap']['GGA'] < 0.1:
                print('GGA gap < 0.1 -> skip')
                continue

            # add hierarchical data (nested key-values)
            hdata = RecursiveDict()
            T, lvl, S2 = '300', '1e+18', None
            pf_key = 'S²σ'
            hdata['temperature'] = T + ' K'
            hdata['doping_level'] = lvl + ' cm⁻³'
            variables = [
                {
                    'key': 'cond_eff_mass',
                    'name': 'mₑᶜᵒⁿᵈ',
                    'unit': 'mₑ'
                },
                {
                    'key': 'seebeck_doping',
                    'name': 'S',
                    'unit': 'μV/K'
                },
                {
                    'key': 'cond_doping',
                    'name': 'σ',
                    'unit': '(Ωms)⁻¹'
                },
            ]
            eigs_keys = ['ε₁', 'ε₂', 'ε₃', '<ε>']

            for v in variables:
                hdata[v['name']] = RecursiveDict()
                for doping_type in ['p', 'n']:
                    if doping_type in data['GGA'][v['key']]:
                        d = data['GGA'][v['key']][doping_type][T][lvl]
                        eigs = map(float,
                                   d if isinstance(d, list) else d['eigs'])
                        hdata[v['name']][doping_type] = RecursiveDict(
                            (eigs_keys[neig], clean_value(eig, v['unit']))
                            for neig, eig in enumerate(eigs))
                        hdata[v['name']][doping_type][
                            eigs_keys[-1]] = clean_value(
                                np.mean(eigs), v['unit'])
                        if v['key'] == 'seebeck_doping':
                            S2 = np.dot(d['tensor'], d['tensor'])
                        elif v['key'] == 'cond_doping':
                            pf = np.mean(
                                np.linalg.eigh(np.dot(S2,
                                                      d['tensor']))[0]) * 1e-8
                            if pf_key not in hdata:
                                hdata[pf_key] = RecursiveDict()
                            hdata[pf_key][doping_type] = {
                                eigs_keys[-1]: clean_value(pf, 'μW/(cmK²s)')
                            }

            mpfile_data = nest_dict(hdata, ['data'])

            # build data and max values for seebeck, conductivity and kappa
            # max/min values computed using numpy. It may be better to code it in pure python.
            keys = ['pretty_formula', 'volume']
            hdata = RecursiveDict((k, data[k]) for k in keys)
            hdata['volume'] = clean_value(hdata['volume'], 'Å³')
            hdata['bandgap'] = clean_value(data['gap']['GGA'], 'eV')
            cols = ['value', 'temperature', 'doping']
            tables = RecursiveDict()
            props = RecursiveDict()
            props['seebeck_doping'] = ['S', 'μV/K']
            props['cond_doping'] = ['σ', '(Ωms)⁻¹']
            props['kappa_doping'] = ['κₑ', 'W/(mKs)']

            for prop_name, (lbl, unit) in props.iteritems():
                # TODO install Symbola font if you see squares here (https://fonts2u.com/symbola.font)
                # and select it as standard font in your browser (leave other fonts as is, esp. fixed width)
                tables[lbl] = RecursiveDict()
                hlbl = lbl + '₋' if len(lbl) > 1 else lbl
                hlbl += 'ₑₓₜᵣ'
                hdata[hlbl] = RecursiveDict()

                for doping_type in ['p', 'n']:
                    prop = data['GGA'][prop_name][doping_type]
                    prop_averages, dopings, columns = [], None, ['T [K]']
                    temps = sorted(map(int, prop.keys()))
                    for temp in temps:
                        row = [temp]
                        if dopings is None:
                            dopings = sorted(map(float,
                                                 prop[str(temp)].keys()))
                        for doping in dopings:
                            doping_str = '%.0e' % doping
                            if len(columns) <= len(dopings):
                                columns.append('{} cm⁻³ [{}]'.format(
                                    doping_str, unit))
                            eigs = prop[str(temp)][doping_str]['eigs']
                            row.append(np.mean(eigs))
                        prop_averages.append((temp, row))

                    tables[lbl][doping_type] = Table.from_items(
                        prop_averages, orient='index', columns=columns)

                    arr_prop_avg = np.array(
                        [item[1] for item in prop_averages])[:, 1:]
                    max_v = np.max(arr_prop_avg)
                    if prop_name[0] == 's' and doping_type == 'n':
                        max_v = np.min(arr_prop_avg)
                    if prop_name[0] == 'k':
                        max_v = np.min(arr_prop_avg)
                    arg_max = np.argwhere(arr_prop_avg == max_v)[0]

                    vals = [
                        clean_value(max_v, unit),
                        clean_value(temps[arg_max[0]], 'K'),
                        clean_value(dopings[arg_max[1]], 'cm⁻³')
                    ]
                    hdata[hlbl][doping_type] = RecursiveDict(
                        (k, v) for k, v in zip(cols, vals))

            mpfile_data.rec_update(nest_dict(hdata, ['extra_data']))
            mpfile.add_hierarchical_data(mpfile_data, identifier=data['mp_id'])
            for lbl, dct in tables.iteritems():
                for doping_type, table in dct.iteritems():
                    mpfile.add_data_table(data['mp_id'],
                                          table,
                                          name='{}({})'.format(
                                              lbl, doping_type))

        finally:
            input_file.close()
Example #29
    def from_dict(cls, data=RecursiveDict()):
        return cls(data=data)
Example #30
    def from_string(data):
        # use archieml-python parse to import data
        rdct = RecursiveDict(archieml.loads(data))
        rdct.rec_update()
        # post-process internal representation of file contents
        for key in rdct.keys():
            is_general, root_key = normalize_root_level(key)
            if is_general:
                # make part of shared (meta-)data, i.e. nest under `general` at
                # the beginning of the MPFile
                if mp_level01_titles[0] not in rdct:
                    rdct.insert_before(rdct.keys()[0],
                                       (mp_level01_titles[0], RecursiveDict()))
                rdct.rec_update(
                    nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
            else:
                # normalize identifier key (pop & insert)
                # using rec_update since we're looping over all entries
                # also: support data in bare tables (marked-up only by
                #       root-level identifier) by nesting under 'data'
                value = rdct.pop(key)
                keys = [root_key]
                if isinstance(value, list): keys.append('table')
                rdct.rec_update(nest_dict(value, keys))
                # Note: CSV section is marked with 'data ' prefix during iterate()
                for k, v in rdct[root_key].iterate():
                    if isinstance(k, six.string_types) and \
                       k.startswith(mp_level01_titles[1]):
                        # k = table name (incl. data prefix)
                        # v = csv string from ArchieML free-form arrays
                        table_name = k[len(mp_level01_titles[1] + '_'):]
                        pd_obj = read_csv(v)
                        rdct[root_key].pop(table_name)
                        rdct[root_key].rec_update(
                            nest_dict(pd_obj.to_dict(), [k]))
                        rdct[root_key].insert_default_plot_options(pd_obj, k)
                # convert CIF strings into pymatgen structures
                if mp_level01_titles[3] in rdct[root_key]:
                    from pymatgen.io.cif import CifParser
                    for name in rdct[root_key][mp_level01_titles[3]].keys():
                        cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                        parser = CifParser.from_string(cif)
                        structure = parser.get_structures(primitive=False)[0]
                        rdct[root_key][mp_level01_titles[3]].rec_update(
                            nest_dict(structure.as_dict(), [name]))
        return MPFile.from_dict(rdct)
Example #31
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d') # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print 'downloading', dbfile, '...'
            urllib.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'eV')),
            ('op_gap', ('ΔE|optB88vdW', 'eV')),
            ('mbj_gap', ('ΔE|mbj', 'eV')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print len(input_data[project]), 'materials loaded for', project

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula
                    )
                data[project]['id'] = input_urls[project]['detail'].format(d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name, identifier=identifier)
            except Exception as ex:
                print str(ex)
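
run() pops an input_urls mapping from the MPFile header: each project supplies a downloadable 'file' archive plus a 'detail' URL template for per-material links. A plausible shape of that header, with placeholder project names and URLs (assumptions, not taken from a real contribution):

input_urls = {
    'jarvis': {  # tar.gz holding a single JSON document
        'file': 'https://example.org/jdft_{}.json.tgz',  # '{}' is filled with '2d'
        'detail': 'https://example.org/explore/{}',      # '{}' is filled with the jid
    },
    'NUS': {     # gzipped JSON lines; this project becomes the reference
        'file': 'https://example.org/2d_nus.json.gz',
        'detail': 'https://example.org/materials/{}',
    },
}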
Example #32

A reflowed variant of Example #31; apart from line wrapping, the only substantive difference is that the extra quantities are labeled meV/atom instead of eV:
def run(mpfile, **kwargs):
    import os, gzip, json, tarfile, urllib
    from monty.json import MontyDecoder
    from pymatgen import Structure
    # RecursiveDict, nest_dict, clean_value and get_composition_from_string
    # are helpers provided by the surrounding mpcontribs package

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work',
                              input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print 'downloading', dbfile, '...'
            urllib.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print len(input_data[project]), 'materials loaded for', project

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.  # scale the reference project's eV/atom to meV for comparability
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure,
                                     name=name,
                                     identifier=identifier)
            except Exception as ex:
                print str(ex)
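
Both variants branch on the archive extension: a bare .gz is read as JSON lines (one record per line) and marks that project as the reference, while anything else is opened as a tar.gz whose first member holds the full JSON document. The two readers in isolation, a minimal sketch with placeholder paths:

import gzip
import json
import tarfile

def load_json_lines(path):
    # gzipped JSON-lines dump: one record per line
    with gzip.open(path, 'rb') as f:
        return [json.loads(line) for line in f]

def load_tarred_json(path):
    # tar.gz holding a single JSON member
    with tarfile.open(path, 'r:gz') as tar:
        return json.load(tar.extractfile(tar.getmembers()[0]))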