Example #1
def from_string(data):
    # use archieml-python parse to import data
    # (RecursiveDict, nest_dict, normalize_root_level, read_csv and
    #  mp_level01_titles are helpers from the surrounding codebase)
    rdct = RecursiveDict(archieml.loads(data))
    rdct.rec_update()
    # post-process internal representation of file contents;
    # iterate over a copy of the keys since entries are popped below
    for key in list(rdct.keys()):
        is_general, root_key = normalize_root_level(key)
        if is_general:
            # make part of shared (meta-)data, i.e. nest under `general` at
            # the beginning of the MPFile
            if mp_level01_titles[0] not in rdct:
                rdct.insert_before(list(rdct.keys())[0],
                                   (mp_level01_titles[0], RecursiveDict()))
            rdct.rec_update(
                nest_dict(rdct.pop(key), [mp_level01_titles[0], root_key]))
        else:
            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [root_key]
            if isinstance(value, list):
                keys.append('table')
            rdct.rec_update(nest_dict(value, keys))
            # Note: CSV section is marked with 'data ' prefix during iterate()
            for k, v in rdct[root_key].iterate():
                if isinstance(k, six.string_types) and \
                        k.startswith(mp_level01_titles[1]):
                    # k = table name (incl. data prefix)
                    # v = csv string from ArchieML free-form arrays
                    table_name = k[len(mp_level01_titles[1] + '_'):]
                    pd_obj = read_csv(v)
                    rdct[root_key].pop(table_name)
                    rdct[root_key].rec_update(
                        nest_dict(pd_obj.to_dict(), [k]))
                    rdct[root_key].insert_default_plot_options(pd_obj, k)
            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in rdct[root_key]:
                from pymatgen.io.cif import CifParser
                # copy the keys: entries are popped and re-inserted in the loop
                for name in list(rdct[root_key][mp_level01_titles[3]].keys()):
                    cif = rdct[root_key][mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    rdct[root_key][mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))
    return MPFile.from_dict(rdct)
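
The nest_dict helper used throughout these examples wraps a dict under a chain of keys. A minimal sketch of its assumed behavior (the real helper presumably returns a RecursiveDict; a plain dict keeps the sketch self-contained):

def nest_dict(dct, keys):
    # build the nesting from the inside out, e.g.
    # nest_dict({'a': 1}, ['x', 'y']) -> {'x': {'y': {'a': 1}}}
    nested = dct
    for key in reversed(keys):
        nested = {key: nested}
    return nested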
Example #2
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    #input_urls = mpfile.document['_hdata'].pop('input_urls')
    input_urls = {
        'NUS': {
            "file": "http://www.2dmatpedia.org/static/db.json.gz",
            "detail": "http://www.2dmatpedia.org/2dmaterials/doc/{}"
        },
        'JARVIS': {
            "file": "https://www.ctcms.nist.gov/~knc6/jdft_{}.json.tgz",
            "detail": "https://www.ctcms.nist.gov/~knc6/jsmol/{}.html"
        }
    }

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        #dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        dbfile = input_url.rsplit('/')[-1]
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'source_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in set(identifiers):
        print(identifier)
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                if input_keys[project][1] in d:
                    Ex = d[input_keys[project][1]]
                    if project == reference_project:
                        Ex *= 1000.
                    data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)
        #r = db.contributions.update_one(
        #    {'identifier': identifier, 'project': 'jarvis_dft'},
        #    {'$set': {'content.data': mpfile.document[identifier]['data']}},
        #    upsert=True
        #)
        #print(r.matched_count, r.modified_count, r.upserted_id)

        # db is assumed to be a pymongo database handle defined at module scope
        doc = db.contributions.find_one(
            {
                'identifier': identifier,
                'project': 'jarvis_dft'
            }, {
                '_id': 1,
                'content.structures': 1
            })
        if 'structures' in doc['content']:
            print('structures already added for', identifier)
            continue
        print(doc['_id'])

        inserted_ids = []
        for project, structure in structures.items():
            try:
                mpfile.add_structure(structure,
                                     name=project,
                                     identifier=identifier)
                sdct = mpfile.document[identifier]['structures'][project]
                sdct.pop('@module')
                sdct.pop('@class')
                if sdct['charge'] is None:
                    sdct.pop('charge')
                sdct['identifier'] = identifier
                sdct['project'] = 'jarvis_dft'
                sdct['name'] = project
                sdct['cid'] = doc['_id']
                r = db.structures.insert_one(sdct)
                inserted_ids.append(r.inserted_id)
            except Exception as ex:
                print(str(ex))

        print(inserted_ids)
        r = db.contributions.update_one(
            {'_id': doc['_id']},
            {'$set': {
                'content.structures': inserted_ids
            }})
        print(r.matched_count, r.modified_count)
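
clean_value converts a raw number plus unit into the display string stored under each symbol. A hedged stand-in (significant digits and formatting are assumptions, not the actual implementation):

def clean_value(value, unit=''):
    # round to ~4 significant digits and append the unit, e.g.
    # clean_value(0.123456, 'eV') -> '0.1235 eV'
    return '{:.4g} {}'.format(float(value), unit).strip()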
Example #3
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d') # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work', input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'eV')),
            ('op_gap', ('ΔE|optB88vdW', 'eV')),
            ('mbj_gap', ('ΔE|mbj', 'eV')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula
                    )
                data[project]['id'] = input_urls[project]['detail'].format(d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']), identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure, name=name, identifier=identifier)
            except Exception as ex:
                print(str(ex))
Example #4
class MPFileCore(six.with_metaclass(ABCMeta, object)):
    """Abstract Base Class for representing a MP Contribution File"""
    def __init__(self, data=None):
        # avoid a shared mutable default argument: create the dict per instance
        data = RecursiveDict() if data is None else data
        if isinstance(data, dict):
            self.document = RecursiveDict(data)
        else:
            raise ValueError("Need dict (or inherited class) to init MPFile.")
        self.document.rec_update()  # convert (most) OrderedDict's to RecursiveDict's
        self.unique_mp_cat_ids = True
        self.max_contribs = 10

    def __getitem__(self, key):
        item = self.from_dict({key: self.document[key]})
        general = self.document.get(mp_level01_titles[0])
        if general:
            item.insert_general_section(
                self.from_dict({mp_level01_titles[0]: general}))
        return item

    @property
    def ids(self):
        return [
            k for k in self.document.keys()
            if k.lower() != mp_level01_titles[0]
        ]

    @property
    def hdata(self):
        return HierarchicalData(self.document)

    @property
    def tdata(self):
        return TabularData(self.document)

    @property
    def gdata(self):
        return GraphicalData(self.document)

    @property
    def sdata(self):
        return StructuralData(self.document)

    @classmethod
    def from_file(cls,
                  filename_or_file=default_mpfile_path.replace(
                      ".txt", "_in.txt")):
        """Reads a MPFile from a file.

        Args:
            filename_or_file (str or file): name of file or file containing contribution data.

        Returns:
            MPFile object.
        """
        if isinstance(filename_or_file, six.string_types):
            # close the file handle after reading
            with open(filename_or_file) as f:
                return cls.from_string(f.read())
        return cls.from_string(filename_or_file.read())

    @classmethod
    def from_dict(cls, data=None):
        # None instead of a mutable default argument (see __init__)
        return cls(data=data)

    @classmethod
    def from_contribution(cls, contrib):
        """construct MPFile from contribution (see rest.adapter.submit_contribution)"""
        if "identifier" not in contrib or "content" not in contrib:
            raise ValueError("Dict not in contribution-style format")
        recdict = RecursiveDict({contrib["identifier"]: contrib["content"]})
        return cls.from_dict(recdict)

    def write_file(self,
                   filename=default_mpfile_path.replace(".txt", "_out.txt"),
                   **kwargs):
        """Writes MPFile to a file. The supported kwargs are the same as those
        for the MPFile.get_string method and are passed through directly."""
        with codecs.open(filename, encoding="utf-8", mode="w") as f:
            file_str = self.get_string(**kwargs) + "\n"
            f.write(file_str)
            print("{} ({:.3f}MB) written".format(
                filename,
                os.path.getsize(filename) / 1024.0 / 1024.0))

    def get_number_of_lines(self, **kwargs):
        return len(self.get_string(**kwargs).split("\n"))

    def split(self):
        general_mpfile = (self.pop_first_section() if mp_level01_titles[0]
                          in self.document.keys() else None)
        if not self.document:
            raise ValueError("No contributions in MPFile! Either the file is"
                             " empty or only contains shared (meta-)data not"
                             " correlated to core identifier.")
        while True:
            try:
                mpfile_single = self.pop_first_section()
                mpid_orig = mpfile_single.ids[0]
                if "--" in mpid_orig:
                    mpid = mpid_orig.split("--")[0]
                    mpfile_single.document.rec_update(
                        nest_dict(mpfile_single.document.pop(mpid_orig),
                                  [mpid]))
                if general_mpfile is not None:
                    mpfile_single.insert_general_section(general_mpfile)
                yield mpfile_single
            except KeyError:
                break

    def get_identifiers(self):
        """list of materials/composition identifiers as tuples w/ contribution IDs"""
        return [(k, self.document[k].get("cid", None)) for k in self.document
                if k.lower() != mp_level01_titles[0]]

    def pop_first_section(self):
        item = self.document.popitem(last=False)
        return self.from_dict(RecursiveDict([item]))

    def insert_general_section(self, general_mpfile):
        """insert general section from `general_mpfile` into this MPFile"""
        if general_mpfile is None:
            return
        general_title = mp_level01_titles[0]
        general_data = general_mpfile.document[general_title]
        root_key = list(self.document.keys())[0]
        for key, value in general_data.items():
            if key in self.document[root_key]:
                self.document.rec_update(nest_dict(value, [root_key, key]))
            else:
                self.document[root_key][key] = value
        for key in reversed(general_data.keys()):
            self.document[root_key].move_to_end(key, last=False)

    def get_unique_mp_cat_id(self, mp_cat_id):
        if not self.unique_mp_cat_ids or mp_cat_id in mp_level01_titles:
            return mp_cat_id
        mp_cat_id_idx = len([i for i in self.ids if i.startswith(mp_cat_id)])
        if mp_cat_id_idx == 0:
            return mp_cat_id
        return "{}--{}".format(mp_cat_id, mp_cat_id_idx)

    def concat(self, mpfile):
        """concatenate single-section MPFile with this MPFile"""
        try:
            if len(mpfile.document) > 1:
                raise ValueError(
                    "concatenation only possible with single section files")
        except AttributeError:
            raise ValueError("Provide a MPFile to concatenate")
        mp_cat_id = list(mpfile.document.keys())[0]
        general_title = mp_level01_titles[0]
        if general_title in mpfile.document[mp_cat_id]:
            general_data = mpfile.document[mp_cat_id].pop(general_title)
            if general_title not in self.document:
                self.document.rec_update(
                    nest_dict(general_data, [general_title]))
        self.document.rec_update(
            nest_dict(mpfile.document.pop(mp_cat_id),
                      [self.get_unique_mp_cat_id(mp_cat_id)]))

    def insert_top(self, mp_cat_id, key, value):
        """insert value for `mp_cat_id` as `key: <value>` at top"""
        self.document[mp_cat_id][key] = str(value)
        self.document[mp_cat_id].move_to_end(key, last=False)

    def add_data_table(self, identifier, dataframe, name, plot_options=None):
        """add a datatable to the root-level section

        Args:
            identifier (str): MP category ID (`mp_cat_id`)
            dataframe (pandas.DataFrame): tabular data as Pandas DataFrame
            name (str): table name, optional if only one table in section
            plot_options (dict): options for according plotly graph
        """
        # TODO: optional table name, required if multiple tables per root-level section
        name = "".join([replacements.get(c, c) for c in name])
        self.document.rec_update(
            nest_dict(Table(dataframe).to_dict(), [identifier, name]))
        self.document[identifier].insert_default_plot_options(
            dataframe, name, update_plot_options=plot_options)

    def add_hierarchical_data(self, dct, identifier=mp_level01_titles[0]):
        if len(self.ids) >= self.max_contribs:
            raise StopIteration(
                "Reached max. number of contributions in MPFile")
        self.document.rec_update(nest_dict(RecursiveDict(dct), [identifier]))

    def add_structure(self, source, name=None, identifier=None, fmt=None):
        """add a structure to the mpfile"""
        from pymatgen import Structure, MPRester

        if isinstance(source, Structure):
            structure = source
        elif isinstance(source, dict):
            structure = Structure.from_dict(source)
        elif os.path.exists(source):
            structure = Structure.from_file(source, sort=True)
        elif isinstance(source, six.string_types):
            if fmt is None:
                raise ValueError("Need fmt to get structure from string!")
            structure = Structure.from_str(source, fmt, sort=True)
        else:
            raise ValueError(source, "not supported!")

        if name is not None:
            if not isinstance(name, six.string_types):
                raise ValueError("structure name needs to be a string")
            elif "." in name:
                raise ValueError("structure name cannot contain dots (.)")

        mpr = MPRester()
        if not mpr.api_key:
            raise ValueError(
                "API key not set. Run `pmg config --add PMG_MAPI_KEY <USER_API_KEY>`."
            )
        matched_mpids = mpr.find_structure(structure)
        formula = get_composition_from_string(structure.composition.formula)
        if not matched_mpids:
            if identifier is None:
                identifier = formula
                print(
                    "Structure not found in MP! Please submit via MPComplete to "
                    "obtain mp-id or manually choose an anchor mp-id! Continuing "
                    "with {} as identifier!".format(identifier))
            else:
                print("Structure not found in MP! Forcing {} as identifier!".
                      format(identifier))
        elif identifier is None:
            identifier = matched_mpids[0]
            if len(matched_mpids) > 1:
                print("Multiple matching structures found in MP. Using",
                      identifier)
        elif identifier not in matched_mpids:
            msg = "Structure does not match {} but instead {}!".format(
                identifier, matched_mpids)
            raise ValueError(msg)

        idx = len(
            self.document.get(identifier, {}).get(mp_level01_titles[3], {}))
        sub_key = formula if name is None else name
        if sub_key in self.document.get(identifier,
                                        {}).get(mp_level01_titles[3], {}):
            sub_key += "_{}".format(idx)
        self.document.rec_update(
            nest_dict(structure.as_dict(),
                      [identifier, mp_level01_titles[3], sub_key]))
        return identifier

    def __repr__(self):
        return self.get_string(df_head_only=True)

    def __str__(self):
        return self.get_string(df_head_only=True)

    def _ipython_display_(self):
        from IPython.display import display_html

        display_html(self.hdata)
        display_html(self.tdata)
        display_html(self.gdata)
        display_html(self.sdata)

    # ----------------------------
    # Override these in subclasses
    # ----------------------------

    @staticmethod
    def from_string(data):
        """Reads a MPFile from a string containing contribution data."""
        return MPFileCore()

    def get_string(self, df_head_only=False):
        """Returns a string to be written as a file"""
        return repr(self.document)
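
A short usage sketch for a concrete MPFileCore subclass such as the archieml-based MPFile from Example #1 (file names are hypothetical):

mpf = MPFile.from_file('mpfile_in.txt')  # parse contributions from disk
print(mpf.ids)                           # root-level identifiers, e.g. ['mp-1234']
mpf.write_file('mpfile_out.txt')         # serialize before splitting ...
for single in mpf.split():               # ... since split() consumes the document
    print(single.get_identifiers())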
Example #5
    def from_string(data):
        # use archieml-python parse to import data
        rdct = RecursiveDict(loads(data))
        rdct.rec_update()

        # post-process internal representation of file contents
        for key in list(rdct.keys()):
            is_general, root_key = normalize_root_level(key)

            if is_general:
                # make part of shared (meta-)data, i.e. nest under `general` at
                # the beginning of the MPFile
                if mp_level01_titles[0] not in rdct:
                    rdct[mp_level01_titles[0]] = RecursiveDict()
                    rdct.move_to_end(mp_level01_titles[0], last=False)

            # normalize identifier key (pop & insert)
            # using rec_update since we're looping over all entries
            # also: support data in bare tables (marked-up only by
            #       root-level identifier) by nesting under 'data'
            value = rdct.pop(key)
            keys = [mp_level01_titles[0]] if is_general else []
            keys.append(root_key)
            if isinstance(value, list):
                keys.append("table")
            rdct.rec_update(nest_dict(value, keys))

            # reference to section to iterate or parse as CIF
            section = (rdct[mp_level01_titles[0]][root_key]
                       if is_general else rdct[root_key])

            # iterate to find CSV sections to parse
            # also parse propnet quantities
            if isinstance(section, dict):
                scope = []
                for k, v in section.iterate():
                    level, key = k
                    key = "".join([replacements.get(c, c) for c in key])
                    level_reduction = bool(level < len(scope))
                    if level_reduction:
                        del scope[level:]
                    if v is None:
                        scope.append(key)
                    elif isinstance(v, list) and v and isinstance(v[0], dict):
                        table = ""
                        for row_dct in v:
                            table = "\n".join([table, row_dct["value"]])
                        pd_obj = read_csv(table)
                        d = nest_dict(pd_obj.to_dict(), scope + [key])
                        section.rec_update(d, overwrite=True)
                        if not is_general and level == 0:
                            section.insert_default_plot_options(pd_obj, key)
                    elif (Quantity is not None
                          and isinstance(v, six.string_types) and " " in v):
                        quantity = Quantity.from_key_value(key, v)
                        # TODO quantity.symbol.name
                        d = nest_dict(quantity.as_dict(), scope + [key])
                        section.rec_update(d, overwrite=True)

            # convert CIF strings into pymatgen structures
            if mp_level01_titles[3] in section:
                from pymatgen.io.cif import CifParser

                # copy the keys: entries are popped and re-inserted in the loop
                for name in list(section[mp_level01_titles[3]].keys()):
                    cif = section[mp_level01_titles[3]].pop(name)
                    parser = CifParser.from_string(cif)
                    structure = parser.get_structures(primitive=False)[0]
                    section[mp_level01_titles[3]].rec_update(
                        nest_dict(structure.as_dict(), [name]))

        return MPFile.from_dict(rdct)
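
The (level, key) bookkeeping above reconstructs each entry's nesting path from a flattened traversal. A self-contained sketch with made-up tuples, assuming iterate() yields ((level, key), value) pairs in document order:

scope = []
items = [((0, 'a'), None), ((1, 'b'), 'x'), ((0, 'c'), 'y')]
for (level, key), value in items:
    if level < len(scope):  # moved back up: drop deeper scope entries
        del scope[level:]
    if value is None:       # a subsection header opens a new scope level
        scope.append(key)
    else:                   # a leaf: its full path is scope + [key]
        print(scope + [key], '->', value)
# prints ['a', 'b'] -> x, then ['c'] -> y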
Example #7
def run(mpfile, nmax=None, dup_check_test_site=True):

    existing_mpids = {}
    for b in [False, True]:
        with DibbsRester(test_site=b) as mpr:
            for doc in mpr.query_contributions(criteria=mpr.dibbs_query):
                existing_mpids[doc['mp_cat_id']] = doc['_id']
        if not dup_check_test_site:
            break

    general = mpfile.document[mp_level01_titles[0]]
    input_file = general.pop('input_file')
    df = read_excel(input_file)
    columns_map = RecursiveDict([
        (v, k) for k, v in general.pop('columns_map').items()
    ])
    columns = list(columns_map.keys())  # list() so the columns can be indexed below
    df = df[columns]
    df = df[notnull(df[columns[-1]]) & notnull(df[columns[1]])]
    mpfile.add_hierarchical_data({'title': 'DIBBS - 27Al NMR'})

    count, skipped, update = 0, 0, 0
    for idx, row in df.iterrows():
        url = row[columns[-1]]
        if not url.startswith('http'):
            continue

        # hierarchical data
        d = RecursiveDict()
        for col in columns[:4]:
            d[columns_map[col]] = unidecode(row[col]) \
                    if isinstance(row[col], six.string_types) else row[col]

        if d['name'] in [
                'AlPO4 Tridymite (AlPO4-t)', 'Basic aluminum sulfate',
                'Cryolite', 'berlinite(AlPO4-Q)'
        ]:
            continue

        d['data'] = RecursiveDict()
        for col in columns[4:8]:
            if notnull(row[col]):
                value = '{}'.format(row[col])
                if col == columns[4]:
                    value += ' ppm'
                elif col == columns[6]:
                    value += ' MHz'
                elif col == columns[7]:
                    value = ' '.join([value[:-1], value[-1]])
            else:
                value = ''
            d['data'][columns_map[col]] = value

        # structure
        if url.startswith('https://materialsproject.org'):
            mpid = url.split('/')[-2]
        else:
            #print 'retrieve cif and match to MP structure ...'
            d[columns_map[columns[-1]]] = url
            f = requests.get(url)

            try:
                mpid = mpfile.add_structure(f.text, name=d['name'], fmt='cif')
            except ValueError as ex:
                print(d['name'], str(ex))
                continue

            if nmax is not None and mpid in existing_mpids:
                item = mpfile.document.popitem(last=True)
                print('removed duplicate', mpid)

        if nmax is not None and mpid in existing_mpids:
            print('skipping', mpid)
            skipped += 1
            continue  # skip duplicates

        mpfile.add_hierarchical_data(d, identifier=mpid)
        print('added {} ({})'.format(d['name'], mpid))

        if mpid in existing_mpids:
            cid = existing_mpids[mpid]
            mpfile.insert_id(mpid, cid)
            update += 1
        if nmax is not None and count >= nmax - 1:
            break
        count += 1

    print(len(mpfile.ids), 'mp-ids to submit.')
    if nmax is None and update > 0:
        print(update, 'mp-ids to update.')
    if nmax is not None and skipped > 0:
        print(skipped, 'duplicates to skip.')
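
These run() loaders are presumably invoked with a pre-parsed MPFile whose general section supplies input_file and columns_map. A hypothetical driver:

mpf = MPFile.from_file('dibbs_in.txt')  # general section provides input_file/columns_map
run(mpf, nmax=5)                        # ingest at most 5 new contributions
mpf.write_file('dibbs_out.txt')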
Example #8
def run(mpfile, **kwargs):
    from pymatgen import Structure

    reference_project = None
    input_data, input_keys, extra = RecursiveDict(), RecursiveDict(), RecursiveDict()
    input_urls = mpfile.document['_hdata'].pop('input_urls')

    for project in input_urls:
        input_url = input_urls[project]['file']
        if '{}' in input_url:
            input_url = input_url.format('2d')  # TODO 3d for Jarvis

        dbfile = os.path.join(os.environ['HOME'], 'work',
                              input_url.rsplit('/')[-1])
        if not os.path.exists(dbfile):
            print('downloading', dbfile, '...')
            urllib.request.urlretrieve(input_url, dbfile)

        ext = os.path.splitext(dbfile)[1]
        is_nus = bool(ext == '.gz')
        id_key = 'parent_id' if is_nus else 'mpid'
        if not is_nus:
            with tarfile.open(dbfile, "r:gz") as tar:
                member = tar.getmembers()[0]
                raw_data = json.load(tar.extractfile(member), cls=MontyDecoder)
        else:
            reference_project = project
            raw_data = []
            with gzip.open(dbfile, 'rb') as f:
                for line in f:
                    raw_data.append(json.loads(line, cls=MontyDecoder))
        input_data[project] = RecursiveDict((d[id_key], d) for d in raw_data)

        input_keys[project] = [
            'material_id', 'exfoliation_energy_per_atom', 'structure'
        ] if is_nus else ['jid', 'exfoliation_en', 'final_str']
        extra[project] = [
            ('fin_en', ('E', 'meV/atom')),
            ('op_gap', ('ΔE|optB88vdW', 'meV/atom')),
            ('mbj_gap', ('ΔE|mbj', 'meV/atom')),
            #('kv', ('Kᵥ', 'GPa')),
            #('gv', ('Gᵥ', 'GPa'))
        ] if not is_nus else []

        print(len(input_data[project]), 'materials loaded for', project)

    projects = input_data.keys()
    identifiers = []
    for d in input_data.values():
        identifiers += list(d.keys())

    for identifier in identifiers:
        data, structures = RecursiveDict(), RecursiveDict()

        for project in projects:
            if project not in data:
                data[project] = RecursiveDict()
            if identifier in input_data[project]:
                d = input_data[project][identifier]
                structures[project] = d[input_keys[project][-1]]
                if data.get('formula') is None:
                    data['formula'] = get_composition_from_string(
                        structures[project].composition.reduced_formula)
                data[project]['id'] = input_urls[project]['detail'].format(
                    d[input_keys[project][0]])
                Ex = d[input_keys[project][1]]
                if project == reference_project:
                    Ex *= 1000.
                data[project]['Eₓ'] = clean_value(Ex, 'eV')
                for k, (sym, unit) in extra[project]:
                    if d[k] != 'na':
                        data[project][sym] = clean_value(d[k], unit)

        mpfile.add_hierarchical_data(nest_dict(data, ['data']),
                                     identifier=identifier)
        for project, structure in structures.items():
            name = '{}_{}'.format(data['formula'], project)
            try:
                mpfile.add_structure(structure,
                                     name=name,
                                     identifier=identifier)
            except Exception as ex:
                print(str(ex))