Example #1
    def test_from_structures(self):
        s1 = Structure([[5, 0, 0], [0, 5, 0], [0, 0, 5]], ["Fe"], [[0, 0, 0]])
        s2 = Structure([[5, 0, 0], [0, 5, 0], [0, 0, 5]], ["Mn"], [[0, 0, 0]])
        remarks = ["unittest"]
        authors="Test User <*****@*****.**>"
        snl_list = StructureNL.from_structures([s1, s2], authors, remarks=remarks)

        self.assertEqual(len(snl_list), 2)
        snl1 = snl_list[0]
        snl2 = snl_list[1]
        self.assertEqual(snl1.remarks, remarks)
        self.assertEqual(snl2.remarks, remarks)
        self.assertEqual(snl1.authors, [Author.parse_author(authors)])
        self.assertEqual(snl2.authors, [Author.parse_author(authors)])
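For reference, a minimal sketch of the imports this test assumes (module paths per current pymatgen; in much older releases StructureNL and Author lived under pymatgen.matproj.snl):

# Assumed imports for the test above.
from pymatgen.core.structure import Structure
from pymatgen.util.provenance import Author, StructureNL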
Example #2
def snls(structure):
    docs = [
        StructureNL(
            structure,
            authors=[Author("test{i}", "*****@*****.**").as_dict()],
            history=[HistoryNode("nothing", "url.com", {})],
            created_at=datetime.utcnow(),
        ).as_dict() for i in range(3)
    ]
    docs[0]["snl_id"] = "icsd-2"
    docs[1]["snl_id"] = "user-1"
    docs[2]["snl_id"] = "pf-3"

    return docs
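The snls function above builds three StructureNL documents with distinct snl_id prefixes (icsd, user, pf), presumably as a pytest fixture. A minimal sketch of the imports it relies on (the fixture decorator is an assumption based on how the function is written):

# Assumed imports for the fixture above.
from datetime import datetime
from pymatgen.util.provenance import Author, HistoryNode, StructureNL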
Example #3
File: calc.py Project: utf/emmet
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag

    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(
            f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id",
                                                     {"tags": tag})

    # TODO add archive of StructureNL files
    input_structures, source_total = [], None
    if ext == "bson.gz":
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = {
                specie["element"] for site in doc["structure"]["sites"]
                for specie in site["species"]
            }
            for label in SETTINGS.skip_labels:
                if label in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {label}!',
                        extra={
                            "tags": [tag],
                            "source_id": doc["db_id"]
                        },
                    )
                    break
            else:
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned).")

    save_logs(ctx)
    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        struct = (istruct.final_structure if isinstance(
            istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)

        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            for canonical_structure in canonical_structures[full_name][
                    formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection

            break  # duplicate found
        else:
            # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                index = max(
                    [int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids])

            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)

            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
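The get_format helper called above is not part of this excerpt. A hypothetical sketch of what it might do, guessing the pymatgen format string from the file name (the real emmet helper may differ):

def get_format(fname):
    # Hypothetical sketch: infer a pymatgen Structure format from the file name.
    if fname.endswith(".cif"):
        return "cif"
    if "POSCAR" in fname.upper() or "CONTCAR" in fname.upper():
        return "poscar"
    raise ValueError(f"Could not guess format for {fname}")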
Example #4
def process_mpfile(path_or_mpfile, target=None, fmt='archieml', ids=None):
    try:
        if isinstance(path_or_mpfile, six.string_types) and \
           not os.path.isfile(path_or_mpfile):
            raise Exception('{} not found'.format(path_or_mpfile))

        if ids is not None and (not isinstance(ids, list) or len(ids) != 2):
            raise Exception('{} is not a list of length 2!'.format(ids))

        from pymatgen.analysis.structure_matcher import StructureMatcher
        mod = import_module('mpcontribs.io.{}.mpfile'.format(fmt))
        MPFile = getattr(mod, 'MPFile')
        full_name = pwd.getpwuid(os.getuid())[4]
        contributor = '{} <*****@*****.**>'.format(full_name)  # fake
        cma = ContributionMongoAdapter()
        axes, ov_data = set(), dict()
        mpfile_out, cid_shorts = MPFile(), []  # output
        sm = StructureMatcher(primitive_cell=False, scale=False)

        # split input MPFile into contributions: treat every mp_cat_id as separate DB insert
        mpfile_in = path_or_mpfile
        if isinstance(path_or_mpfile, six.string_types) or isinstance(
                path_or_mpfile, StringIO):
            mpfile_in = MPFile.from_file(path_or_mpfile)
        for idx, mpfile_single in enumerate(mpfile_in.split()):

            mp_cat_id = mpfile_single.document.keys()[0]
            if ids is None or mp_cat_id == ids[0]:

                cid = mpfile_single.document[mp_cat_id].get('cid', None)
                update = cid is not None
                if update:
                    cid_short = get_short_object_id(cid)
                    yield 'use #{} to update #{} ... '.format(idx, cid_short)

                # always run local "submission" to catch failure before interacting with DB
                yield 'process #{} ({}) ... '.format(idx, mp_cat_id)
                doc = cma.submit_contribution(
                    mpfile_single, contributor)  # does not use get_string
                cid = doc['_id']
                cid_short = get_short_object_id(cid)
                if ids is None or cid_short == ids[1]:

                    yield 'check ... '
                    obj_size = asizeof.asizeof(mpfile_single) / 1024. / 1024.
                    if obj_size > 0.5:
                        yield 'skip ({:.3f}MB) ... '.format(obj_size)
                    else:
                        try:
                            mpfile_single_cmp_str = mpfile_single.get_string()
                        except Exception as ex:
                            yield 'get_string() FAILED!<br>'
                            continue
                        try:
                            mpfile_single_cmp = MPFile.from_string(
                                mpfile_single_cmp_str)
                        except Exception as ex:
                            yield 'from_string() FAILED!<br>'
                            continue
                        if mpfile_single.document != mpfile_single_cmp.document:
                            yield 'check again ... '
                            found_inconsistency = False
                            # check structural data
                            structures_ok = True
                            for name, s1 in mpfile_single.sdata[
                                    mp_cat_id].iteritems():
                                s2 = mpfile_single_cmp.sdata[mp_cat_id][name]
                                if s1 != s2:
                                    if len(s1) != len(s2):
                                        yield 'different number of sites: {} -> {}!<br>'.format(
                                            len(s1), len(s2))
                                        structures_ok = False
                                        break
                                    if s1.lattice != s2.lattice:
                                        yield 'lattices different!<br>'
                                        structures_ok = False
                                        break
                                    for site in s1:
                                        if site not in s2:
                                            found_inconsistency = True
                                            if not sm.fit(s1, s2):
                                                yield 'structures do not match!<br>'
                                                structures_ok = False
                                            break
                                        if not structures_ok:
                                            break
                            if not structures_ok:
                                continue
                            # check hierarchical and tabular data
                            # compare json strings to find first inconsistency
                            json_compare(mpfile_single.hdata,
                                         mpfile_single_cmp.hdata)
                            json_compare(mpfile_single.tdata,
                                         mpfile_single_cmp.tdata)
                            if not found_inconsistency:
                                # documents are not equal, but all components checked, skip contribution
                                # should not happen
                                yield 'inconsistency found but not identified!<br>'
                                continue

                    if target is not None:
                        yield 'submit ... '
                        cid = target.submit_contribution(
                            mpfile_single, fmt)  # uses get_string
                    mpfile_single.insert_id(mp_cat_id, cid)
                    cid_shorts.append(cid_short)

                    if target is not None:
                        if idx < 5:
                            yield 'build ... '
                            url = target.build_contribution(cid)
                            url = '/'.join([
                                target.preamble.rsplit('/', 1)[0], 'explorer',
                                url
                            ])
                            yield (
                                "OK. <a href='{}' class='btn btn-default btn-xs' "
                                + "role='button' target='_blank'>View</a></br>"
                            ).format(url)
                        else:
                            target.set_build_flag(cid, True)
                            yield 'OK (queued).</br>'
                    else:
                        if (ids is None and idx < 5) or ids is not None:
                            yield 'build ... '
                            mcb = MPContributionsBuilder(doc)
                            build_doc = mcb.build(contributor, cid)
                        else:
                            yield 'skip ... '
                            from pymatgen.util.provenance import Author
                            author = Author.parse_author(contributor)
                            build_doc = [mp_cat_id, author.name, cid_short, '']
                        yield build_doc

                        yield 'overview axes ... '
                        scope, local_axes = [], set()
                        mpfile_for_axes = MPFile.from_contribution(doc)
                        for k, v in mpfile_for_axes.hdata[mp_cat_id].iterate():
                            if v is None:
                                scope = scope[:k[0]]
                                scope.append(k[1])
                            else:
                                try:
                                    if k[0] == len(scope): scope.append(k[1])
                                    else: scope[-1] = k[1]
                                    vf = float(v)  # trigger exception
                                    scope_str = '.'.join(scope)
                                    if idx == 0:
                                        axes.add(scope_str)
                                        ov_data[scope_str] = {
                                            cid_short: (vf, mp_cat_id)
                                        }
                                    else:
                                        local_axes.add(scope_str)
                                        ov_data[scope_str][cid_short] = (
                                            vf, mp_cat_id)
                                except:
                                    pass
                        if idx > 0:
                            axes.intersection_update(local_axes)
                        yield 'OK.</br>'

                else:
                    yield 'wrong CID.</br>'

            mpfile_out.concat(mpfile_single)
            time.sleep(.01)

        ncontribs = len(cid_shorts)
        if target is not None:
            yield '<strong>{} contributions successfully submitted.</strong>'.format(
                ncontribs)
        else:
            for k in ov_data.keys():
                if k not in axes:
                    ov_data.pop(k)
            yield ov_data
            yield '<strong>{} contributions successfully processed.</strong>'.format(
                ncontribs)
    except:
        ex = sys.exc_info()[1]
        yield 'FAILED.</br>'
        yield str(ex).replace('"', "'")
        return
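process_mpfile is a generator that streams status messages (and, when no target is given, also yields build_doc and ov_data objects), so a caller drains it. A minimal sketch (the file name is hypothetical):

# Hypothetical driver for the generator above.
for msg in process_mpfile('my_contributions.txt'):
    print(msg)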
Example #5
    def build(self, contributor_email, cid, api_key=None, endpoint=None):
        """update materials/compositions collections with contributed data"""
        cid_short, cid_str = get_short_object_id(cid), str(cid)
        contrib = self.find_contribution(cid)
        if not contrib:
            raise Exception('Contribution {} not found!'.format(cid))
        if contributor_email not in contrib['collaborators']:
            raise ValueError(
                "Build stopped: building contribution {} not "
                "allowed due to insufficient permissions of {}! Ask "
                "someone of {} to make you a collaborator on {}.".format(
                    cid_short, contributor_email, contrib['collaborators'],
                    cid_short))
        from pymatgen.util.provenance import Author
        mpfile = MPFileCore.from_contribution(contrib)
        mp_cat_id = mpfile.ids[0]
        is_mp_id = mp_id_pattern.match(mp_cat_id)
        self.curr_coll = self.materials if is_mp_id else self.compositions
        author = Author.parse_author(contributor_email)
        project = str(author.name).translate(None, '.') \
                if 'project' not in contrib else contrib['project']

        nb = nbf.new_notebook()
        if isinstance(self.db, dict):
            contrib.pop('_id')
            if 'cid' in contrib['content']:
                contrib['content'].pop('cid')
            nb['cells'].append(
                nbf.new_code_cell(
                    "from mpcontribs.io.core.mpfile import MPFileCore\n"
                    "from mpcontribs.io.core.recdict import RecursiveDict\n"
                    "mpfile = MPFileCore.from_contribution({})\n"
                    "identifier = '{}'".format(contrib, mp_cat_id)))
        else:
            nb['cells'].append(
                nbf.new_code_cell(
                    "from mpcontribs.rest.rester import MPContribsRester"))
            os.environ['PMG_MAPI_KEY'] = api_key
            os.environ['PMG_MAPI_ENDPOINT'] = endpoint
            nb['cells'].append(
                nbf.new_code_cell(
                    "with MPContribsRester() as mpr:\n"
                    "    mpfile = mpr.find_contribution('{}')\n"
                    "    identifier = mpfile.ids[0]".format(cid)))
        nb['cells'].append(
            nbf.new_markdown_cell("## Contribution #{} for {}".format(
                cid_short, mp_cat_id)))
        nb['cells'].append(nbf.new_markdown_cell("### Hierarchical Data"))
        nb['cells'].append(nbf.new_code_cell("mpfile.hdata[identifier]"))
        if mpfile.tdata[mp_cat_id]:
            nb['cells'].append(nbf.new_markdown_cell("### Tabular Data"))
        for table_name, table in mpfile.tdata[mp_cat_id].iteritems():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(table_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.tdata[identifier]['{}']".format(table_name)))
        if mpfile.gdata[mp_cat_id]:
            nb['cells'].append(nbf.new_markdown_cell("### Graphical Data"))
        for plot_name, plot in mpfile.gdata[mp_cat_id].iteritems():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(plot_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.gdata[identifier]['{}']".format(plot_name)))

        if mpfile.sdata[mp_cat_id]:
            nb['cells'].append(nbf.new_markdown_cell("### Structural Data"))
        for structure_name, structure in mpfile.sdata[mp_cat_id].iteritems():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(structure_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.sdata[identifier]['{}']".format(structure_name)))

        self.ep.preprocess(nb, {'metadata': {'path': self.nbdir}})

        if isinstance(self.db, dict):
            return [mp_cat_id, project, cid_short, export_notebook(nb, cid)]
        else:
            build_doc = RecursiveDict()
            build_doc['mp_cat_id'] = mp_cat_id
            build_doc['project'] = project
            build_doc['nb'] = nb
            self.curr_coll.update({'_id': cid}, {'$set': build_doc},
                                  upsert=True)
            return '{}/{}'.format(  # return URL for contribution page
                ('materials' if is_mp_id else 'compositions'), cid_str)
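Depending on the backend, build returns either a [mp_cat_id, project, cid_short, notebook_html] list (dict-backed db) or the URL path of the contribution page (MongoDB-backed). A minimal sketch of the dict-backed call, mirroring the usage in Example #4:

# Hypothetical invocation, mirroring Example #4 (dict-backed adapter).
mcb = MPContributionsBuilder(doc)
mp_cat_id, project, cid_short, nb_html = mcb.build(contributor_email, cid)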