Example #1
    def test_to_from_dict(self):
        # no complicated objects in the 'data' or 'nodes' field
        a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                        ['remark1'], {"_my_data": "string"},
                        [self.valid_node, self.valid_node2])
        b = StructureNL.from_dict(a.as_dict())
        self.assertEqual(a, b)
        # complicated objects in the 'data' and 'nodes' field
        complicated_node = {
            "name": "complicated node",
            "url": "www.complicatednodegoeshere.com",
            "description": {
                "structure": self.s2
            }
        }
        a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                        ['remark1'], {"_my_data": {
                            "structure": self.s2
                        }}, [complicated_node, self.valid_node])
        b = StructureNL.from_dict(a.as_dict())
        self.assertEqual(
            a, b, 'to/from dict is broken when object embedding is '
            'used! Apparently MontyEncoding is broken...')

        # Test molecule
        molnl = StructureNL(self.mol, self.hulk, references=self.pmg)
        b = StructureNL.from_dict(molnl.as_dict())
        self.assertEqual(molnl, b)
Example #2
    def test_to_from_dict(self):
        # no complicated objects in the 'data' or 'nodes' field
        a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                        ['remark1'], {"_my_data": "string"},
                        [self.valid_node, self.valid_node2])
        b = StructureNL.from_dict(a.as_dict())
        self.assertEqual(a, b)
        # complicated objects in the 'data' and 'nodes' field
        complicated_node = {"name": "complicated node",
                            "url": "www.complicatednodegoeshere.com",
                            "description": {"structure": self.s2}}
        a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                        ['remark1'], {"_my_data": {"structure": self.s2}},
                        [complicated_node, self.valid_node])
        b = StructureNL.from_dict(a.as_dict())
        self.assertEqual(a, b,
                         'to/from dict is broken when object embedding is '
                         'used! Apparently MontyEncoding is broken...')

        # Test molecule
        molnl = StructureNL(self.mol, self.hulk, references=self.pmg)
        b = StructureNL.from_dict(molnl.as_dict())
        self.assertEqual(molnl, b)
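The fixtures above (self.s, self.s2, self.mol, self.hulk, self.pmg, self.valid_node) come from the test's setUp and are not shown. Below is a minimal, self-contained sketch of the same as_dict()/from_dict() round trip, assuming a recent pymatgen where StructureNL lives in pymatgen.util.provenance; the Si structure, author, and history node are made up for illustration.

from pymatgen.core import Lattice, Structure
from pymatgen.util.provenance import StructureNL

struct = Structure(Lattice.cubic(5.43), ["Si", "Si"],
                   [[0, 0, 0], [0.25, 0.25, 0.25]])
# History/node entries are dicts with exactly the keys name, url, description
node = {"name": "example node",
        "url": "http://example.com",
        "description": {"note": "illustrative entry"}}
snl = StructureNL(struct, "Jane Doe <jane@example.com>",
                  projects=["test_project"], remarks=["remark1"],
                  data={"_my_data": "string"},  # data keys must start with "_"
                  history=[node])

roundtrip = StructureNL.from_dict(snl.as_dict())
assert roundtrip == snl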
Example #3
File: calc.py  Project: utf/emmet
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag

    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(
            f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id",
                                                     {"tags": tag})

    # TODO add archive of StructureNL files
    input_structures, source_total = [], None
    if ext == "bson.gz":
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = set([
                specie["element"] for site in doc["structure"]["sites"]
                for specie in site["species"]
            ])
            for l in SETTINGS.skip_labels:
                if l in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {l}!',
                        extra={
                            "tags": [tag],
                            "source_id": doc["db_id"]
                        },
                    )
                    break
            else:
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned).")

    save_logs(ctx)
    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        struct = (istruct.final_structure if isinstance(
            istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)

        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            for canonical_structure in canonical_structures[full_name][
                    formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection

            break  # duplicate found
        else:
            # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                index = max(
                    [int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids])

            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)

            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
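The nested duplicate check in prep() relies on Python's for/else: the inner else runs only when no canonical structure matched, and the outer else runs only when no collection reported a duplicate. A generic sketch of that control flow (plain Python, not emmet code):

def find_duplicate(struct, collections, matches):
    # collections: dict mapping collection name -> list of candidate structures
    for name, candidates in collections.items():
        for candidate in candidates:
            if matches(struct, candidate):
                print(f"duplicate found in {name}")
                break          # duplicate in this collection
        else:
            continue           # no duplicate here -> try the next collection
        break                  # propagate the inner break: stop checking
    else:
        print("no duplicate in any collection -> safe to create a new SNL")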
Example #4
# Fragment from a larger export script: row_i (a dataframe row carrying an ASE
# Atoms object), stoich, energy_order_prefix, unique_id, authors, remarks, and
# date are all defined earlier in that script.
import os
import json

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.util.provenance import StructureNL  # import path assumed; older pymatgen versions differ

filename = stoich + "_" + energy_order_prefix + "_" + unique_id + ".json"

# Convert the ASE Atoms object into a pymatgen Structure
atoms = row_i.atoms
struct = AseAtomsAdaptor().get_structure(atoms)

extra_data = {
    "_MPContribs_Internal_ID": unique_id,
}

struct_NL = StructureNL(
    struct,
    authors,
    projects=None,
    references="",
    remarks=remarks,
    data=extra_data,
    # history=extra_data,
    created_at=date,
)

# Write the SNL to a JSON file under out_data/
path_i = os.path.join("out_data", filename)
with open(path_i, "w") as file:
    json.dump(struct_NL.as_dict(), file, indent=2)
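A hedged sketch (not part of the script above) of reading one of the exported files back into a StructureNL; the file name is hypothetical and the import path again assumes a recent pymatgen:

import json

from pymatgen.util.provenance import StructureNL

with open("out_data/example.json") as fh:
    snl = StructureNL.from_dict(json.load(fh))
print(snl.structure.composition.reduced_formula)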
Example #5
File: utils.py  Project: rkingsbury/emmet
def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):  # noqa: C901
    process = multiprocessing.current_process()
    name = process.name
    chunk_idx = int(name.rsplit("-")[1]) - 1
    logger.info(f"{name} starting.")
    tags = [tag, SETTINGS.year_tags[-1]]
    ctx = click.get_current_context()
    spec_or_dbfile = ctx.parent.parent.params["spec_or_dbfile"]
    target = calcdb_from_mgrant(spec_or_dbfile)
    snl_collection = target.db.snls_user
    sbxn = list(filter(None, target.collection.distinct("sbxn")))
    logger.info(f"Using sandboxes {sbxn}.")
    no_dupe_check = ctx.parent.parent.params["no_dupe_check"]
    run = ctx.parent.parent.params["run"]
    projection = {"tags": 1, "task_id": 1}
    count = 0
    drone = VaspDrone(
        additional_fields={"tags": tags},
        store_volumetric_data=ctx.params["store_volumetric_data"],
    )

    for vaspdir in vaspdirs:
        logger.info(f"{name} VaspDir: {vaspdir}")
        launcher = get_subdir(vaspdir)
        query = {"dir_name": {"$regex": launcher}}
        docs = list(
            target.collection.find(query,
                                   projection).sort([("_id", -1)]).limit(1))

        if docs:
            if no_dupe_check:
                logger.warning(f"FORCING re-parse of {launcher}!")
            else:
                if run:
                    shutil.rmtree(vaspdir)
                    logger.warning(
                        f"{name} {launcher} already parsed -> removed.")
                else:
                    logger.warning(
                        f"{name} {launcher} already parsed -> would remove.")
                continue

        try:
            task_doc = drone.assimilate(vaspdir)
        except Exception as ex:
            logger.error(f"Failed to assimilate {vaspdir}: {ex}")
            continue

        task_doc["sbxn"] = sbxn
        manual_taskid = isinstance(task_ids, dict)
        snl_metas_avail = isinstance(snl_metas, dict)
        task_id = (task_ids[launcher] if manual_taskid
                   else task_ids[chunk_idx][count])
        task_doc["task_id"] = task_id
        logger.info(f"Using {task_id} for {launcher}.")

        if docs:
            # make sure that task gets the same tags as the previously parsed task
            # (run through set to implicitly remove duplicate tags)
            if docs[0]["tags"]:
                existing_tags = list(set(docs[0]["tags"]))
                task_doc["tags"] += existing_tags
                logger.info(f"Adding existing tags {existing_tags} to {tags}.")

        snl_dct = None
        if snl_metas_avail:
            snl_meta = snl_metas.get(launcher)
            if snl_meta:
                references = snl_meta.get("references")
                authors = snl_meta.get(
                    "authors",
                    ["Materials Project <*****@*****.**>"])
                kwargs = {"projects": [tag]}
                if references:
                    kwargs["references"] = references

                struct = Structure.from_dict(task_doc["input"]["structure"])
                snl = StructureNL(struct, authors, **kwargs)
                snl_dct = snl.as_dict()
                snl_dct.update(get_meta_from_structure(struct))
                snl_id = snl_meta["snl_id"]
                snl_dct["snl_id"] = snl_id
                logger.info(f"Created SNL object for {snl_id}.")

        if run:
            if task_doc["state"] == "successful":
                if docs and no_dupe_check:
                    target.collection.remove({"task_id": task_id})
                    logger.warning(
                        f"Removed previously parsed task {task_id}!")

                try:
                    target.insert_task(task_doc, use_gridfs=True)
                except DocumentTooLarge:
                    output = dotty(task_doc["calcs_reversed"][0]["output"])
                    pop_keys = [
                        "normalmode_eigenvecs",
                        "force_constants",
                        "outcar.onsite_density_matrices",
                    ]

                    for k in pop_keys:
                        if k not in output:
                            continue

                        logger.warning(f"{name} Remove {k} and retry ...")
                        output.pop(k)
                        try:
                            target.insert_task(task_doc, use_gridfs=True)
                            break
                        except DocumentTooLarge:
                            continue
                    else:
                        logger.warning(
                            f"{name} failed to reduce document size")
                        continue

                if target.collection.count(query):
                    if snl_dct:
                        result = snl_collection.insert_one(snl_dct)
                        logger.info(
                            f"SNL {result.inserted_id} inserted into {snl_collection.full_name}."
                        )

                    shutil.rmtree(vaspdir)
                    logger.info(
                        f"{name} Successfully parsed and removed {launcher}.")
                    count += 1
        else:
            count += 1

    return count
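The DocumentTooLarge handling above pops known-oversized keys from the nested output (via dotty_dict's dotted-key access) and retries the insert. An isolated sketch of that shrink-and-retry pattern, assuming pymongo and dotty_dict as in the snippet; insert_task is represented by a generic callable here:

from dotty_dict import dotty
from pymongo.errors import DocumentTooLarge

def insert_with_shrink(insert, task_doc, pop_keys):
    """Try to insert task_doc; on DocumentTooLarge, drop heavy keys and retry."""
    try:
        insert(task_doc)
        return True
    except DocumentTooLarge:
        # dotty() wraps the nested dict, so pops below modify task_doc in place
        output = dotty(task_doc["calcs_reversed"][0]["output"])
        for key in pop_keys:
            if key not in output:
                continue
            output.pop(key)
            try:
                insert(task_doc)
                return True
            except DocumentTooLarge:
                continue
    return False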
Example #6
File: icsd_to_mongo.py  Project: kmu/emmet
    def assimilate(self,
                   path,
                   dbhost='localhost',
                   dbport=27017,
                   dbname='ICSD',
                   collection_name='ICSD_files',
                   store_mongo=True):
        """
        Assimilate data in a directory path into a pymatgen object. Because of
        the quirky nature of Python"s multiprocessing, the object must support
        pymatgen's as_dict() for parallel processing.
        Args:
            path: directory path
        Returns:
            An assimilated object
        """
        if store_mongo:
            client = MongoClient(dbhost, dbport)
            db = client[dbname]
            col = db[collection_name]

        data = {}

        files = os.listdir(path)
        file_ID = path.split('/')[-1]
        print(file_ID)
        data['icsd_id'] = int(file_ID)

        #data['cifwarnings'] = []
        cif_path = os.path.join(path, file_ID + '.cif')

        # capture any warnings generated by parsing cif file
        with warnings.catch_warnings(record=True) as w:
            cif_parser = CifParser(cif_path)
            for warn in w:
                if 'cifwarnings' in data:
                    data['cifwarnings'].append(str(warn.message))
                else:
                    data['cifwarnings'] = [str(warn.message)]
                logger.warning('{}: {}'.format(file_ID, warn.message))

        cif_dict = cif_parser.as_dict()
        orig_id = list(cif_dict.keys())[0]
        easy_dict = cif_dict[orig_id]

        if '_chemical_name_mineral' in easy_dict:
            data['min_name'] = easy_dict['_chemical_name_mineral']
        if '_chemical_name_systematic' in easy_dict:
            data['chem_name'] = easy_dict['_chemical_name_systematic']
        if '_cell_measurement_pressure' in easy_dict:
            data['pressure'] = float(
                easy_dict['_cell_measurement_pressure']) / 1000
        else:
            data['pressure'] = .101325

        with warnings.catch_warnings(record=True) as w:
            try:
                struc = cif_parser.get_structures()[0]
            except ValueError as err:
                # if cif parsing raises error, write icsd_id to Error_Record and do NOT add structure to mongo database
                logger.error(
                    file_ID + ': {}'.format(err) +
                    "\nDid not insert structure into Mongo Collection")
                with open('Error_Record', 'a') as err_rec:
                    err_rec.write(str(file_ID) + ': {}\n'.format(err))
            else:
                authors = 'Donny Winston<*****@*****.**>, Joseph Palakapilly<*****@*****.**>'
                references = self.bibtex_from_cif(cif_path)
                history = [{
                    'name': 'ICSD',
                    'url': 'https://icsd.fiz-karlsruhe.de/',
                    'description': {
                        'icsd_id': file_ID
                    }
                }]
                snl = StructureNL(struc,
                                  authors=authors,
                                  references=references,
                                  history=history)
                data['snl'] = snl.as_dict()

                meta = get_meta_from_structure(struc)
                data['nsites'] = meta['nsites']
                data['elements'] = meta['elements']
                data['nelements'] = meta['nelements']
                data['formula'] = meta['formula']
                data['formula_reduced'] = meta['formula_pretty']
                data['formula_reduced_abc'] = meta['formula_reduced_abc']
                data['formula_anonymous'] = meta['formula_anonymous']
                data['chemsys'] = meta['chemsys']
                data['is_valid'] = meta['is_valid']
                data['is_ordered'] = meta['is_ordered']

            # Unfortunately, any warnings are logged after any errors; not too big of an issue.
            for warn in w:
                if 'cifwarnings' in data:
                    data['cifwarnings'].append(str(warn.message))
                else:
                    data['cifwarnings'] = [str(warn.message)]
                logger.warning('{}: {}'.format(file_ID, warn.message))

        if 'snl' in data:
            if store_mongo:
                col.update_one({'icsd_id': int(file_ID)}, {'$set': data},
                               upsert=True)

        return data
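A generic sketch of the warning-capture pattern used in assimilate(): parse a CIF while recording parser warnings instead of letting them print. The file name below is hypothetical.

import warnings

from pymatgen.io.cif import CifParser

cifwarnings = []
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")  # record every warning, not just the first
    parser = CifParser("example.cif")
    structures = parser.get_structures()
    cifwarnings = [str(warn.message) for warn in w]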