def test_to_from_dict(self):
    # no complicated objects in the 'data' or 'nodes' field
    a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                    ['remark1'], {"_my_data": "string"},
                    [self.valid_node, self.valid_node2])
    b = StructureNL.from_dict(a.as_dict())
    self.assertEqual(a, b)

    # complicated objects in the 'data' and 'nodes' field
    complicated_node = {
        "name": "complicated node",
        "url": "www.complicatednodegoeshere.com",
        "description": {"structure": self.s2},
    }
    a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                    ['remark1'], {"_my_data": {"structure": self.s2}},
                    [complicated_node, self.valid_node])
    b = StructureNL.from_dict(a.as_dict())
    self.assertEqual(
        a, b,
        'to/from dict is broken when object embedding is '
        'used! Apparently MontyEncoding is broken...')

    # Test molecule
    molnl = StructureNL(self.mol, self.hulk, references=self.pmg)
    b = StructureNL.from_dict(molnl.as_dict())
    self.assertEqual(molnl, b)
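# A minimal standalone sketch of the round trip the test above exercises,
# assuming hypothetical fixture values (the test's self.s / self.hulk /
# self.pmg come from its setUp, which is not shown here). StructureNL
# serializes provenance alongside the structure, and from_dict(as_dict())
# should reproduce an equal object.
from pymatgen.core import Lattice, Structure
from pymatgen.util.provenance import StructureNL

structure = Structure(Lattice.cubic(4.2), ["Na", "Cl"],
                      [[0, 0, 0], [0.5, 0.5, 0.5]])  # hypothetical NaCl cell
snl = StructureNL(structure, "Jane Doe <jane@example.com>")  # assumed author string
assert StructureNL.from_dict(snl.as_dict()) == snl  # round trip preserves equality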
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag
    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id", {"tags": tag})

    # TODO add archive of StructureNL files
    input_structures, source_total = [], None
    if ext == "bson.gz":
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = set([
                specie["element"]
                for site in doc["structure"]["sites"]
                for specie in site["species"]
            ])
            for l in SETTINGS.skip_labels:
                if l in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {l}!',
                        extra={"tags": [tag], "source_id": doc["db_id"]},
                    )
                    break
            else:
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned)."
    )
    save_logs(ctx)

    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        struct = (istruct.final_structure
                  if isinstance(istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)
        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            for canonical_structure in canonical_structures[full_name][formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection
            break  # duplicate found
        else:  # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                index = max(int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids)

            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)

            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
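# The prep command above delegates to helpers (get_sg, structures_match,
# load_canonical_structures) defined elsewhere in emmet. As a rough sketch of
# the kind of comparison they perform, pymatgen's SpacegroupAnalyzer and
# StructureMatcher can be used directly; the tolerances below are
# illustrative assumptions, not emmet's actual settings.
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer

def sketch_get_sg(struct):
    # space group number used to bucket candidate duplicates by symmetry
    return SpacegroupAnalyzer(struct, symprec=0.1).get_space_group_number()

def sketch_structures_match(s1, s2):
    # primitive-cell reduction plus tolerance-based lattice/site comparison
    matcher = StructureMatcher(ltol=0.2, stol=0.3, angle_tol=5,
                               primitive_cell=True)
    return matcher.fit(s1, s2)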
# fragment from a script loop: row_i, stoich, energy_order_prefix, unique_id,
# authors, remarks, and date are defined by the surrounding code (not shown)
import json
import os

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.util.provenance import StructureNL

filename = stoich + "_" + energy_order_prefix + "_" + unique_id
filename += ".json"

atoms = row_i.atoms
struct = AseAtomsAdaptor().get_structure(atoms)

extra_data = {
    "_MPContribs_Internal_ID": unique_id,
}

struct_NL = StructureNL(
    struct,
    authors,
    projects=None,
    references="",
    remarks=remarks,
    data=extra_data,
    # history=extra_data,
    created_at=date,
)

path_i = os.path.join("out_data", filename)
with open(path_i, "w") as file:
    json.dump(
        struct_NL.as_dict(),
        file,
        indent=2,
    )
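# Reading one of the JSON files written above back into a StructureNL, a
# short sketch reusing the path_i assembled in the snippet. as_dict() output
# is plain JSON, so json.load plus from_dict restores the object.
with open(path_i) as file:
    snl_roundtrip = StructureNL.from_dict(json.load(file))
print(snl_roundtrip.structure.composition.reduced_formula)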
def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):  # noqa: C901
    process = multiprocessing.current_process()
    name = process.name
    chunk_idx = int(name.rsplit("-")[1]) - 1
    logger.info(f"{name} starting.")
    tags = [tag, SETTINGS.year_tags[-1]]
    ctx = click.get_current_context()
    spec_or_dbfile = ctx.parent.parent.params["spec_or_dbfile"]
    target = calcdb_from_mgrant(spec_or_dbfile)
    snl_collection = target.db.snls_user
    sbxn = list(filter(None, target.collection.distinct("sbxn")))
    logger.info(f"Using sandboxes {sbxn}.")
    no_dupe_check = ctx.parent.parent.params["no_dupe_check"]
    run = ctx.parent.parent.params["run"]
    projection = {"tags": 1, "task_id": 1}
    count = 0
    drone = VaspDrone(
        additional_fields={"tags": tags},
        store_volumetric_data=ctx.params["store_volumetric_data"],
    )

    for vaspdir in vaspdirs:
        logger.info(f"{name} VaspDir: {vaspdir}")
        launcher = get_subdir(vaspdir)
        query = {"dir_name": {"$regex": launcher}}
        docs = list(
            target.collection.find(query, projection).sort([("_id", -1)]).limit(1)
        )

        if docs:
            if no_dupe_check:
                logger.warning(f"FORCING re-parse of {launcher}!")
            else:
                if run:
                    shutil.rmtree(vaspdir)
                    logger.warning(f"{name} {launcher} already parsed -> removed.")
                else:
                    logger.warning(f"{name} {launcher} already parsed -> would remove.")
                continue

        try:
            task_doc = drone.assimilate(vaspdir)
        except Exception as ex:
            logger.error(f"Failed to assimilate {vaspdir}: {ex}")
            continue

        task_doc["sbxn"] = sbxn
        manual_taskid = isinstance(task_ids, dict)
        snl_metas_avail = isinstance(snl_metas, dict)
        task_id = task_ids[launcher] if manual_taskid else task_ids[chunk_idx][count]
        task_doc["task_id"] = task_id
        logger.info(f"Using {task_id} for {launcher}.")

        if docs:
            # make sure that task gets the same tags as the previously parsed task
            # (run through set to implicitly remove duplicate tags)
            if docs[0]["tags"]:
                existing_tags = list(set(docs[0]["tags"]))
                task_doc["tags"] += existing_tags
                logger.info(f"Adding existing tags {existing_tags} to {tags}.")

        snl_dct = None
        if snl_metas_avail:
            snl_meta = snl_metas.get(launcher)
            if snl_meta:
                references = snl_meta.get("references")
                authors = snl_meta.get(
                    "authors", ["Materials Project <*****@*****.**>"]
                )
                kwargs = {"projects": [tag]}
                if references:
                    kwargs["references"] = references

                struct = Structure.from_dict(task_doc["input"]["structure"])
                snl = StructureNL(struct, authors, **kwargs)
                snl_dct = snl.as_dict()
                snl_dct.update(get_meta_from_structure(struct))
                snl_id = snl_meta["snl_id"]
                snl_dct["snl_id"] = snl_id
                logger.info(f"Created SNL object for {snl_id}.")

        if run:
            if task_doc["state"] == "successful":
                if docs and no_dupe_check:
                    target.collection.remove({"task_id": task_id})
                    logger.warning(f"Removed previously parsed task {task_id}!")

                try:
                    target.insert_task(task_doc, use_gridfs=True)
                except DocumentTooLarge:
                    output = dotty(task_doc["calcs_reversed"][0]["output"])
                    pop_keys = [
                        "normalmode_eigenvecs",
                        "force_constants",
                        "outcar.onsite_density_matrices",
                    ]

                    for k in pop_keys:
                        if k not in output:
                            continue
                        logger.warning(f"{name} Remove {k} and retry ...")
                        output.pop(k)
                        try:
                            target.insert_task(task_doc, use_gridfs=True)
                            break
                        except DocumentTooLarge:
                            continue
                    else:
                        logger.warning(f"{name} failed to reduce document size")
                        continue

                if target.collection.count(query):
                    if snl_dct:
                        result = snl_collection.insert_one(snl_dct)
                        logger.info(
                            f"SNL {result.inserted_id} inserted into {snl_collection.full_name}."
                        )
                    shutil.rmtree(vaspdir)
                    logger.info(f"{name} Successfully parsed and removed {launcher}.")
                    count += 1
        else:
            count += 1

    return count
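# parse_vasp_dirs treats snl_metas as an optional dict keyed by launcher
# directory; a sketch of the shape the code above implies, with placeholder
# values (the launcher key and ids are hypothetical):
snl_metas_example = {
    "block_2020/launcher_0001": {                    # launcher key
        "snl_id": "snl-000001",                      # pre-assigned SNL id (required)
        "authors": ["Jane Doe <jane@example.com>"],  # optional, has a default
        "references": "",                            # optional BibTeX string
    }
}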
def assimilate(self, path, dbhost='localhost', dbport=27017, dbname='ICSD',
               collection_name='ICSD_files', store_mongo=True):
    """
    Assimilate data in a directory path into a pymatgen object.

    Because of the quirky nature of Python's multiprocessing, the object
    must support pymatgen's as_dict() for parallel processing.

    Args:
        path: directory path

    Returns:
        An assimilated object
    """
    if store_mongo:
        client = MongoClient(dbhost, dbport)
        db = client[dbname]
        col = db[collection_name]

    data = {}
    files = os.listdir(path)
    file_ID = path.split('/')[-1]
    print(file_ID)
    data['icsd_id'] = int(file_ID)
    # data['cifwarnings'] = []
    cif_path = os.path.join(path, file_ID + '.cif')

    # capture any warnings generated by parsing cif file
    with warnings.catch_warnings(record=True) as w:
        cif_parser = CifParser(cif_path)
        for warn in w:
            if 'cifwarnings' in data:
                data['cifwarnings'].append(str(warn.message))
            else:
                data['cifwarnings'] = [str(warn.message)]
            logger.warning('{}: {}'.format(file_ID, warn.message))

    cif_dict = cif_parser.as_dict()
    orig_id = list(cif_dict.keys())[0]
    easy_dict = cif_dict[orig_id]

    if '_chemical_name_mineral' in easy_dict:
        data['min_name'] = easy_dict['_chemical_name_mineral']
    if '_chemical_name_systematic' in easy_dict:
        data['chem_name'] = easy_dict['_chemical_name_systematic']
    if '_cell_measurement_pressure' in easy_dict:
        data['pressure'] = float(easy_dict['_cell_measurement_pressure']) / 1000
    else:
        data['pressure'] = 0.101325

    with warnings.catch_warnings(record=True) as w:
        try:
            struc = cif_parser.get_structures()[0]
        except ValueError as err:
            # if cif parsing raises error, write icsd_id to Error_Record
            # and do NOT add structure to mongo database
            logger.error(file_ID + ': {}'.format(err) +
                         "\nDid not insert structure into Mongo Collection")
            with open('Error_Record', 'a') as err_rec:
                err_rec.write(str(file_ID) + ': {}\n'.format(err))
        else:
            authors = 'Donny Winston<*****@*****.**>, Joseph Palakapilly<*****@*****.**>'
            references = self.bibtex_from_cif(cif_path)
            history = [{'name': 'ICSD',
                        'url': 'https://icsd.fiz-karlsruhe.de/',
                        'description': {'icsd_id': file_ID}}]
            snl = StructureNL(struc, authors=authors, references=references,
                              history=history)
            data['snl'] = snl.as_dict()

            meta = get_meta_from_structure(struc)
            data['nsites'] = meta['nsites']
            data['elements'] = meta['elements']
            data['nelements'] = meta['nelements']
            data['formula'] = meta['formula']
            data['formula_reduced'] = meta['formula_pretty']
            data['formula_reduced_abc'] = meta['formula_reduced_abc']
            data['formula_anonymous'] = meta['formula_anonymous']
            data['chemsys'] = meta['chemsys']
            data['is_valid'] = meta['is_valid']
            data['is_ordered'] = meta['is_ordered']

    # unfortunately any warnings are logged after any errors. Not too big of an issue
    for warn in w:
        if 'cifwarnings' in data:
            data['cifwarnings'].append(str(warn.message))
        else:
            data['cifwarnings'] = [str(warn.message)]
        logger.warning('{}: {}'.format(file_ID, warn.message))

    if 'snl' in data:
        if store_mongo:
            col.update_one({'icsd_id': int(file_ID)}, {'$set': data}, upsert=True)

    return data
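# A sketch of invoking the drone above on a single ICSD directory. The path
# layout (a folder named after the ICSD id containing <id>.cif) is implied by
# the code; the drone class name and directory are assumptions. With
# store_mongo=False no MongoDB connection is made and the record is returned
# as a plain dict.
drone = ICSDDrone()  # hypothetical class owning assimilate()
record = drone.assimilate("/data/icsd/100001", store_mongo=False)
print(record["formula_reduced"], record.get("cifwarnings", []))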