def test_from_structures(self):
    """StructureNL.from_structures yields one SNL per structure, sharing the metadata."""
    structures = [
        Structure([[5, 0, 0], [0, 5, 0], [0, 0, 5]], [element], [[0, 0, 0]])
        for element in ("Fe", "Mn")
    ]
    remarks = ["unittest"]
    authors = "Test User <*****@*****.**>"

    snl_list = StructureNL.from_structures(structures, authors, remarks=remarks)
    self.assertEqual(len(snl_list), 2)

    first, second = snl_list[0], snl_list[1]
    # Remarks are checked before authors, in the same order for both SNLs.
    for snl in (first, second):
        self.assertEqual(snl.remarks, remarks)
    for snl in (first, second):
        self.assertEqual(snl.authors, [Author.parse_author(authors)])
def test_to_from_dict(self):
    """Round-trip StructureNL objects through as_dict() / from_dict()."""

    def roundtrip(snl):
        # Serialise to a dict and rebuild a new object from it.
        return StructureNL.from_dict(snl.as_dict())

    # no complicated objects in the 'data' or 'nodes' field
    simple = StructureNL(
        self.s,
        self.hulk,
        ['test_project'],
        self.pmg,
        ['remark1'],
        {"_my_data": "string"},
        [self.valid_node, self.valid_node2],
    )
    restored = roundtrip(simple)
    self.assertEqual(simple, restored)

    # complicated objects in the 'data' and 'nodes' field
    complicated_node = {
        "name": "complicated node",
        "url": "www.complicatednodegoeshere.com",
        "description": {"structure": self.s2},
    }
    embedded = StructureNL(
        self.s,
        self.hulk,
        ['test_project'],
        self.pmg,
        ['remark1'],
        {"_my_data": {"structure": self.s2}},
        [complicated_node, self.valid_node],
    )
    restored = roundtrip(embedded)
    self.assertEqual(
        embedded,
        restored,
        'to/from dict is broken when object embedding is '
        'used! Apparently MontyEncoding is broken...',
    )

    # Test molecule
    molnl = StructureNL(self.mol, self.hulk, references=self.pmg)
    restored = roundtrip(molnl)
    self.assertEqual(molnl, restored)
def submit_structures(self, structures, authors, projects=None,
                      references='', remarks=None, data=None,
                      histories=None, created_at=None):
    """
    Submits a list of structures to the Materials Project as SNL files.
    The argument list mirrors the arguments for the StructureNL object,
    except that a list of structures with the same metadata is used as an
    input.

    .. note::

        As of now, this MP REST feature is open only to a select group of
        users. Opening up submissions to all users is being planned for
        the future.

    Args:
        structures: A list of Structure objects
        authors (list): List of {"name":'', "email":''} dicts,
            *list* of Strings as 'John Doe <*****@*****.**>',
            or a single String with commas separating authors
        projects ([str]): List of Strings ['Project A', 'Project B'].
            This applies to all structures.
        references (str): A String in BibTeX format. Again, this applies to
            all structures.
        remarks ([str]): List of Strings ['Remark A', 'Remark B']
        data ([dict]): A list of free form dict. Namespaced at the root
            level with an underscore, e.g. {"_materialsproject":<custom
            data>}. The length of data should be the same as the list of
            structures if not None.
        histories: List of list of dicts - [[{'name':'', 'url':'',
            'description':{}}], ...] The length of histories should be the
            same as the list of structures if not None.
        created_at (datetime): A datetime object

    Returns:
        Whatever ``self.submit_snl`` returns for the generated SNL list
        (documented here as a list of inserted submission ids).
    """
    # Imported locally, as in the original block.
    from pymatgen.util.provenance import StructureNL

    snl_list = StructureNL.from_structures(structures, authors, projects,
                                           references, remarks, data,
                                           histories, created_at)
    # The submission result used to be dropped, so callers always got None
    # despite the documented return value.
    return self.submit_snl(snl_list)
def test_snl(self):
    """TransformedStructure <-> StructureNL conversion keeps history and warns on data loss."""
    first_author = [("will", "*****@*****.**")]
    second_author = [("notwill", "*****@*****.**")]

    self.trans.set_parameter("author", "will")
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        snl = self.trans.to_snl(first_author)
        # other_parameters is non-empty, so exactly one warning is expected.
        self.assertEqual(
            len(caught),
            1,
            "Warning not raised on type conversion with other_parameters",
        )

    ts = TransformedStructure.from_snl(snl)
    last_step = ts.history[-1]
    self.assertEqual(last_step["@class"], "SubstitutionTransformation")

    # A history node must survive a from_snl -> to_snl round trip unchanged.
    h = ("testname", "testURL", {"test": "testing"})
    snl = StructureNL(ts.final_structure, [("will", "*****@*****.**")], history=[h])
    round_tripped = TransformedStructure.from_snl(snl)
    snl = round_tripped.to_snl(second_author)
    self.assertEqual(snl.history, [h])
    self.assertEqual(snl.authors, [("notwill", "*****@*****.**")])
def add_snl(mat, new_style_mat):
    """
    Attach SNL information and derived id/tag fields to a materials doc.

    Args:
        mat (dict): materials doc, modified in place; must hold "structure"
        new_style_mat (dict): doc that may carry an "snl" entry
    """
    provided_snl = new_style_mat.get("snl", None)
    mat["snl"] = copy.deepcopy(mat["structure"])
    if not provided_snl:
        # No SNL supplied: build a bare one from the structure and fill in
        # the default "about" fields.
        bare_snl = StructureNL(Structure.from_dict(mat["structure"]), [])
        mat["snl"] = bare_snl.as_dict()
        mat["snl"]["about"].update(mp_default_snl_fields)
    else:
        mat["snl"].update(provided_snl)

    snl_doc = mat["snl"]
    mat["snl_final"] = snl_doc
    mat["icsd_ids"] = list(map(int, get(snl_doc, "about._db_ids.icsd_ids", [])))
    mat["pf_ids"] = get(snl_doc, "about._db_ids.pf_ids", [])

    # Extract tags from remarks by looking for just nouns and adjectives:
    # a remark is kept only if none of its tokens is tagged ADV, ADP or VERB.
    rejected_pos = {"ADV", "ADP", "VERB"}
    kept_tags = []
    mat["exp"] = {"tags": kept_tags}
    for remark in snl_doc["about"].get("_tags", []):
        tagged = nltk.pos_tag(nltk.word_tokenize(remark), tagset="universal")
        pos_labels = {tok[1] for tok in tagged}
        if pos_labels.isdisjoint(rejected_pos):
            kept_tags.append(remark)
def to_snl(self, authors, **kwargs) -> StructureNL:
    """
    Generate SNL from TransformedStructure.

    The conversion does not modify ``self.history``: each history entry is
    shallow-copied before its "_snl" metadata is split off, so repeated
    calls give the same result.

    :param authors: List of authors
    :param **kwargs: All kwargs supported by StructureNL.
    :return: StructureNL
    """
    if self.other_parameters:
        warn("Data in TransformedStructure.other_parameters discarded during type conversion to SNL")
    hist = []
    for h in self.history:
        # Popping from `h` directly would strip the "_snl" provenance from
        # this object's own history, so work on a copy.
        description = dict(h)
        snl_metadata = description.pop("_snl", {})
        hist.append(
            {
                # .get, not .pop: snl_metadata is still the dict referenced
                # from self.history and must stay intact.
                "name": snl_metadata.get("name", "pymatgen"),
                "url": snl_metadata.get("url", "http://pypi.python.org/pypi/pymatgen"),
                "description": description,
            }
        )
    return StructureNL(self.final_structure, authors, history=hist, **kwargs)
def match(self, snls, mat):
    """
    Finds the SNLs that match the given materials doc.

    An SNL matches when its structure has the same space group symbol as,
    and fits (via StructureMatcher), the material's structure or one of
    its initial structures.

    Args:
        snls ([dict]): the snls list
        mat (dict): a materials doc with "structure" and
            "initial_structures" entries

    Yields:
        dict: every snl doc that matches, at most once each
    """

    def _spacegroup(structure):
        # The try-except is a temp fix to a spglib bug; -1 marks "unknown".
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        try:
            return structure.get_space_group_info()[0]
        except Exception:
            return -1

    sm = StructureMatcher(ltol=self.ltol, stol=self.stol, angle_tol=self.angle_tol,
                          primitive_cell=True, scale=True, attempt_supercell=False,
                          allow_subset=False, comparator=ElementComparator())

    m_strucs = [Structure.from_dict(mat["structure"])] + [
        Structure.from_dict(init_struc) for init_struc in mat["initial_structures"]
    ]
    # Space groups of the material structures do not depend on the SNL, so
    # compute each one lazily and only once instead of once per SNL.
    m_spacegroups = {}

    for snl in snls:
        snl_struc = StructureNL.from_dict(snl).structure
        snl_spacegroup = _spacegroup(snl_struc)
        for idx, struc in enumerate(m_strucs):
            if idx not in m_spacegroups:
                m_spacegroups[idx] = _spacegroup(struc)
            if m_spacegroups[idx] == snl_spacegroup and sm.fit(struc, snl_struc):
                yield snl
                break
def match(self, snl, mats):
    """
    Look for a materials doc whose structure fits the given SNL.

    Args:
        snl (dict): the snl doc
        mats ([dict]): the materials docs to match against

    Returns:
        The ``self.materials.key`` value of the first doc whose "structure"
        or "initial_structure" fits the SNL structure, or None if no doc fits.
    """
    matcher = StructureMatcher(
        ltol=self.ltol,
        stol=self.stol,
        angle_tol=self.angle_tol,
        primitive_cell=True,
        scale=True,
        attempt_supercell=False,
        allow_subset=False,
        comparator=ElementComparator(),
    )
    target = StructureNL.from_dict(snl).structure

    for doc in mats:
        final_struct = Structure.from_dict(doc["structure"])
        initial_struct = Structure.from_dict(doc["initial_structure"])
        fits = matcher.fit(final_struct, target) or matcher.fit(initial_struct, target)
        if not fits:
            continue
        return doc[self.materials.key]

    return None
def prep(ctx, archive, authors):
    """prep structures from an archive for submission"""
    # Overview: load structures from `archive` (bson.gz, .zip or gzipped tar),
    # skip disordered/invalid ones and duplicates of known canonical
    # structures, and build SNL dicts with fresh ids. The SNLs are only
    # handed to insert_snls() when the RUN flag is set.
    # NOTE(review): the docstring is left untouched because it presumably
    # doubles as CLI help text (ctx looks like a click context) - confirm.
    run = ctx.obj["RUN"]
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]
    skip = ctx.obj["SKIP"]
    # TODO no_dupe_check flag

    # Derive the tag and the (possibly double) extension from the file name,
    # e.g. "x.tar.gz" -> tag "x", ext "tar.gz" (no leading dot for double
    # extensions, which is why `exts` mixes dotted and undotted entries).
    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(
            f"{ext} not supported (yet)! Please use one of {exts}.")

    meta = {"authors": [Author.parse_author(a) for a in authors]}
    # `meta` only holds "authors" here, so this always evaluates to "".
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id", {"tags": tag})

    # TODO add archive of StructureNL files
    # Stage 1: read at most `nmax` structures from the archive.
    input_structures, source_total = [], None
    if ext == "bson.gz":
        # NOTE(review): this handle is never closed within this function.
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = set([
                specie["element"] for site in doc["structure"]["sites"]
                for specie in site["species"]
            ])
            for l in SETTINGS.skip_labels:
                if l in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {l}!',
                        extra={
                            "tags": [tag],
                            "source_id": doc["db_id"]
                        },
                    )
                    break
            else:
                # for/else: reached only when no skip label was found.
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        # NOTE(review): the ZipFile is never closed within this function.
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        # Remaining supported extensions ("tar.gz", ".tgz") are gzipped tars.
        # NOTE(review): the tar handle is never closed within this function.
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            # skip members whose base name starts with a dot
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            # the lower-cased member name serves as the source id
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned).")

    save_logs(ctx)
    # Stage 2: validate, duplicate-check and convert each structure to an SNL.
    # `index` is the numeric part of the last SNL id handed out; it is
    # initialised lazily from the database on first use.
    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            insert_snls(ctx, snls)

        struct = (istruct.final_structure if isinstance(
            istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)

        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            # `canonical_structures` is not defined in this function; it is
            # presumably a module-level cache filled by the call above.
            for canonical_structure in canonical_structures[full_name][
                    formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection

            break  # duplicate found
        else:
            # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                # NOTE(review): max() raises ValueError if there are no
                # existing snl_ids (empty collection).
                index = max(
                    [int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids])

            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)

            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
def _get_snls_from_resource(json, url, identifier) -> Dict[str, StructureNL]:
    """
    Convert the entries of a structures response into StructureNL objects.

    Each entry in ``json["data"]`` is first parsed from its "species"
    definitions; if anything fails it is retried using "species_at_sites".
    Entries that fail both ways are skipped and their error messages are
    logged once at the end.

    Returns:
        dict mapping entry id -> StructureNL for every entry that parsed
    """
    snls = {}
    exceptions = set()

    def _sanitize_symbol(symbol):
        # Map the special symbols onto DummySpecies; pass others through.
        if symbol == "vacancy":
            return DummySpecies("X_vacancy", oxidation_state=None)
        if symbol == "X":
            return DummySpecies("X", oxidation_state=None)
        return symbol

    def _get_comp(sp_dict):
        pairs = zip(sp_dict["chemical_symbols"], sp_dict["concentration"])
        return {_sanitize_symbol(symbol): conc for symbol, conc in pairs}

    def _species_from_definitions(attributes):
        # e.g. COD
        return [_get_comp(d) for d in attributes["species"]]

    def _species_at_sites(attributes):
        # e.g. MP (all ordered, no vacancies)
        return attributes["species_at_sites"]

    def _build_snl(data, species_of):
        # Shared by both parsing attempts; only the species source differs.
        structure = Structure(
            lattice=data["attributes"]["lattice_vectors"],
            species=species_of(data["attributes"]),
            coords=data["attributes"]["cartesian_site_positions"],
            coords_are_cartesian=True,
        )
        # Grab any custom fields or non-mandatory fields if they were requested
        mandatory = {"lattice_vectors", "species", "cartesian_site_positions"}
        namespaced_data = {
            k: v
            for k, v in data["attributes"].items()
            if k.startswith("_") or k not in mandatory
        }
        # TODO: follow `references` to add reference information here
        return StructureNL(
            structure,
            authors={},
            history=[{"name": identifier, "url": url, "description": {"id": data["id"]}}],
            data={"_optimade": namespaced_data},
        )

    for data in json["data"]:
        # TODO: check the spec! and remove this try/except (are all providers following spec?)
        # e.g. can check data["type"] == "structures"
        try:
            snls[data["id"]] = _build_snl(data, _species_from_definitions)
        # TODO: bare exception, remove...
        except Exception:
            try:
                snls[data["id"]] = _build_snl(data, _species_at_sites)
            except Exception as exc:
                # a set keeps each distinct message once
                exceptions.add(str(exc))

    if exceptions:
        _logger.error(f'Failed to parse returned data for {url}: {", ".join(exceptions)}')

    return snls
def submit_snl(n_clicks, structure, comments, url):
    """
    Submit the given structure to the Materials Project as an SNL.

    Raises PreventUpdate when there was no click or no token in the URL;
    otherwise returns a MessageContainer describing the outcome.
    """
    if not n_clicks:
        raise PreventUpdate

    token = parse_token(url)
    if not token:
        raise PreventUpdate

    structure = self.from_data(structure)
    if type(structure) != Structure:
        return MessageContainer(
            f"Can only submit structures to Materials Project, "
            f"not {type(structure)}",
            kind="warning",
        )

    if not MP_CLIENT_KEY:
        return MessageContainer(
            f"Submission to MPComplete is currently disabled, "
            f"please check back soon or contact @mkhorton.",
            kind="warning",
        )

    # check if structure already exists on MP
    with MPRester() as mpr:
        mpids = mpr.find_structure(structure)
    if mpids:
        return MessageContainer(
            f"Similar structures are already available on "
            f"the Materials Project, see: {', '.join(mpids)}",
            kind="warning",
        )

    remarks = [
        f"Generated by Crystal Toolkit {ct_version} and "
        f"submitted with MPComplete"
    ]
    if comments:
        remarks.append(comments)

    # The token response supplies the submitting user's identity and API key.
    contents = get_token_response(token)
    user_name = f"{contents['first_name']} {contents['last_name']}"
    user_email = contents["email"]
    user_api_key = contents["api_key"]
    author = {"name": user_name, "email": user_email}

    snl = StructureNL(structure, [author], remarks=remarks)

    with MPRester(
        user_api_key, endpoint="https://www.materialsproject.org/rest/v1"
    ) as mpr:
        try:
            submission_response = mpr.submit_snl(snl)
        except Exception as exc:
            return MessageContainer(str(exc), kind="warning")

    first_result = submission_response[0]
    header = f"Structure submission status: {first_result['status']}"
    message = first_result["details"]

    return MessageContainer(
        [MessageHeader(header), MessageBody(message)], kind="info"
    )
def test_to_from_dict(self):
    """StructureNL must survive an as_dict() -> from_dict() round trip."""
    projects = ['test_project']
    remarks = ['remark1']

    # no complicated objects in the 'data' or 'nodes' field
    plain_data = {"_my_data": "string"}
    plain_history = [self.valid_node, self.valid_node2]
    a = StructureNL(self.s, self.hulk, projects, self.pmg, remarks,
                    plain_data, plain_history)
    b = StructureNL.from_dict(a.as_dict())
    self.assertEqual(a, b)

    # complicated objects in the 'data' and 'nodes' field
    embedded = {"structure": self.s2}
    complicated_node = dict(
        name="complicated node",
        url="www.complicatednodegoeshere.com",
        description=embedded,
    )
    a = StructureNL(self.s, self.hulk, ['test_project'], self.pmg,
                    ['remark1'], {"_my_data": {"structure": self.s2}},
                    [complicated_node, self.valid_node])
    b = StructureNL.from_dict(a.as_dict())
    failure_message = ('to/from dict is broken when object embedding is '
                       'used! Apparently MontyEncoding is broken...')
    self.assertEqual(a, b, failure_message)

    # Test molecule
    molnl = StructureNL(self.mol, self.hulk, references=self.pmg)
    mol_dict = molnl.as_dict()
    b = StructureNL.from_dict(mol_dict)
    self.assertEqual(molnl, b)
# Output file name: "<stoich>_<energy_order_prefix>_<unique_id>.json".
filename = stoich + "_" + energy_order_prefix + "_" + unique_id
filename += ".json"

# Convert the row's atoms object with AseAtomsAdaptor.get_structure.
atoms = row_i.atoms
struct = AseAtomsAdaptor().get_structure(atoms)

# Free-form SNL data; the key is namespaced with a leading underscore.
extra_data = {
    "_MPContribs_Internal_ID": unique_id,
}

# Wrap the structure with its provenance metadata. `authors`, `remarks` and
# `date` are defined outside this fragment.
struct_NL = StructureNL(
    struct,
    authors,
    projects=None,
    references="",
    remarks=remarks,
    data=extra_data,
    # history=extra_data,
    created_at=date,
)

# Write the SNL as indented JSON to out_data/<filename>.
# NOTE(review): assumes the "out_data" directory already exists - confirm.
path_i = os.path.join("out_data", filename)
with open(path_i,"w") as file:
    json.dump(
        struct_NL.as_dict(),
        file,
        indent=2,
    )
def parse_vasp_dirs(vaspdirs, tag, task_ids, snl_metas):  # noqa: C901
    # Parse each VASP directory into a task doc, optionally build an SNL for
    # it, and - only when the `run` flag is set - insert both into the target
    # database and delete the directory. Returns the number of directories
    # counted as processed.
    #
    # task_ids: either a dict keyed by launcher name, or a sequence indexed
    #   by [chunk index][count].
    # snl_metas: SNL metadata keyed by launcher name; only used if a dict.
    process = multiprocessing.current_process()
    name = process.name
    # assumes the process name looks like "<prefix>-<N>"; chunk index is N-1
    chunk_idx = int(name.rsplit("-")[1]) - 1
    logger.info(f"{name} starting.")
    tags = [tag, SETTINGS.year_tags[-1]]
    ctx = click.get_current_context()
    spec_or_dbfile = ctx.parent.parent.params["spec_or_dbfile"]
    target = calcdb_from_mgrant(spec_or_dbfile)
    snl_collection = target.db.snls_user
    # distinct sandbox values, with falsy entries filtered out
    sbxn = list(filter(None, target.collection.distinct("sbxn")))
    logger.info(f"Using sandboxes {sbxn}.")
    no_dupe_check = ctx.parent.parent.params["no_dupe_check"]
    run = ctx.parent.parent.params["run"]
    projection = {"tags": 1, "task_id": 1}
    count = 0
    drone = VaspDrone(
        additional_fields={"tags": tags},
        store_volumetric_data=ctx.params["store_volumetric_data"],
    )

    for vaspdir in vaspdirs:
        logger.info(f"{name} VaspDir: {vaspdir}")
        launcher = get_subdir(vaspdir)
        # most recently inserted task (highest _id) whose dir_name matches
        query = {"dir_name": {"$regex": launcher}}
        docs = list(
            target.collection.find(query, projection).sort([("_id", -1)]).limit(1))

        if docs:
            if no_dupe_check:
                logger.warning(f"FORCING re-parse of {launcher}!")
            else:
                # already parsed: remove the directory (real runs only) and skip
                if run:
                    shutil.rmtree(vaspdir)
                    logger.warning(
                        f"{name} {launcher} already parsed -> removed.")
                else:
                    logger.warning(
                        f"{name} {launcher} already parsed -> would remove.")
                continue

        try:
            task_doc = drone.assimilate(vaspdir)
        except Exception as ex:
            logger.error(f"Failed to assimilate {vaspdir}: {ex}")
            continue

        task_doc["sbxn"] = sbxn
        manual_taskid = isinstance(task_ids, dict)
        snl_metas_avail = isinstance(snl_metas, dict)
        task_id = task_ids[launcher] if manual_taskid else task_ids[chunk_idx][
            count]
        task_doc["task_id"] = task_id
        logger.info(f"Using {task_id} for {launcher}.")

        if docs:
            # make sure that task gets the same tags as the previously parsed task
            # (run through set to implicitly remove duplicate tags)
            if docs[0]["tags"]:
                existing_tags = list(set(docs[0]["tags"]))
                task_doc["tags"] += existing_tags
                logger.info(f"Adding existing tags {existing_tags} to {tags}.")

        # Build the SNL dict up front; it is only inserted further below once
        # the task itself is found in the collection.
        snl_dct = None
        if snl_metas_avail:
            snl_meta = snl_metas.get(launcher)
            if snl_meta:
                references = snl_meta.get("references")
                authors = snl_meta.get(
                    "authors", ["Materials Project <*****@*****.**>"])
                kwargs = {"projects": [tag]}
                if references:
                    kwargs["references"] = references

                struct = Structure.from_dict(task_doc["input"]["structure"])
                snl = StructureNL(struct, authors, **kwargs)
                snl_dct = snl.as_dict()
                snl_dct.update(get_meta_from_structure(struct))
                snl_id = snl_meta["snl_id"]
                snl_dct["snl_id"] = snl_id
                logger.info(f"Created SNL object for {snl_id}.")

        if run:
            if task_doc["state"] == "successful":
                if docs and no_dupe_check:
                    target.collection.remove({"task_id": task_id})
                    logger.warning(
                        f"Removed previously parsed task {task_id}!")

                try:
                    target.insert_task(task_doc, use_gridfs=True)
                except DocumentTooLarge:
                    # Drop bulky output keys one at a time and retry the
                    # insert after each removal.
                    output = dotty(task_doc["calcs_reversed"][0]["output"])
                    pop_keys = [
                        "normalmode_eigenvecs",
                        "force_constants",
                        "outcar.onsite_density_matrices",
                    ]

                    for k in pop_keys:
                        if k not in output:
                            continue

                        logger.warning(f"{name} Remove {k} and retry ...")
                        output.pop(k)
                        try:
                            target.insert_task(task_doc, use_gridfs=True)
                            break
                        except DocumentTooLarge:
                            continue
                    else:
                        # for/else: no retry succeeded, skip this directory
                        logger.warning(
                            f"{name} failed to reduce document size")
                        continue

                # only clean up once the task is actually in the collection
                if target.collection.count(query):
                    if snl_dct:
                        result = snl_collection.insert_one(snl_dct)
                        logger.info(
                            f"SNL {result.inserted_id} inserted into {snl_collection.full_name}."
                        )

                    shutil.rmtree(vaspdir)
                    logger.info(
                        f"{name} Successfully parsed and removed {launcher}.")
                    count += 1
        else:
            # dry run: nothing is written or removed, but the dir is counted
            count += 1

    return count
def assimilate(self, path, dbhost='localhost', dbport=27017, dbname='ICSD',
               collection_name='ICSD_files', store_mongo=True):
    """
    Assimilate data in a directory path into a pymatgen object. Because of
    the quirky nature of Python's multiprocessing, the object must support
    pymatgen's as_dict() for parallel processing.

    The directory name is taken as the ICSD id and the CIF file is expected
    at ``<path>/<icsd_id>.cif``. If the CIF cannot be turned into a
    structure (ValueError), the id is appended to the file 'Error_Record'
    and nothing is written to MongoDB.

    Args:
        path: directory path; its last component must be an integer id
        dbhost: MongoDB host used when store_mongo is True
        dbport: MongoDB port used when store_mongo is True
        dbname: database to write to
        collection_name: collection to upsert the document into
        store_mongo: whether to upsert the assimilated data into MongoDB

    Returns:
        dict with the assimilated data; it holds an 'snl' entry only when
        the structure could be parsed

    Raises:
        ValueError: if the directory name is not an integer
    """
    data = {}
    # Kept from the original as a fail-fast check that `path` is a readable
    # directory; the listing itself is not used.
    os.listdir(path)
    # normpath first, so a trailing separator does not yield an empty id.
    file_ID = os.path.basename(os.path.normpath(path))
    print(file_ID)
    data['icsd_id'] = int(file_ID)

    def _record_warnings(caught):
        # Store every captured warning message on the doc and log it.
        for item in caught:
            data.setdefault('cifwarnings', []).append(str(item.message))
            logger.warning('{}: {}'.format(file_ID, item.message))

    cif_path = os.path.join(path, file_ID + '.cif')

    # capture any warnings generated by parsing cif file
    with warnings.catch_warnings(record=True) as w:
        cif_parser = CifParser(cif_path)
        _record_warnings(w)

    cif_dict = cif_parser.as_dict()
    orig_id = list(cif_dict.keys())[0]
    easy_dict = cif_dict[orig_id]

    if '_chemical_name_mineral' in easy_dict:
        data['min_name'] = easy_dict['_chemical_name_mineral']
    if '_chemical_name_systematic' in easy_dict:
        data['chem_name'] = easy_dict['_chemical_name_systematic']
    if '_cell_measurement_pressure' in easy_dict:
        data['pressure'] = float(
            easy_dict['_cell_measurement_pressure']) / 1000
    else:
        # default used when the CIF gives no measurement pressure
        data['pressure'] = .101325

    with warnings.catch_warnings(record=True) as w:
        try:
            struc = cif_parser.get_structures()[0]
        except ValueError as err:
            # if cif parsing raises error, write icsd_id to Error_Record and
            # do NOT add structure to mongo database
            logger.error(
                file_ID + ': {}'.format(err) +
                "\nDid not insert structure into Mongo Collection")
            # the with-block closes the file; no explicit close() needed
            with open('Error_Record', 'a') as err_rec:
                err_rec.write(str(file_ID) + ': {}\n'.format(err))
        else:
            authors = 'Donny Winston<*****@*****.**>, Joseph Palakapilly<*****@*****.**>'
            references = self.bibtex_from_cif(cif_path)
            history = [{
                'name': 'ICSD',
                'url': 'https://icsd.fiz-karlsruhe.de/',
                'description': {
                    'icsd_id': file_ID
                }
            }]
            snl = StructureNL(struc, authors=authors,
                              references=references, history=history)
            data['snl'] = snl.as_dict()

            meta = get_meta_from_structure(struc)
            # (key in data, key in meta), in the original insertion order
            copied_fields = (
                ('nsites', 'nsites'),
                ('elements', 'elements'),
                ('nelements', 'nelements'),
                ('formula', 'formula'),
                ('formula_reduced', 'formula_pretty'),
                ('formula_reduced_abc', 'formula_reduced_abc'),
                ('formula_anonymous', 'formula_anonymous'),
                ('chemsys', 'chemsys'),
                ('is_valid', 'is_valid'),
                ('is_ordered', 'is_ordered'),
            )
            for target_key, meta_key in copied_fields:
                data[target_key] = meta[meta_key]

        # unfortunately any warnings are logged after any errors.
        # Not too big of an issue
        _record_warnings(w)

    if 'snl' in data and store_mongo:
        # Open the connection only when there is something to store, and
        # always release it (it used to be left open).
        client = MongoClient(dbhost, dbport)
        try:
            col = client[dbname][collection_name]
            col.update_one({'icsd_id': int(file_ID)},
                           {'$set': data}, upsert=True)
        finally:
            client.close()

    return data
def test_remarks(self):
    """A string remark becomes a one-element list; invalid remarks raise ValueError."""
    snl = StructureNL(self.s, self.hulk, remarks="string format")
    first_remark = snl.remarks[0]
    self.assertEqual(first_remark, "string format")

    # Read the fixture outside the context manager so that only the
    # constructor call is checked for the exception.
    bad_remarks = self.remark_fail
    with self.assertRaises(ValueError):
        StructureNL(self.s, self.hulk, remarks=bad_remarks)