def test_from_structures(self):
    """StructureNL.from_structures must attach the given remarks and the
    parsed author to every generated SNL."""
    lattice = [[5, 0, 0], [0, 5, 0], [0, 0, 5]]
    structures = [
        Structure(lattice, [elem], [[0, 0, 0]]) for elem in ("Fe", "Mn")
    ]
    remarks = ["unittest"]
    authors = "Test User <*****@*****.**>"
    snl_list = StructureNL.from_structures(structures, authors, remarks=remarks)
    self.assertEqual(len(snl_list), 2)
    expected_authors = [Author.parse_author(authors)]
    # Every SNL in the batch must carry the same provenance metadata.
    for snl in snl_list:
        self.assertEqual(snl.remarks, remarks)
        self.assertEqual(snl.authors, expected_authors)
def snls(structure):
    """Build three SNL fixture documents for *structure*.

    Each document gets a distinct author name and a synthetic ``snl_id``
    mimicking the icsd/user/pf source collections.

    Args:
        structure: structure object accepted by ``StructureNL``.

    Returns:
        list[dict]: three SNL dicts with ``snl_id`` keys
        ``icsd-2``, ``user-1``, ``pf-3``.
    """
    docs = [
        StructureNL(
            structure,
            # BUG FIX: was the plain literal "test{i}" — the loop variable
            # was never interpolated, so all three docs shared one author.
            authors=[Author(f"test{i}", "*****@*****.**").as_dict()],
            history=[HistoryNode("nothing", "url.com", {})],
            created_at=datetime.utcnow(),
        ).as_dict()
        for i in range(3)
    ]
    docs[0]["snl_id"] = "icsd-2"
    docs[1]["snl_id"] = "user-1"
    docs[2]["snl_id"] = "pf-3"
    return docs
def prep(ctx, archive, authors):
    """prep structures from an archive for submission

    Loads structures from a ``.bson.gz``, ``.zip`` or ``.tar.gz``/``.tgz``
    archive, skips already-scanned / disordered / duplicate entries, and
    builds SNL documents tagged with the archive's basename. SNLs are only
    inserted into the DB when the CLI ``--run`` flag is set.
    """
    run = ctx.obj["RUN"]  # dry-run unless --run was passed
    collections = ctx.obj["COLLECTIONS"]
    snl_collection = ctx.obj["CLIENT"].db.snls
    handler = ctx.obj["MONGO_HANDLER"]
    nmax = ctx.obj["NMAX"]  # cap on number of structures loaded
    skip = ctx.obj["SKIP"]  # skip source ids already scanned for this tag
    # TODO no_dupe_check flag
    # Derive tag and (possibly double) extension from the archive filename,
    # e.g. "mytag.tar.gz" -> tag "mytag", ext "tar.gz".
    fname, ext = os.path.splitext(os.path.basename(archive))
    tag, sec_ext = fname.rsplit(".", 1) if "." in fname else [fname, ""]
    logger.info(click.style(f"tag: {tag}", fg="cyan"))
    if sec_ext:
        ext = "".join([sec_ext, ext])
    exts = ["tar.gz", ".tgz", "bson.gz", ".zip"]
    if ext not in exts:
        raise EmmetCliError(
            f"{ext} not supported (yet)! Please use one of {exts}.")
    meta = {"authors": [Author.parse_author(a) for a in authors]}
    # NOTE(review): `meta` never contains a "references" key here, so this is
    # always "" — looks like a placeholder for future reference support; confirm.
    references = meta.get("references", "").strip()
    source_ids_scanned = handler.collection.distinct("source_id", {"tags": tag})
    # TODO add archive of StructureNL files
    input_structures, source_total = [], None
    if ext == "bson.gz":
        # BSON dump: one document per structure, keyed by "db_id".
        input_bson = gzip.open(archive)
        source_total = count_file_documents(input_bson)
        for doc in bson.decode_file_iter(input_bson):
            if len(input_structures) >= nmax:
                break
            if skip and doc["db_id"] in source_ids_scanned:
                continue
            elements = set([
                specie["element"] for site in doc["structure"]["sites"]
                for specie in site["species"]
            ])
            for l in SETTINGS.skip_labels:
                if l in elements:
                    logger.log(
                        logging.ERROR if run else logging.INFO,
                        f'Skip structure {doc["db_id"]}: unsupported element {l}!',
                        extra={
                            "tags": [tag],
                            "source_id": doc["db_id"]
                        },
                    )
                    break
            else:  # for/else: no unsupported element found -> keep structure
                s = TransformedStructure.from_dict(doc["structure"])
                s.source_id = doc["db_id"]
                input_structures.append(s)
    elif ext == ".zip":
        # Zip archive: one structure file per member, member name = source id.
        input_zip = ZipFile(archive)
        namelist = input_zip.namelist()
        source_total = len(namelist)
        for fname in namelist:
            if len(input_structures) >= nmax:
                break
            if skip and fname in source_ids_scanned:
                continue
            contents = input_zip.read(fname)
            fmt = get_format(fname)
            s = Structure.from_str(contents, fmt=fmt)
            s.source_id = fname
            input_structures.append(s)
    else:
        # Gzipped tarball: hidden files skipped, lowercased name = source id.
        tar = tarfile.open(archive, "r:gz")
        members = tar.getmembers()
        source_total = len(members)
        for member in members:
            if os.path.basename(member.name).startswith("."):
                continue
            if len(input_structures) >= nmax:
                break
            fname = member.name.lower()
            if skip and fname in source_ids_scanned:
                continue
            f = tar.extractfile(member)
            if f:  # directories/special members yield None
                contents = f.read().decode("utf-8")
                fmt = get_format(fname)
                s = Structure.from_str(contents, fmt=fmt)
                s.source_id = fname
                input_structures.append(s)

    total = len(input_structures)
    logger.info(
        f"{total} of {source_total} structure(s) loaded "
        f"({len(source_ids_scanned)} unique structures already scanned).")
    save_logs(ctx)
    snls, index = [], None
    for istruct in input_structures:
        # number of log messages equals number of structures processed if --run
        # only logger.warning goes to DB if --run
        if run and len(handler.buffer) >= handler.buffer_size:
            # flush pending SNLs before the log handler's buffer overflows
            insert_snls(ctx, snls)

        struct = (istruct.final_structure if isinstance(
            istruct, TransformedStructure) else istruct)
        struct.remove_oxidation_states()
        struct = struct.get_primitive_structure()
        formula = struct.composition.reduced_formula
        sg = get_sg(struct)
        if not (struct.is_ordered and struct.is_valid()):
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"Skip structure {istruct.source_id}: disordered or invalid!",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )
            continue

        # Nested for/else/break: the inner for/else continues to the next
        # collection when no duplicate is found; the outer else only runs
        # when every collection was exhausted without a duplicate.
        for full_name, coll in collections.items():
            # load canonical structures in collection for current formula and
            # duplicate-check them against current structure
            load_canonical_structures(ctx, full_name, formula)
            for canonical_structure in canonical_structures[full_name][
                    formula].get(sg, []):
                if structures_match(struct, canonical_structure):
                    logger.log(
                        logging.WARNING if run else logging.INFO,
                        f"Duplicate for {istruct.source_id} ({formula}/{sg}): {canonical_structure.id}",
                        extra={
                            "formula": formula,
                            "spacegroup": sg,
                            "tags": [tag],
                            "source_id": istruct.source_id,
                            "duplicate_dbname": full_name,
                            "duplicate_id": canonical_structure.id,
                        },
                    )
                    break
            else:
                continue  # no duplicate found -> continue to next collection
            break  # duplicate found
        else:  # no duplicates in any collection
            prefix = snl_collection.database.name
            if index is None:
                # get start index for SNL id
                snl_ids = snl_collection.distinct("snl_id")
                index = max(
                    [int(snl_id[len(prefix) + 1:]) for snl_id in snl_ids])
            index += 1
            snl_id = "{}-{}".format(prefix, index)
            kwargs = {"references": references, "projects": [tag]}
            if isinstance(istruct, TransformedStructure):
                snl = istruct.to_snl(meta["authors"], **kwargs)
            else:
                snl = StructureNL(istruct, meta["authors"], **kwargs)
            snl_dct = snl.as_dict()
            snl_dct.update(get_meta_from_structure(struct))
            snl_dct["snl_id"] = snl_id
            snls.append(snl_dct)
            logger.log(
                logging.WARNING if run else logging.INFO,
                f"SNL {snl_id} created for {istruct.source_id} ({formula}/{sg})",
                extra={
                    "formula": formula,
                    "spacegroup": sg,
                    "tags": [tag],
                    "source_id": istruct.source_id,
                },
            )

    # final save
    if run:
        insert_snls(ctx, snls)
def process_mpfile(path_or_mpfile, target=None, fmt='archieml', ids=None):
    """Generator that processes an MPFile contribution by contribution.

    Yields HTML progress strings (and, when not submitting, build docs and
    an overview-data dict) as it validates, optionally submits, and builds
    each contribution.

    NOTE(review): this is Python-2-era code (``six``, ``dict.keys()[0]``,
    ``.iteritems()``); it will not run unchanged on Python 3 — confirm the
    target runtime.

    Args:
        path_or_mpfile: path to an MPFile, a StringIO, or an MPFile object.
        target: optional rester to submit/build contributions against;
            when None, contributions are built locally instead.
        fmt: mpcontribs IO format module to use ('archieml' by default).
        ids: optional [mp_cat_id, cid_short] pair restricting processing
            to a single contribution.
    """
    try:
        if isinstance(path_or_mpfile, six.string_types) and \
                not os.path.isfile(path_or_mpfile):
            raise Exception('{} not found'.format(path_or_mpfile))
        if ids is not None and not isinstance(ids, list) and not len(ids) == 2:
            raise Exception('{} is not list of length 2!'.format(ids))
        from pymatgen.analysis.structure_matcher import StructureMatcher
        # Resolve the MPFile class for the requested IO format at runtime.
        mod = import_module('mpcontribs.io.{}.mpfile'.format(fmt))
        MPFile = getattr(mod, 'MPFile')
        full_name = pwd.getpwuid(os.getuid())[4]
        contributor = '{} <*****@*****.**>'.format(full_name)  # fake
        cma = ContributionMongoAdapter()
        axes, ov_data = set(), dict()
        mpfile_out, cid_shorts = MPFile(), []  # output
        sm = StructureMatcher(primitive_cell=False, scale=False)
        # split input MPFile into contributions: treat every mp_cat_id as separate DB insert
        mpfile_in = path_or_mpfile
        if isinstance(path_or_mpfile, six.string_types) or isinstance(
                path_or_mpfile, StringIO):
            mpfile_in = MPFile.from_file(path_or_mpfile)
        for idx, mpfile_single in enumerate(mpfile_in.split()):
            # single-contribution document: its only top-level key is the id
            mp_cat_id = mpfile_single.document.keys()[0]
            if ids is None or mp_cat_id == ids[0]:
                cid = mpfile_single.document[mp_cat_id].get('cid', None)
                update = bool(cid is not None)
                if update:
                    cid_short = get_short_object_id(cid)
                    yield 'use #{} to update #{} ... '.format(idx, cid_short)
                # always run local "submission" to catch failure before interacting with DB
                yield 'process #{} ({}) ... '.format(idx, mp_cat_id)
                doc = cma.submit_contribution(
                    mpfile_single, contributor)  # does not use get_string
                cid = doc['_id']
                cid_short = get_short_object_id(cid)
                if ids is None or cid_short == ids[1]:
                    yield 'check ... '
                    # round-trip check skipped for large (>0.5MB) objects
                    obj_size = asizeof.asizeof(mpfile_single) / 1024. / 1024.
                    if obj_size > 0.5:
                        yield 'skip ({:.3f}MB) ... '.format(obj_size)
                    else:
                        # Round-trip the contribution through its string form
                        # and diff the documents to catch serialization bugs.
                        try:
                            mpfile_single_cmp_str = mpfile_single.get_string()
                        except Exception as ex:
                            yield 'get_string() FAILED!<br>'
                            continue
                        try:
                            mpfile_single_cmp = MPFile.from_string(
                                mpfile_single_cmp_str)
                        except Exception as ex:
                            yield 'from_string() FAILED!<br>'
                            continue
                        if mpfile_single.document != mpfile_single_cmp.document:
                            yield 'check again ... '
                            found_inconsistency = False
                            # check structural data
                            structures_ok = True
                            for name, s1 in mpfile_single.sdata[
                                    mp_cat_id].iteritems():
                                s2 = mpfile_single_cmp.sdata[mp_cat_id][name]
                                if s1 != s2:
                                    if len(s1) != len(s2):
                                        yield 'different number of sites: {} -> {}!<br>'.format(
                                            len(s1), len(s2))
                                        structures_ok = False
                                        break
                                    if s1.lattice != s2.lattice:
                                        yield 'lattices different!<br>'
                                        structures_ok = False
                                        break
                                    for site in s1:
                                        if site not in s2:
                                            found_inconsistency = True
                                            if not sm.fit(s1, s2):
                                                yield 'structures do not match!<br>'
                                                structures_ok = False
                                                break
                                    if not structures_ok:
                                        break
                            if not structures_ok:
                                continue
                            # check hierarchical and tabular data
                            # compare json strings to find first inconsistency
                            json_compare(mpfile_single.hdata,
                                         mpfile_single_cmp.hdata)
                            json_compare(mpfile_single.tdata,
                                         mpfile_single_cmp.tdata)
                            if not found_inconsistency:
                                # documents are not equal, but all components checked, skip contribution
                                # should not happen
                                yield 'inconsistency found but not identified!<br>'
                                continue
                    if target is not None:
                        yield 'submit ... '
                        cid = target.submit_contribution(
                            mpfile_single, fmt)  # uses get_string
                        mpfile_single.insert_id(mp_cat_id, cid)
                        cid_shorts.append(cid_short)
                    if target is not None:
                        # first five contributions are built eagerly, the
                        # rest are only flagged for a later build
                        if idx < 5:
                            yield 'build ... '
                            url = target.build_contribution(cid)
                            url = '/'.join([
                                target.preamble.rsplit('/', 1)[0],
                                'explorer', url
                            ])
                            yield (
                                "OK. <a href='{}' class='btn btn-default btn-xs' " +
                                "role='button' target='_blank'>View</a></br>"
                            ).format(url)
                        else:
                            target.set_build_flag(cid, True)
                            yield 'OK (queued).</br>'
                    else:
                        # no submission target: build locally (or skip)
                        if (ids is None and idx < 5) or ids is not None:
                            yield 'build ... '
                            mcb = MPContributionsBuilder(doc)
                            build_doc = mcb.build(contributor, cid)
                        else:
                            yield 'skip ... '
                            from pymatgen.util.provenance import Author
                            author = Author.parse_author(contributor)
                            build_doc = [mp_cat_id, author.name, cid_short, '']
                        yield build_doc
                        yield 'overview axes ... '
                        # Walk the hierarchical data and collect dotted
                        # scope names of numeric leaves as overview axes;
                        # only axes common to all contributions survive.
                        scope, local_axes = [], set()
                        mpfile_for_axes = MPFile.from_contribution(doc)
                        for k, v in mpfile_for_axes.hdata[mp_cat_id].iterate():
                            if v is None:
                                # section header: reset scope to this depth
                                scope = scope[:k[0]]
                                scope.append(k[1])
                            else:
                                try:
                                    if k[0] == len(scope):
                                        scope.append(k[1])
                                    else:
                                        scope[-1] = k[1]
                                    vf = float(v)  # trigger exception
                                    scope_str = '.'.join(scope)
                                    if idx == 0:
                                        axes.add(scope_str)
                                        ov_data[scope_str] = {
                                            cid_short: (vf, mp_cat_id)
                                        }
                                    else:
                                        local_axes.add(scope_str)
                                        ov_data[scope_str][cid_short] = (
                                            vf, mp_cat_id)
                                except:
                                    # non-numeric leaf: not an overview axis
                                    pass
                        if idx > 0:
                            axes.intersection_update(local_axes)
                        yield 'OK.</br>'.format(idx, cid_short)
                else:
                    yield 'wrong CID.</br>'
            mpfile_out.concat(mpfile_single)
            time.sleep(.01)
        ncontribs = len(cid_shorts)
        if target is not None:
            yield '<strong>{} contributions successfully submitted.</strong>'.format(
                ncontribs)
        else:
            # drop overview data for axes not shared by all contributions
            for k in ov_data.keys():
                if k not in axes:
                    ov_data.pop(k)
            yield ov_data
            yield '<strong>{} contributions successfully processed.</strong>'.format(
                ncontribs)
    except:
        # top-level catch-all: surface the failure to the streaming consumer
        ex = sys.exc_info()[1]
        yield 'FAILED.</br>'
        yield str(ex).replace('"', "'")
        return
def build(self, contributor_email, cid, api_key=None, endpoint=None):
    """update materials/compositions collections with contributed data

    Renders the contribution ``cid`` into a Jupyter notebook (hierarchical,
    tabular, graphical and structural data sections), executes it, and either
    returns the exported notebook (local dict-backed DB) or upserts the build
    document into the materials/compositions collection and returns the
    contribution-page URL.

    NOTE(review): ``str.translate(None, '.')`` below is Python-2-only; on
    Python 3 translate requires a mapping — confirm the target runtime.

    Args:
        contributor_email: must be among the contribution's collaborators.
        cid: contribution ObjectId.
        api_key/endpoint: MP API credentials, only used for the rester-backed
            notebook cells (non-dict DB branch).

    Raises:
        Exception: if the contribution is not found.
        ValueError: if contributor_email lacks permission.
    """
    cid_short, cid_str = get_short_object_id(cid), str(cid)
    contrib = self.find_contribution(cid)
    if not contrib:
        raise Exception('Contribution {} not found!'.format(cid))
    if contributor_email not in contrib['collaborators']:
        raise ValueError(
            "Build stopped: building contribution {} not "
            "allowed due to insufficient permissions of {}! Ask "
            "someone of {} to make you a collaborator on {}.".format(
                cid_short, contributor_email, contrib['collaborators'],
                cid_short))
    from pymatgen.util.provenance import Author
    mpfile = MPFileCore.from_contribution(contrib)
    mp_cat_id = mpfile.ids[0]
    # mp-id style identifiers go to the materials collection, everything
    # else (compositions) to the compositions collection
    is_mp_id = mp_id_pattern.match(mp_cat_id)
    self.curr_coll = self.materials if is_mp_id else self.compositions
    author = Author.parse_author(contributor_email)
    # default project name: author name with dots stripped (Py2 translate)
    project = str(author.name).translate(None, '.') \
        if 'project' not in contrib else contrib['project']
    nb = nbf.new_notebook()
    if isinstance(self.db, dict):
        # local/dict-backed DB: embed the full contribution in the notebook
        contrib.pop('_id')
        if 'cid' in contrib['content']:
            contrib['content'].pop('cid')
        nb['cells'].append(
            nbf.new_code_cell(
                "from mpcontribs.io.core.mpfile import MPFileCore\n"
                "from mpcontribs.io.core.recdict import RecursiveDict\n"
                "mpfile = MPFileCore.from_contribution({})\n"
                "identifier = '{}'".format(contrib, mp_cat_id)))
    else:
        # real DB: notebook fetches the contribution through the rester
        nb['cells'].append(
            nbf.new_code_cell(
                "from mpcontribs.rest.rester import MPContribsRester"))
        os.environ['PMG_MAPI_KEY'] = api_key
        os.environ['PMG_MAPI_ENDPOINT'] = endpoint
        nb['cells'].append(
            nbf.new_code_cell(
                "with MPContribsRester() as mpr:\n"
                "    mpfile = mpr.find_contribution('{}')\n"
                "    identifier = mpfile.ids[0]".format(cid)))
    nb['cells'].append(
        nbf.new_markdown_cell("## Contribution #{} for {}".format(
            cid_short, mp_cat_id)))
    nb['cells'].append(nbf.new_markdown_cell("### Hierarchical Data"))
    nb['cells'].append(nbf.new_code_cell("mpfile.hdata[identifier]"))
    # One markdown header + one code cell per table/plot/structure entry.
    if mpfile.tdata[mp_cat_id]:
        nb['cells'].append(nbf.new_markdown_cell("### Tabular Data"))
        for table_name, table in mpfile.tdata[mp_cat_id].iteritems():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(table_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.tdata[identifier]['{}']".format(table_name)))
    if mpfile.gdata[mp_cat_id]:
        nb['cells'].append(nbf.new_markdown_cell("### Graphical Data"))
        for plot_name, plot in mpfile.gdata[mp_cat_id].iteritems():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(plot_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.gdata[identifier]['{}']".format(plot_name)))
    if mpfile.sdata[mp_cat_id]:
        nb['cells'].append(nbf.new_markdown_cell("### Structural Data"))
        for structure_name, structure in mpfile.sdata[mp_cat_id].iteritems():
            nb['cells'].append(
                nbf.new_markdown_cell("#### {}".format(structure_name)))
            nb['cells'].append(
                nbf.new_code_cell(
                    "mpfile.sdata[identifier]['{}']".format(structure_name)))
    # execute the assembled notebook in the configured notebook directory
    self.ep.preprocess(nb, {'metadata': {'path': self.nbdir}})
    if isinstance(self.db, dict):
        return [mp_cat_id, project, cid_short, export_notebook(nb, cid)]
    else:
        build_doc = RecursiveDict()
        build_doc['mp_cat_id'] = mp_cat_id
        build_doc['project'] = project
        build_doc['nb'] = nb
        self.curr_coll.update({'_id': cid}, {'$set': build_doc}, upsert=True)
        return '{}/{}'.format(  # return URL for contribution page
            ('materials' if is_mp_id else 'compositions'), cid_str)