def create_correlation_test_docs(n_materials=200):
    """
    Creates JSON file containing a certain number of materials and their
    necessary properties to load the propnet store for correlation tests.

    The documents are read from the "propnet_july2019" collection and written
    to "correlation_propnet_data.json" inside CORR_TEST_DIR.

    Args:
        n_materials (int): maximum number of material documents to store in
            the output file. Defaults to 200.
    """
    from itertools import islice

    pnstore = MongograntStore("ro:mongodb03.nersc.gov/propnet",
                              "propnet_july2019")
    pnstore.connect()

    # A document qualifies only if, for every property in PROPNET_PROPS, it
    # either has a field with that name or lists it under
    # 'inputs.symbol_type'.
    criteria = {
        '$and': [{
            '$or': [{
                p: {
                    '$exists': True
                }
            }, {
                'inputs.symbol_type': p
            }]
        } for p in PROPNET_PROPS]
    }
    properties = (['task_id', 'inputs'] +
                  [p + '.mean' for p in PROPNET_PROPS] +
                  [p + '.units' for p in PROPNET_PROPS] +
                  [p + '.quantities' for p in PROPNET_PROPS])

    cursor = pnstore.query(criteria=criteria, properties=properties)
    try:
        # islice stops after n_materials documents without pulling an extra
        # one from the server.
        data = list(islice(cursor, n_materials))
    finally:
        # Release the cursor on every path: early stop, fewer results than
        # requested, or an error while iterating.
        cursor.close()

    dumpfn(data, os.path.join(CORR_TEST_DIR, "correlation_propnet_data.json"))
def process_sdf_file(filename):
    """
    Load the molecules of an SDF file into the "mp_pubchem" collection.

    Each molecule read by pybel is converted to a document holding its
    PubChem compound id, its XYZ representation, and every field listed in
    the module-level ``keys`` that the molecule provides (stored under the
    name given by the module-level ``key_map``). Molecules for which a
    required lookup raises KeyError are counted as skipped. After the
    documents are inserted, the file is renamed to ``filename + ".processed"``.

    Args:
        filename (str): path to the SDF file to process.

    Returns:
        tuple: (number of molecule documents collected, number of molecules
        skipped because of a KeyError).
    """
    mp_pubchem = MongograntStore("rw:knowhere.lbl.gov/mp_pubchem",
                                 "mp_pubchem", key="pubchem_id")
    mp_pubchem.connect()
    coll = mp_pubchem.collection

    skipped = 0
    pubchem_molecules = []
    for mol in pybel.readfile("sdf", filename):
        try:
            pubchem_id = int(mol.data["PUBCHEM_COMPOUND_CID"])
            xyz = mol.write(format="xyz")
            data = {"pubchem_id": pubchem_id, "xyz": xyz}
            for key in keys:
                if key in mol.data:
                    data[key_map[key]] = mol.data[key]
        except KeyError:
            skipped += 1
        else:
            pubchem_molecules.append(data)

    # pymongo rejects an empty document list, so only insert when at least
    # one molecule was collected; the file is still marked as processed.
    if pubchem_molecules:
        coll.insert_many(pubchem_molecules)

    os.rename(filename, filename + ".processed")
    return len(pubchem_molecules), skipped
def create_test_docs(self):
    """
    Write the JSON fixture files used by the correlation tests.

    Finds up to 200 task ids that appear both in the "propnet" collection
    (matching every property in ``self.propnet_props``) and in the
    "materials" collection (having every field in ``self.mp_query_props``),
    then dumps the matching documents from each collection to
    "correlation_propnet_data.json" and "correlation_mp_data.json" in
    TEST_DIR.
    """
    from maggma.advanced_stores import MongograntStore
    from monty.serialization import dumpfn

    pnstore = MongograntStore("ro:knowhere.lbl.gov/mp_core", "propnet")
    pnstore.connect()
    mpstore = MongograntStore("ro:knowhere.lbl.gov/mp_core", "materials")
    mpstore.connect()

    # propnet side: each property must be present either as a field or as an
    # entry of 'inputs.symbol_type'.
    pn_criteria = {
        '$and': [
            {'$or': [{prop: {'$exists': True}}, {'inputs.symbol_type': prop}]}
            for prop in self.propnet_props
        ]
    }
    pn_mpids = []
    for doc in pnstore.query(criteria=pn_criteria, properties=['task_id']):
        pn_mpids.append(doc['task_id'])

    # materials side: every queried field must exist.
    mp_criteria = {}
    for prop in self.mp_query_props:
        mp_criteria[prop] = {'$exists': True}
    mp_mpids = []
    for doc in mpstore.query(criteria=mp_criteria, properties=['task_id']):
        mp_mpids.append(doc['task_id'])

    # Keep at most 200 ids common to both collections.
    pn_id_set = set(pn_mpids)
    mp_id_set = set(mp_mpids)
    shared_ids = pn_id_set.intersection(mp_id_set)
    mpids = list(shared_ids)[:200]

    pn_properties = ['task_id', 'inputs']
    pn_properties = pn_properties + [p + '.mean' for p in self.propnet_props]
    pn_properties = pn_properties + [p + '.units' for p in self.propnet_props]
    pn_data = pnstore.query(criteria={'task_id': {'$in': mpids}},
                            properties=pn_properties)
    pn_path = os.path.join(TEST_DIR, "correlation_propnet_data.json")
    dumpfn(list(pn_data), pn_path)

    mp_properties = ['task_id'] + self.mp_query_props
    mp_data = mpstore.query(criteria={'task_id': {'$in': mpids}},
                            properties=mp_properties)
    mp_path = os.path.join(TEST_DIR, "correlation_mp_data.json")
    dumpfn(list(mp_data), mp_path)