def test_parse_reaxys_xml(self):
    """Check the reactions and metadata returned by ``parse_reaxys_xml``.

    Parses ``self.filename`` with ``self.parser`` and verifies that exactly
    three reactions come back with the expected indices and reaction IDs,
    that every reaction carries the same species on both sides once
    hydrogens have been added, and that the metadata of the last reaction
    matches the expected values.
    """

    def species_of(ctabs):
        # Species label of every site in the given mol-format strings,
        # collected after hydrogens have been added to each molecule.
        labels = []
        for ctab in ctabs:
            adaptor = BabelMolAdaptor.from_string(ctab, "mol")
            adaptor.add_hydrogen()
            labels += [str(site.specie) for site in adaptor.pymatgen_mol]
        return labels

    parsed = self.parser.parse_reaxys_xml(self.filename)

    # Ensure that all proper reactions are present
    self.assertEqual(len(parsed), 3)

    # assertIn reports both the missing value and the container on failure,
    # which assertTrue(x in y) does not.
    indices = [x["meta"]["index"] for x in parsed]
    for index in [1, 9, 10]:
        self.assertIn(index, indices)

    rxn_ids = [x["meta"]["rxn_id"] for x in parsed]
    for rxn_id in [28100547, 3553459, 8633298]:
        self.assertIn(rxn_id, rxn_ids)

    # Ensure that all reactions are actually appropriate: the sorted species
    # lists of products and reactants must be identical.
    for rxn in parsed:
        pro_species = species_of(rxn["pros"])
        rct_species = species_of(rxn["rcts"])
        self.assertSequenceEqual(sorted(pro_species), sorted(rct_species))

    # Test one reaction to make sure metadata is recorded correctly
    meta = parsed[-1]["meta"]
    self.assertEqual(meta["solvents"], {'dichloromethane'})
    self.assertEqual(len(meta["pro_meta"]), 1)
    self.assertEqual(len(meta["rct_meta"]), 2)
    self.assertSequenceEqual(
        meta["pro_meta"],
        [(5424566, '(4,5-dimethylcyclohexa-1,4-dienyl)trimethylsilane')])
    self.assertSequenceEqual(
        meta["rct_meta"],
        [(605285, '2,3-dimethyl-buta-1,3-diene'),
         (906752, 'trimethylsilylacetylene')])
def insert_g3testset(coll):
    """Load every ``g*.txt`` file in the working directory into ``coll``.

    Each file is handed to ``parse_file``, which yields
    ``(molecule, charge, spin)`` tuples. For every tuple the sites whose
    element symbol is valid are kept, the cleaned molecule is converted to
    XYZ text and passed through OpenBabel/pybel to obtain SMILES, canonical
    SMILES, InChI and SVG strings, and the resulting document is upserted
    into ``coll`` keyed on ``inchi``, ``charge`` and ``spin_multiplicity``.

    A failure while processing one molecule is reported on stdout (with a
    traceback limited to two frames) and does not stop the remaining
    molecules from being processed.

    :param coll: Collection object providing
        ``update(filter, update, upsert=...)`` (presumably a pymongo
        collection; confirm against the caller).
    """
    for path in glob.glob("g*.txt"):
        print("Parsing " + path)
        for (parsed_mol, charge, spin) in parse_file(path):
            try:
                # Drop sites whose species is not a valid element symbol.
                valid_sites = [
                    site for site in parsed_mol
                    if Element.is_valid_symbol(site.specie.symbol)
                ]
                clean_mol = Molecule.from_sites(valid_sites,
                                                charge=charge,
                                                spin_multiplicity=spin)
                xyz_text = str(XYZ(clean_mol))

                adaptor = BabelMolAdaptor.from_string(xyz_text, "xyz")
                pbmol = pb.Molecule(adaptor.openbabel_mol)
                smiles = pbmol.write("smi").split()[0]
                can = pbmol.write("can").split()[0]
                inchi = pbmol.write("inchi")
                svg = pbmol.write("svg")

                comp = clean_mol.composition
                composition = comp.as_dict()
                doc = {
                    "molecule": clean_mol.as_dict(),
                    "pretty_formula": comp.reduced_formula,
                    "formula": comp.formula,
                    "composition": composition,
                    "elements": list(composition),
                    "nelements": len(comp),
                    "charge": charge,
                    "spin_multiplicity": spin,
                    "smiles": smiles,
                    "can": can,
                    "inchi": inchi,
                    # "names" (via get_nih_names(smiles)) is currently
                    # disabled.
                    "svg": svg,
                    "xyz": xyz_text,
                    "tags": ["G305 test set"],
                }

                key = {
                    "inchi": inchi,
                    "charge": charge,
                    "spin_multiplicity": spin,
                }
                coll.update(key, {"$set": doc}, upsert=True)
            except Exception:
                print("Error in {}".format(path))
                # Same output as print_exception(*sys.exc_info(), ...).
                traceback.print_exc(limit=2, file=sys.stdout)
        print("{} parsed!".format(path))
def insert_elements(coll):
    """Make sure ``coll`` holds one single-atom entry per element, Z = 1..18.

    For each element the collection is queried for documents whose
    ``formula`` equals ``"<symbol>1"``:

    * if none is found, a one-atom molecule at the origin is built, run
      through OpenBabel/pybel for its SMILES, canonical SMILES, InChI and
      SVG strings, and inserted;
    * if more than one is found, the first returned document is kept and
      every further one is removed by ``_id``.

    A failure while building or inserting an element is reported on stdout
    and the loop moves on to the next element.

    :param coll: Collection object providing ``find``, ``insert`` and
        ``remove`` (presumably a legacy pymongo collection whose cursors
        support ``count()``; confirm against the caller).
    """
    print("adding missing elements.")
    for z in range(1, 19):
        el = Element.from_Z(z)
        cursor = coll.find(filter={"formula": "{}1".format(el.symbol)})
        # Evaluate the count once and reuse it for both branches instead of
        # asking the cursor twice.
        n_found = cursor.count()

        if n_found == 0:
            try:
                clean_mol = Molecule([el], [[0, 0, 0]])
                xyz = XYZ(clean_mol)
                bb = BabelMolAdaptor.from_string(str(xyz), "xyz")
                pbmol = pb.Molecule(bb.openbabel_mol)
                smiles = pbmol.write("smi").split()[0]
                can = pbmol.write("can").split()[0]
                inchi = pbmol.write("inchi")
                svg = pbmol.write("svg")

                d = {"molecule": clean_mol.as_dict()}
                comp = clean_mol.composition
                d["pretty_formula"] = comp.reduced_formula
                d["formula"] = comp.formula
                d["composition"] = comp.as_dict()
                d["elements"] = list(comp.as_dict().keys())
                d["nelements"] = len(comp)
                d["charge"] = 0
                d["spin_multiplicity"] = clean_mol.spin_multiplicity
                d["smiles"] = smiles
                d["can"] = can
                d["inchi"] = inchi
                # d["names"] = get_nih_names(smiles)
                d["svg"] = svg
                d["xyz"] = str(xyz)
                d["tags"] = ["G305 test set"]
                coll.insert(d)
            except Exception:
                print("Error in {}".format(el))
        elif n_found > 1:
            print("More than 1 {} found. Removing...".format(el))
            # Keep the first returned document; a distinct loop name avoids
            # shadowing the cursor being iterated.
            for duplicate in list(cursor)[1:]:
                print(duplicate["_id"])
                coll.remove({"_id": duplicate["_id"]})
def test_from_string(self):
    """Rebuild ``self.mol`` from its XYZ text and check the formula.

    ``self.mol`` is written out as an XYZ string, read back through
    ``BabelMolAdaptor.from_string`` and the resulting pymatgen molecule is
    expected to report the formula ``"H4 C1"``.
    """
    xyz_text = str(XYZ(self.mol))
    rebuilt = BabelMolAdaptor.from_string(xyz_text, "xyz").pymatgen_mol
    self.assertEqual(rebuilt.formula, "H4 C1")
def test_from_string(self):
    """Rebuild ``self.mol`` from its XYZ text and check the formula.

    ``self.mol`` is written out as an XYZ string, read back through
    ``BabelMolAdaptor.from_string`` and the resulting pymatgen molecule is
    expected to report the formula ``"H4 C1"``.
    """
    xyz_text = str(XYZ(self.mol))
    rebuilt = BabelMolAdaptor.from_string(xyz_text, "xyz").pymatgen_mol
    self.assertEqual(rebuilt.formula, "H4 C1")
def parse_reaxys_xml(self, filename):
    """
    Parses an XML file generated by the Reaxys API.

    Only reactions with exactly one product and two reactants, or exactly
    two products and one reactant, are considered. Such a reaction is kept
    only if, after hydrogens have been added, the species found on the
    product side are the same as those on the reactant side. Reactions whose
    CTABs raise an error during that check are skipped.

    :param filename: str referring to XML file from Reaxys, resolved
        relative to ``self.base_dir``.
    :return: List of dicts with the keys "rcts" (reactant CTAB strings),
        "pros" (product CTAB strings) and "meta" (dict with "index",
        "rxn_id", "solvents", "rct_meta" and "pro_meta").
    """

    def species_in(ctabs):
        # Species label of every site across the given CTAB strings, taken
        # after hydrogens have been added to each molecule.
        species = []
        for ctab in ctabs:
            adaptor = BabelMolAdaptor.from_string(ctab, file_format="mol")
            adaptor.add_hydrogen()
            species.extend(str(site.specie) for site in adaptor.pymatgen_mol)
        return species

    results = []
    filepath = os.path.join(self.base_dir, filename)

    with open(filepath, 'r') as fileobj:
        xml = fileobj.read()

    parsed = BeautifulSoup(xml, "lxml-xml")

    for reaction in parsed.find_all("reaction"):
        pro_tags = reaction.find_all("RY.PRO")
        rct_tags = reaction.find_all("RY.RCT")

        # Keep only reactions with one product and two reactants, or two
        # products and one reactant.
        if (len(pro_tags), len(rct_tags)) not in ((1, 2), (2, 1)):
            continue

        # Generate metadata from reaction header information
        # Will be passed along with CTAB information
        index = int(reaction["index"])
        rxn_id = int(reaction.find("RX.ID").text)
        solvents = {sol.text for sol in reaction.find_all("RXD.SOL")}

        # Names are looked up by position so that a reaction with fewer
        # names than IDs raises IndexError rather than being silently
        # truncated (as zip would do).
        rct_ids = reaction.find_all("RX.RXRN")
        rct_names = reaction.find_all("RX.RCT")
        rct_meta = [(int(e.text), rct_names[i].text)
                    for i, e in enumerate(rct_ids)]

        pro_ids = reaction.find_all("RX.PXRN")
        pro_names = reaction.find_all("RX.PRO")
        pro_meta = [(int(e.text), pro_names[i].text)
                    for i, e in enumerate(pro_ids)]

        meta = {
            "index": index,
            "rxn_id": rxn_id,
            "solvents": solvents,
            "rct_meta": sorted(rct_meta, key=lambda x: x[0]),
            "pro_meta": sorted(pro_meta, key=lambda x: x[0])
        }

        # Capture CTAB information, ordered by each tag's "rn" attribute.
        # The tags fetched for the screening step are reused here.
        rcts = [tag.text
                for tag in sorted(rct_tags, key=lambda x: int(x["rn"]))]
        pros = [tag.text
                for tag in sorted(pro_tags, key=lambda x: int(x["rn"]))]

        rxn = {"rcts": rcts, "pros": pros, "meta": meta}

        try:
            balanced = (sorted(species_in(rxn["pros"])) ==
                        sorted(species_in(rxn["rcts"])))
        except Exception:
            # A CTAB that cannot be converted disqualifies the reaction.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            continue

        if balanced:
            results.append(rxn)

    return results