示例#1
0
    def test_parse_reaxys_xml(self):
        """Check reaction selection, atom balance and metadata of the parser.

        Parses ``self.filename`` with ``self.parser`` and verifies that the
        expected three reactions are returned, that reactant and product
        species match for each of them, and that the metadata of the last
        reaction is recorded as expected.
        """

        parsed = self.parser.parse_reaxys_xml(self.filename)

        # Ensure that all proper reactions are present
        self.assertEqual(len(parsed), 3)

        # assertIn reports the container contents on failure, unlike
        # assertTrue(x in y) which only reports "False is not true".
        indices = [x["meta"]["index"] for x in parsed]
        for index in [1, 9, 10]:
            self.assertIn(index, indices)

        rxn_ids = [x["meta"]["rxn_id"] for x in parsed]
        for rxn_id in [28100547, 3553459, 8633298]:
            self.assertIn(rxn_id, rxn_ids)

        def species_of(mol_blocks):
            # Collect the species string of every site over all given
            # mol-format strings, after add_hydrogen() has been applied to
            # each parsed molecule.
            species = []
            for block in mol_blocks:
                adaptor = BabelMolAdaptor.from_string(block, "mol")
                adaptor.add_hydrogen()
                species.extend(
                    str(site.specie) for site in adaptor.pymatgen_mol)
            return species

        # Ensure that all reactions are actually appropriate: both sides
        # must contain exactly the same multiset of species.
        for rxn in parsed:
            self.assertSequenceEqual(sorted(species_of(rxn["pros"])),
                                     sorted(species_of(rxn["rcts"])))

        # Test one reaction to make sure metadata is recorded correctly
        rxn = parsed[-1]["meta"]
        self.assertEqual(rxn["solvents"], {'dichloromethane'})
        self.assertEqual(len(rxn["pro_meta"]), 1)
        self.assertEqual(len(rxn["rct_meta"]), 2)
        self.assertSequenceEqual(
            rxn["pro_meta"],
            [(5424566, '(4,5-dimethylcyclohexa-1,4-dienyl)trimethylsilane')])
        self.assertSequenceEqual(rxn["rct_meta"],
                                 [(605285, '2,3-dimethyl-buta-1,3-diene'),
                                  (906752, 'trimethylsilylacetylene')])
示例#2
0
def insert_g3testset(coll):
    """
    Parse every ``g*.txt`` file in the current working directory and upsert
    one document per parsed molecule into ``coll``.

    Sites whose species symbol is not a valid element symbol are dropped
    before the molecule is rebuilt. Each document is matched on
    (inchi, charge, spin_multiplicity) and is created if no match exists.
    A failure on one molecule is reported on stdout and does not stop the
    remaining molecules from being processed.

    :param coll: Collection to write to.
        NOTE(review): assumed to be a PyMongo 3+ ``Collection`` (the call
        uses ``$set`` with ``upsert=True``) -- confirm against the caller.
    """
    for f in glob.glob("g*.txt"):
        print("Parsing " + f)
        for (m, charge, spin) in parse_file(f):
            try:
                clean_sites = [
                    site for site in m
                    if Element.is_valid_symbol(site.specie.symbol)
                ]
                clean_mol = Molecule.from_sites(clean_sites,
                                                charge=charge,
                                                spin_multiplicity=spin)
                xyz = XYZ(clean_mol)
                bb = BabelMolAdaptor.from_string(str(xyz), "xyz")
                pbmol = pb.Molecule(bb.openbabel_mol)
                # "smi"/"can" output is split on whitespace and only the
                # first token is kept.
                smiles = pbmol.write("smi").split()[0]
                can = pbmol.write("can").split()[0]
                inchi = pbmol.write("inchi")
                svg = pbmol.write("svg")
                comp = clean_mol.composition
                comp_dict = comp.as_dict()
                d = {
                    "molecule": clean_mol.as_dict(),
                    "pretty_formula": comp.reduced_formula,
                    "formula": comp.formula,
                    "composition": comp_dict,
                    "elements": list(comp_dict.keys()),
                    "nelements": len(comp),
                    "charge": charge,
                    "spin_multiplicity": spin,
                    "smiles": smiles,
                    "can": can,
                    "inchi": inchi,
                    "svg": svg,
                    "xyz": str(xyz),
                    "tags": ["G305 test set"],
                }
                # update_one replaces Collection.update, which is
                # deprecated in PyMongo 3 and removed in PyMongo 4.
                coll.update_one(
                    {
                        "inchi": inchi,
                        "charge": charge,
                        "spin_multiplicity": spin
                    }, {"$set": d},
                    upsert=True)
            except Exception:
                # Best effort: report the failure and carry on with the
                # next molecule.
                print("Error in {}".format(f))
                traceback.print_exc(limit=2, file=sys.stdout)
        print("{} parsed!".format(f))
示例#3
0
def insert_elements(coll):
    """
    Make sure ``coll`` holds exactly one single-atom document for each
    element with atomic number 1 through 18.

    For every element the collection is queried for documents whose
    ``formula`` is ``"<symbol>1"``. If none exists, a one-atom molecule at
    the origin is built and inserted. If more than one exists, every match
    after the first is deleted by ``_id``.

    :param coll: Collection to query and modify.
        NOTE(review): assumed to be a PyMongo 3+ ``Collection`` (the
        original already used ``find(filter=...)``) -- confirm.
    """
    print("adding missing elements.")
    for z in range(1, 19):
        el = Element.from_Z(z)
        # Fetch the matches once instead of calling Cursor.count() twice:
        # fewer round trips, and Cursor.count() no longer exists in
        # PyMongo 4.
        matches = list(
            coll.find(filter={"formula": "{}1".format(el.symbol)}))
        if not matches:
            try:
                clean_mol = Molecule([el], [[0, 0, 0]])
                xyz = XYZ(clean_mol)
                bb = BabelMolAdaptor.from_string(str(xyz), "xyz")
                pbmol = pb.Molecule(bb.openbabel_mol)
                smiles = pbmol.write("smi").split()[0]
                can = pbmol.write("can").split()[0]
                inchi = pbmol.write("inchi")
                svg = pbmol.write("svg")
                comp = clean_mol.composition
                comp_dict = comp.as_dict()
                d = {
                    "molecule": clean_mol.as_dict(),
                    "pretty_formula": comp.reduced_formula,
                    "formula": comp.formula,
                    "composition": comp_dict,
                    "elements": list(comp_dict.keys()),
                    "nelements": len(comp),
                    "charge": 0,
                    "spin_multiplicity": clean_mol.spin_multiplicity,
                    "smiles": smiles,
                    "can": can,
                    "inchi": inchi,
                    "svg": svg,
                    "xyz": str(xyz),
                    "tags": ["G305 test set"],
                }
                coll.insert_one(d)
            except Exception as ex:
                # Best effort: report what went wrong and move on to the
                # next element.
                print("Error in {}".format(el))
                print("{}: {}".format(type(ex).__name__, ex))
        elif len(matches) > 1:
            print("More than 1 {} found. Removing...".format(el))
            # Keep the first match, delete the rest.
            for duplicate in matches[1:]:
                print(duplicate["_id"])
                coll.delete_one({"_id": duplicate["_id"]})
示例#4
0
 def test_from_string(self):
     """Build an adaptor from the XYZ text of self.mol; check the formula."""
     xyz_text = str(XYZ(self.mol))
     rebuilt = BabelMolAdaptor.from_string(xyz_text, "xyz").pymatgen_mol
     self.assertEqual(rebuilt.formula, "H4 C1")
示例#5
0
 def test_from_string(self):
     """Feed the XYZ string of self.mol to from_string; verify the formula."""
     as_xyz = str(XYZ(self.mol))
     from_xyz = BabelMolAdaptor.from_string(as_xyz, "xyz")
     self.assertEqual(from_xyz.pymatgen_mol.formula, "H4 C1")
示例#6
0
    def parse_reaxys_xml(self, filename):
        """
        Parses an XML file generated by the Reaxys API.

        Only reactions with exactly one product and two reactants, or two
        products and one reactant, are considered. Of those, a reaction is
        returned only if the species of its products and of its reactants
        (after add_hydrogen() on each molecule) are the same multiset.
        Reactions whose CTAB data raises during that check are skipped.

        :param filename: str referring to XML file from Reaxys, resolved
        relative to self.base_dir.
        :return: List of dicts with keys "rcts" (reactant CTAB strings),
        "pros" (product CTAB strings) and "meta" (dict with "index",
        "rxn_id", "solvents", "rct_meta" and "pro_meta").
        """

        def species_of(ctabs):
            # Species string of every site over all given CTAB strings,
            # after add_hydrogen() has been applied to each molecule.
            species = []
            for ctab in ctabs:
                ad = BabelMolAdaptor.from_string(ctab, file_format="mol")
                ad.add_hydrogen()
                species.extend(str(site.specie) for site in ad.pymatgen_mol)
            return species

        results = []

        filepath = os.path.join(self.base_dir, filename)

        # The whole document is read up front, so the file handle can be
        # released before the reactions are processed.
        with open(filepath, 'r') as fileobj:
            parsed = BeautifulSoup(fileobj.read(), "lxml-xml")

        for reaction in parsed.find_all("reaction"):
            # Keep only 1-product/2-reactant or 2-product/1-reactant
            # reactions; everything else is skipped.
            pro_tags = reaction.find_all("RY.PRO")
            rct_tags = reaction.find_all("RY.RCT")
            if not ((len(pro_tags) == 1 and len(rct_tags) == 2) or
                    (len(pro_tags) == 2 and len(rct_tags) == 1)):
                continue

            # Generate metadata from reaction header information
            # Will be passed along with CTAB information
            index = int(reaction["index"])
            rxn_id = int(reaction.find("RX.ID").text)
            solvents = {sol.text for sol in reaction.find_all("RXD.SOL")}

            rct_ids = reaction.find_all("RX.RXRN")
            rct_names = reaction.find_all("RX.RCT")
            rct_meta = [(int(e.text), rct_names[i].text)
                        for i, e in enumerate(rct_ids)]

            pro_ids = reaction.find_all("RX.PXRN")
            pro_names = reaction.find_all("RX.PRO")
            pro_meta = [(int(e.text), pro_names[i].text)
                        for i, e in enumerate(pro_ids)]

            meta = {
                "index": index,
                "rxn_id": rxn_id,
                "solvents": solvents,
                "rct_meta": sorted(rct_meta, key=lambda x: x[0]),
                "pro_meta": sorted(pro_meta, key=lambda x: x[0])
            }

            # Capture CTAB information
            # Make sure that ordering is the same for metadata and CTAB
            rcts = [rct.text
                    for rct in sorted(rct_tags, key=lambda x: int(x["rn"]))]
            pros = [pro.text
                    for pro in sorted(pro_tags, key=lambda x: int(x["rn"]))]

            rxn = {"rcts": rcts, "pros": pros, "meta": meta}

            try:
                balanced = (sorted(species_of(pros)) ==
                            sorted(species_of(rcts)))
            except Exception:
                # Best effort: a reaction whose CTAB cannot be processed is
                # skipped. Catching Exception rather than using a bare
                # except lets KeyboardInterrupt/SystemExit propagate.
                continue

            if balanced:
                results.append(rxn)

        return results