def testSameCanSpiro(self):
    """Check that equivalent spiro SMILES give one canonical SMILES."""

    def canonical(smiles):
        # First whitespace-separated token of the "can" output.
        return pybel.readstring("smi", smiles).write("can").split()[0]

    expected = canonical("C1CN[C@]12CCCN2")
    variants = (
        'C1CN[C@]12CCCN2',
        'C1CN[C@@]21CCCN2',
        'C1CN[C@@]2(C1)CCN2',
    )
    for variant in variants:
        self.assertEqual(expected, canonical(variant), variant)
def testSquarePlanar(self):
    """Tighten up the parsing of SP stereochemistry in SMILES.

    Three groups are exercised in turn: strings that must parse and carry
    stereo data, strings that must be rejected with IOError, and strings
    that only produce a warning and must still carry stereo data.
    """
    accepted = [
        "C[S@SP%d](Cl)(Br)I" % shape for shape in (1, 2, 3)
    ]
    rejected = [  # raises error
        "C[S@SP0](Cl)(Br)I",
        "C[S@SP4](Cl)(Br)I",
        "C[S@@SP1](Cl)(Br)I",
        "C[S@SP11](Cl)(Br)I",
        "C[S@SO1](Cl)(Br)I",
    ]
    warned = [  # just a warning
        "C[S@SP1](Cl)(Br)(F)I",
        "C[S@SP1](Cl)(Br)(F)1CCCC1",
    ]

    def assert_has_stereo(smiles):
        parsed = pybel.readstring("smi", smiles)
        self.assertTrue(parsed.OBMol.GetData(ob.StereoData))

    for smiles in accepted:
        assert_has_stereo(smiles)
    for smiles in rejected:
        self.assertRaises(IOError, pybel.readstring, "smi", smiles)
    for smiles in warned:
        assert_has_stereo(smiles)
def canonicalize(lig, preserve_bond_order=False):
    """Get the canonical atom order for the ligand.

    The ligand is copied before it is modified, so the caller's molecule is
    left untouched. Unless ``preserve_bond_order`` is true, every bond of
    the copy is set to order 1; stereo data is always removed from the copy.

    :param lig: pybel Molecule to canonicalize.
    :param preserve_bond_order: keep the original bond orders when True.
    :returns: a list with one 1-based atom number per ligand atom, or
        ``None`` when the canonical SMILES is empty, cannot be read back,
        or no isomorphism is found.
    """
    obmol = pybel.ob.OBMol(lig.OBMol)
    if not preserve_bond_order:
        for bond in pybel.ob.OBMolBondIter(obmol):
            if bond.GetBondOrder() != 1:
                bond.SetBondOrder(1)
    obmol.DeleteData(pybel.ob.StereoData)
    lig = pybel.Molecule(obmol)

    testcan = lig.write(format='can')
    try:
        # One parse is enough; the result is the reference molecule.
        reference = pybel.readstring('can', testcan)
    except IOError:
        return None
    if testcan == '':
        return None

    reference.removeh()
    # All isomorphisms between the re-read molecule and the ligand copy.
    isomorphs = get_isomorphisms(reference, lig)
    if len(isomorphs) == 0:
        return None

    # Only the first isomorphism is used; both indices are shifted to 1-based.
    smi_dict = {}
    for pair in isomorphs[0]:
        smi_dict[int(pair[1]) + 1] = int(pair[0]) + 1
    return [smi_dict[idx + 1] for idx in range(len(lig.atoms))]
def testCan(self):
    """Canonical SMILES must survive a round trip via SMILES and via itself."""

    def recanonicalize(text):
        return pybel.readstring("smi", text).write("can").split()[0]

    can = self.mol.write("can").split()[0]
    smi = self.mol.write("smi").split()[0]
    # Same check twice: first starting from plain SMILES, then from canonical.
    for source in (smi, can):
        self.assertEqual(can, recanonicalize(source))
def testReadingMassDifferenceInMolfiles(self):
    """Previously we were rounding incorrectly when reading the mass diff"""
    # Single-atom molfile; the two placeholders are the element symbol and
    # the mass-difference field of the atom line.
    # NOTE(review): V2000 molfiles are column-sensitive; the spacing below
    # follows the standard fixed-width layout - confirm against the original.
    template = """
 OpenBabel02181811152D

  1  0  0  0  0  0  0  0  0  0999 V2000
    0.0000    0.0000    0.0000 %2s %2d  0  0  0  0  0  0  0  0  0  0  0
M  END
"""
    # Positive test cases:
    # These are the BIOVIA Draw answers for the first 50 elements for
    # a mass diff of 1
    # NOTE(review): the list actually holds 51 values (atomic numbers 1-51).
    answers = [2,5,8,10,12,13,15,17,20,21,24,25,28,29,32,33,36,41,40,41,46,49,52,53,56,57,60,60,65,66,71,74,76,80,81,85,86,89,90,92,94,97,99,102,104,107,109,113,116,120,123]
    for idx, answer in enumerate(answers):
        # The list position is the atomic number minus one.
        elem = idx + 1
        molfile = template % (ob.GetSymbol(elem), 1)
        mol = pybel.readstring("mol", molfile).OBMol
        iso = mol.GetAtom(1).GetIsotope()
        self.assertEqual(answer, iso)

    # Also test D and T - BIOVIA Draw ignores the mass diff
    for elem, answer in zip("DT", [2, 3]):
        molfile = template % (elem, 1)
        mol = pybel.readstring("mol", molfile).OBMol
        iso = mol.GetAtom(1).GetIsotope()
        self.assertEqual(answer, iso)

    # Negative test cases:
    # Test error message for out-of-range values
    for value in [5, -4]:
        molfile = template % ("C", value)
        mol = pybel.readstring("mol", molfile).OBMol
        iso = mol.GetAtom(1).GetIsotope()
        # An out-of-range mass difference is expected to leave the isotope unset.
        self.assertEqual(0, iso)
def prediction(request):
    """Form for submitting user calculations.

    A request without ``submitdocking`` in POST just renders the empty form.
    Otherwise the form is validated and the SMILES is test-parsed. On
    success a docking job is registered and the user is redirected to its
    page; on failure the form is rendered again.

    :param request: the incoming Django request.
    :returns: an HTTP response (rendered page or redirect).
    """
    allreceptors = Receptor.objects.all()

    if 'submitdocking' not in request.POST:
        return render(request, 'prediction.html',
                      {'form': SubmitDocking(), 'allreceptors': allreceptors})

    form = SubmitDocking(request.POST)
    error = []
    if not form.is_valid():
        # cleaned_data lacks the invalid fields, so reading them would raise
        # KeyError. Hand the bound form back so its errors can be displayed.
        return render(request, 'prediction.html',
                      {'form': form, 'error': error,
                       'allreceptors': allreceptors})

    name = form.cleaned_data['name']
    smiles = None
    try:
        smiles = str(form.cleaned_data['smiles'])
        pybel.readstring("smi", smiles)
    except Exception:
        # Any failure to convert or parse the input is reported to the user
        # rather than surfacing as a server error.
        error.append("Error in SMILES or compound molecular weight too big")

    if error:
        return render(request, 'prediction.html',
                      {'form': SubmitDocking(), 'error': error,
                       'allreceptors': allreceptors})

    uniquestring = ''.join(random.choice(string.ascii_lowercase)
                           for x in range(10))
    adddocking(uniquestring, smiles, name)
    return HttpResponseRedirect('/docking/%s/' % uniquestring)
def _generate_conformers(self, input_sdf, n_conf=10, method="rmsd"):
    """Conformer generation.

    Given an input sdf string, call obabel to construct a specified
    number of conformers.

    The SDF text is sent to obabel on standard input rather than being
    pasted into a shell command, so characters that are special to the
    shell cannot alter or break the command.

    :param input_sdf: SDF text of the input molecule.
    :param n_conf: number of conformers requested; 0 returns the input
        molecule unchanged.
    :param method: currently unused; scoring is always ``rmsd``.
    :returns: list of pybel molecules, one per conformer written by obabel.
    :raises subprocess.CalledProcessError: if obabel exits with an error.
    :raises ValueError: if obabel's output contains no molecule record.
    """
    import os
    import re
    import subprocess
    import pybel as pb

    if n_conf == 0:
        return [pb.readstring("sdf", input_sdf)]

    command = ["obabel", "-i", "sdf", "-o", "sdf", "--conformer",
               "--nconf", "%d" % n_conf, "--score", "rmsd",
               "--writeconformers"]
    with open(os.devnull, "w") as devnull:
        proc = subprocess.Popen(command, stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE, stderr=devnull,
                                universal_newlines=True)
        sdf, _ = proc.communicate(input_sdf + "\n")
    if proc.returncode:
        raise subprocess.CalledProcessError(proc.returncode, command)

    # Each molecule in the sdf output begins with the 'OpenBabel' string;
    # anything before the first occurrence is discarded.
    starts = [match.start() for match in re.finditer('OpenBabel', sdf)]
    if not starts:
        raise ValueError("obabel produced no conformer output")
    ends = starts[1:] + [len(sdf)]

    # The newline at the beginning is needed for obabel to
    # recognize the sdf format
    return [pb.readstring("sdf", '\n' + sdf[begin:end])
            for begin, end in zip(starts, ends)]
def testReadSmi(self):
    """Molecules re-read from SMILES or canonical SMILES canonicalize alike."""
    can = self.mol.write("can")
    smi = self.mol.write("smi")
    # Parse both inputs up front, then compare their canonical output.
    reparsed = [pybel.readstring("smi", text) for text in (smi, can)]
    for molecule in reparsed:
        self.assertEqual(can, molecule.write("can"))
def print_results(results):
    """Print a table describing the first five results.

    :param results: sequence of ``(mol, predict)`` pairs, where ``mol`` is
        an iterable of string pieces that join into a SMILES string.
    """
    table = PrettyTable(['smiles', 'predicted quality', 'logP', 'molwt'])
    for mol, predict in results[:5]:
        smiles = ''.join(mol)
        # Parse once and reuse the molecule for both descriptors.
        parsed = pybel.readstring('smi', smiles)
        table.add_row([smiles, predict,
                       parsed.calcdesc(['logP'])['logP'],
                       parsed.molwt])
    print(table)
def testOBMolSeparatePreservesAtomOrder(self): """Originally Separate() preserved DFS order rather than atom order""" # First test smi = "C123.F3.Cl2.Br1" mol = pybel.readstring("smi", smi) atomicnums = [atom.OBAtom.GetAtomicNum() for atom in mol] mols = mol.OBMol.Separate() new_atomicnums = [atom.OBAtom.GetAtomicNum() for atom in pybel.Molecule(mols[0])] for x, y in zip(atomicnums, new_atomicnums): self.assertEqual(x, y) # check that the atoms have not been permuted # Second test xyz = """6 examples/water_dimer.xyz O 0.12908 -0.26336 0.64798 H 0.89795 0.28805 0.85518 H 0.10833 -0.20468 -0.33302 O 0.31020 0.07569 -2.07524 H 0.64083 -0.57862 -2.71449 H -0.26065 0.64232 -2.62218 """ mol = pybel.readstring("xyz", xyz) mols = mol.OBMol.Separate() allatoms = pybel.Molecule(mols[0]).atoms + pybel.Molecule(mols[1]).atoms for idx, atom in enumerate(allatoms): xcoord = atom.OBAtom.GetX() orig_xcoord = mol.OBMol.GetAtom(idx+1).GetX() self.assertEqual(xcoord, orig_xcoord)
def testMOL(self):
    """Roundtrip thru MOL file"""
    smi = "C[CH3:6]"
    smiles_options = {"a": True, "n": True, "nonewline": True}

    mol = pybel.readstring("smi", smi)
    molfile = mol.write("mol", opt={"a": True})
    # NOTE(review): the re-read molecule is never inspected; the assertion
    # below is made on `mol`, so the MOL round trip itself is not checked.
    # Confirm whether `molb` was meant to be written out instead.
    molb = pybel.readstring("mol", molfile)
    out = mol.write("smi", opt=smiles_options)
    self.assertEqual(smi, out)
def testAtom4Refs(self):
    """Canonical output of every test molecule must be reproducible."""
    for molecule in self.mols:
        can = molecule.write("can")
        smi = molecule.write("smi")
        # Re-read from plain SMILES first, then from the canonical SMILES.
        for source in (smi, can):
            regenerated = pybel.readstring("smi", source).write("can")
            self.assertEqual(can, regenerated)
def testSmilesParsingAndWritingOfLargeIsotopes(self):
    """Isotopes of one to four digits round-trip; five digits are rejected."""
    # "[1C]", "[11C]", "[111C]", "[1111C]"
    for ndigits in range(1, 5):
        smi = "[" + "1" * ndigits + "C]"
        written = pybel.readstring("smi", smi).write("smi")
        self.assertEqual(written.rstrip(), smi)

    self.assertRaises(IOError, pybel.readstring, "smi", "[11111C]")

    # An isotope of 65535 set through the API is written without a label.
    carbon = pybel.readstring("smi", "[C]")
    carbon.atoms[0].OBAtom.SetIsotope(65535)
    self.assertEqual(carbon.write("smi").rstrip(), "[C]")
def testSettingSpinMult(self):
    """Set spin and read/write it"""
    mol = pybel.readstring("smi", "C")
    mol.atoms[0].OBAtom.SetSpinMultiplicity(2)

    molfile = mol.write("mol")
    molfile_lines = molfile.split("\n")
    # NOTE(review): MOL property lines are fixed-column; confirm that the
    # spacing of this literal matches what the writer emits.
    self.assertEqual("M RAD 1 1 2", molfile_lines[5])

    reread_atom = pybel.readstring("mol", molfile).atoms[0].OBAtom
    self.assertEqual(2, reread_atom.GetSpinMultiplicity())
    self.assertEqual(4, reread_atom.GetImplicitHCount())
def testRGroup(self):
    """[*:1] is converted to R1 in MOL file handling"""
    smi = "[*:6]C"
    smiles_options = {"a": True, "n": True, "nonewline": True}

    mol = pybel.readstring("smi", smi)
    molfile = mol.write("mol")
    # NOTE(review): MOL property lines are fixed-column; confirm that the
    # spacing of this literal matches what the writer emits.
    self.assertTrue("M RGP 1 1 6" in molfile)

    # NOTE(review): `molb` is parsed but the SMILES below is written from
    # `mol`, so the read-back is not asserted. Confirm the intent.
    molb = pybel.readstring("mol", molfile)
    out = mol.write("smi", opt=smiles_options)
    self.assertEqual(smi, out)
def testInChIIsotopes(self):
    """Ensure that we correctly set and read isotopes in InChIs"""
    datafile = os.path.join(here, "inchi", "inchi_isotopes.txt")
    with open(datafile) as handle:
        # Lines starting with '#' are comments; the rest are "SMILES<TAB>InChI".
        records = (row for row in handle if not row.startswith("#"))
        for row in records:
            smi, inchi = row.rstrip().split("\t")
            # SMILES -> InChI must give the reference InChI ...
            generated = pybel.readstring("smi", smi).write("inchi").rstrip()
            self.assertEqual(generated, inchi)
            # ... and InChI -> SMILES must give back the reference SMILES.
            regenerated = pybel.readstring("inchi", generated).write("smi").rstrip()
            self.assertEqual(regenerated, smi)
def testAtomMapsAfterDeletion(self):
    """Removing atoms/hydrogens should not mess up the atom maps"""
    write_classes = {"a": True}

    # Delete the first atom; the nitrogen must keep its map number.
    for smi in ("C[NH2:2]", "[CH3:1][NH2:2]"):
        mol = pybel.readstring("smi", smi)
        first_atom = mol.OBMol.GetAtom(1)
        mol.OBMol.DeleteAtom(first_atom)
        remaining = mol.write("smi", opt=write_classes).rstrip()
        self.assertEqual(remaining, "[NH2:2]")

    # Removing the explicit hydrogen must keep the map number as well.
    mol = pybel.readstring("smi", "[H]C[NH:2]")
    mol.removeh()
    stripped = mol.write("smi", opt=write_classes).rstrip()
    self.assertEqual(stripped, "C[NH:2]")
def main():
    """Count ring-system fragments in a structure file.

    Reads every molecule of the file named on the command line, using the
    file extension as the input format. Canonical ring and ring-system
    fragments are collected, and each distinct fragment is written to
    ``fragments.sdf`` with a ``COUNT`` data field, most frequent first.
    """
    import os

    if len(sys.argv) < 2:
        print("No input file provided: Murcko.py filetosprocess.ext")
        print("The script will determine which file type to read from by the extension.")
        print("It is recommended you run your structures through,\nfor example, ChemAxon's Standardizer first.")
        sys.exit(1)

    infile = sys.argv[1]
    # splitext looks only at the final extension, so dots elsewhere in the
    # path (e.g. "./mols.sdf" or "set.v2.sdf") do not confuse the format.
    informat = os.path.splitext(infile)[1].lstrip('.')

    Fragments = dict()
    for molnum, mol in enumerate(pybel.readfile(informat, infile), 1):
        if not (molnum % 10):
            print("Molecules processed: %d" % molnum)
        mol.OBMol.DeleteHydrogens()
        smiles = mol.write("smi").split("\t")[0]

        # Work on a molecule re-read from the hydrogen-free SMILES.
        canmol = pybel.readstring("smi", smiles)
        FusedRingsMatrix = GetFusedRingsMatrix(canmol)
        FusedRings = GetFusedRings(FusedRingsMatrix, len(canmol.sssr))
        RingSystems = GetAtomsInRingSystems(canmol, FusedRings, inclexo=True)

        # Get all rings/ring systems; non-ring atoms are removed inside
        # GetCanonicalFragments().
        for frag in GetCanonicalFragments(smiles, RingSystems):
            Fragments[frag] = Fragments.get(frag, 0) + 1

    # Write results to file
    print("Writing results to file.")
    out = pybel.Outputfile("sdf", "fragments.sdf", overwrite=True)
    try:
        ranked = sorted(Fragments.items(), key=itemgetter(1), reverse=True)
        for fragment, count in ranked:
            mol = pybel.readstring("smi", fragment)
            mol.data["COUNT"] = count
            mol.OBMol.DeleteHydrogens()
            out.write(mol)
    finally:
        # Close the output file even if a fragment fails to parse.
        out.close()
def testCML(self):
    """OB stores atom classes using _NN at the end of atom ids"""
    smiles_options = {"a": True, "n": True, "nonewline": True}
    cases = (
        "[CH3:6]C",
        "[CH3:6][OH:6]",
        "O" + "[CH2:2]" * 27 + "O",
    )
    for smi in cases:
        mol = pybel.readstring("smi", smi)
        cml = mol.write("cml")
        # NOTE(review): the CML text is re-read with the "mol" format, and
        # `molb` is never used - the assertion below is on `mol`. Confirm
        # whether a real CML round trip was intended.
        molb = pybel.readstring("mol", cml)
        out = mol.write("smi", opt=smiles_options)
        self.assertEqual(smi, out)
def testSmilesAtomOrder(self):
    """Ensure that SMILES atom order is written correctly"""
    key = "SMILES Atom Order"
    expectations = (
        ("CC", "1 2"),
        ("O=CCl", "3 2 1"),
    )
    # With the "O" option the writer records the order in the molecule data.
    for smi, expected_order in expectations:
        mol = pybel.readstring("smi", smi)
        mol.write("can", opt={"O": True})
        self.assertEqual(mol.data[key], expected_order)

    # Without the option no such entry may appear.
    plain = pybel.readstring("smi", "CC")
    plain.write("can")
    self.assertFalse(key in plain.data)
def testFuzzingTestCases(self):
    """Ensure that fuzzing testcases do not cause crashes"""
    # rejected as invalid smiles
    rejected = (
        r"\0",
        "&0",
        "=&",
        "[H][S][S][S@S00]0[S][S@S00H](0[S@S00][S])0n",
    )
    for smi in rejected:
        self.assertRaises(IOError, pybel.readstring, "smi", smi)

    # warning and stereo ignored
    tolerated = ("c0C[C@H](B)00O0",)
    for smi in tolerated:
        pybel.readstring("smi", smi)
def testSmilesToMol(self):
    """Check the valence field written to MOL files and the way back."""
    # SMILES paired with the valence expected on the first atom line.
    cases = (
        ("C", 0),
        ("[CH3]", 3),
        ("[CH2]", 2),
        ("[CH2]C", 3),
        ("[C]", 15),
    )
    for smi, valence in cases:
        molfile = pybel.readstring("smi", smi).write("mol")
        # The fifth line of the molfile is the first atom line; the
        # valence is read from its characters 48-52.
        first_atom_line = molfile.split("\n")[4]
        self.assertEqual(valence, int(first_atom_line[48:53]))
        # test molfile->smiles
        roundtrip = pybel.readstring("mol", molfile).write("smi").rstrip()
        self.assertEqual(smi, roundtrip)
def testImplicitCisDblBond(self):
    """Ensure that dbl bonds in rings of size 8 or less are always
    implicitly cis"""

    def roundtrip(open_chain):
        # Close the ring with "1" and send it through the SMILES writer.
        return pybel.readstring("smi", open_chain + "1").write("smi")

    chain = "C1/C=C/C"
    for _ in range(5):  # ring sizes 4 to 8
        self.assertTrue("/" not in roundtrip(chain))
        chain += "C"

    # The chain now closes to a 9-membered ring, which keeps its bond symbols.
    self.assertTrue("/" in roundtrip(chain))
def testSmiToSmi(self):
    """Stereo is kept by default and dropped with the "S" read option."""
    tet = "[C@@H](Br)(Br)Br"
    cistrans = r"C/C=C(\C)/C"
    # Each SMILES is paired with the stereo symbol it should show.
    cases = ((tet, "@"), (cistrans, "/"))

    # Should preserve stereo
    for smi, symbol in cases:
        out = pybel.readstring("smi", smi).write("smi")
        self.assertTrue(symbol in out)

    # Should wipe stereo
    for smi, symbol in cases:
        out = pybel.readstring("smi", smi, opt={"S": True}).write("smi")
        self.assertFalse(symbol in out)
def __init__(self, smilesFrag):
    """Build a fragment from a fragment SMILES string.

    The string is first interpreted with every Break marker replaced by
    Asterisk. If the resulting molecule does not match its own SMARTS, or
    the count of Fragment._nh matches differs from the number of
    Fragment._nhString occurrences in the SMARTS string, the molecule is
    rebuilt with Break replaced by WildCard instead.
    """
    # First attempt: Break -> Asterisk.
    self.smiles = smilesFrag.replace(Break, Asterisk)
    self._molSmiles = self._removedAtom(self.smiles, Asterisk)
    self.mol = readstring('smi', self._molSmiles)
    # Atom count minus WildCard occurrences and 'H' characters in the SMILES
    # text (a textual count, not an atom-level one).
    self.atoms = len(self.mol.atoms) - self._molSmiles.count(WildCard) - self._molSmiles.count('H')
    self.smartsString = self._removedAtom(self._molSmiles, WildCard)
    self._smarts = Smarts(self.smartsString)
    # NOTE(review): only the three assignments below are taken to belong to
    # this fallback branch - confirm against the original indentation.
    if not self.match(self.mol) or len(Fragment._nh.findall(self.mol)) != self.smartsString.count(Fragment._nhString):
        # Fallback: Break -> WildCard. atoms, smartsString and _smarts keep
        # the values computed from the first attempt.
        self.smiles = smilesFrag.replace(Break, WildCard)
        self._molSmiles = self._removedAtom(self.smiles, Asterisk)
        self.mol = readstring('smi', self._molSmiles)
    self.cansmiles = Fragment._converter.getSmiles(self.mol)
    # Remaining attributes start empty here.
    self._fingerprint = None
    self.target = None
    self._childs = set()
def calculate(ID, smiles):
    """Compute fingerprint feature vectors and append them to a CSV file.

    For every SMILES string the fp2, fp3, fp4 and maccs fingerprints are
    computed and laid out one after another in a single 0/1 vector.
    One line per molecule is appended to ``openbabel_rdkit_test.csv``:
    the ID, the SMILES, the feature values and a trailing 0.

    :param ID: sequence of identifiers, parallel to ``smiles``.
    :param smiles: sequence of SMILES strings.
    """
    # (fingerprint type, number of columns reserved for it)
    layout = (('fp2', 1024), ('fp3', 210), ('fp4', 301), ('maccs', 166))
    total_width = sum(width for _, width in layout)

    print("Calculating Features ...")
    mols = [pybel.readstring("smi", smile) for smile in smiles]
    fingerprints = [[mol.calcfp(fptype=fptype) for fptype, _ in layout]
                    for mol in mols]

    print("Storing Features")
    features = []
    for mol_fps in fingerprints:
        feature = np.zeros(total_width)
        offset = 0
        for (_, width), fp in zip(layout, mol_fps):
            for bit in fp.bits:
                # pybel reports set bits starting from 1, so shift to a
                # 0-based column inside this fingerprint's own segment.
                feature[offset + bit - 1] = 1
            # Each segment starts where the previous one ends; the maccs
            # segment therefore no longer overlaps the fp4 segment.
            offset += width
        features.append(feature)

    print("Saving into file...")
    with open('openbabel_rdkit_test.csv', 'a') as f:
        for i, smile in enumerate(smiles):
            values = ','.join([str(v) for v in features[i]])
            f.write("%s,%s,%s,%s\n" % (ID[i], smile, values, 0))
def fromCML(self, cmlstr):
    """
    Convert a string of CML `cmlstr` to a Structure object.

    Tab characters are stripped from the text before it is parsed.
    """
    without_tabs = cmlstr.replace('\t', '')
    parsed = pybel.readstring('cml', without_tabs)
    self.fromOBMol(parsed.OBMol)
def pocketSection(self):
    """Build the pocket (PKT) section for the ligand's surroundings.

    Every line of the cleaned PDB text whose atom lies closer than
    ``self.threshold`` to any ligand atom is kept. The ligand is read from
    ``self.lig_path`` when that is an existing file path (format taken from
    the file suffix, first molecule only), or used directly when it already
    is a ``pybel.Molecule``.

    :returns: the section text - a ``PKT`` header line, the selected PDB
        lines and a closing ``TER`` line.
    :raises Exception: if ``self.lig_path`` is neither an existing path nor
        a ``pybel.Molecule``.
    """
    cleaned = self.__cleanedPdb()
    prt = pybel.readstring("pdb", cleaned)

    if isinstance(self.lig_path, str) and os.path.exists(self.lig_path):
        suffix = self.lig_path.split('.')[-1]
        # next() works on both Python 2 and 3, unlike the .next() method.
        lig = next(pybel.readfile(suffix, self.lig_path))
    elif isinstance(self.lig_path, pybel.Molecule):
        lig = self.lig_path
    else:
        raise Exception("Wrong input for ligand")

    # Collect the ligand coordinates once instead of once per protein atom.
    lig_coords = [a.coords for a in lig.atoms]

    pkt_lines = []
    residues = set()
    # Lines and atoms are paired positionally; the last split element is
    # dropped before pairing.
    for line, atom in zip(cleaned.split("\n")[:-1], prt.atoms):
        coords = atom.coords
        # Stop at the first ligand atom that is close enough.
        if any(euclidean(coords, lc) < self.threshold for lc in lig_coords):
            pkt_lines.append(line)
            # Residue number is read from characters 22-25 of the line.
            residues.add(int(line[22:26]))

    if self.title == "":
        label = lig.title.split('/')[-1]
    else:
        label = self.title
    start_pkt_line = "\nPKT %d 1000 %s\n" % (len(residues), label)
    return start_pkt_line + "\n".join(pkt_lines) + "\nTER\n"
def convert(data, in_format, out_format, pretty=True, add_h=False):
    """Converts between two inputted chemical formats.

    :param data: the chemical data; a string in ``in_format``, or for
        ``"json"`` either a JSON string or already-loaded JSON data.
    :param in_format: input format name; ``"json"`` is the custom format.
    :param out_format: output format name; ``"json"`` is the custom format.
    :param pretty: use ``json.dumps`` when True, ``json.compress`` otherwise.
    :param add_h: add hydrogens to the molecule before writing.
    :returns: the converted data as a string.
    """
    # Decide on a json formatter depending on desired prettiness
    dumps = json.dumps if pretty else json.compress

    # Not doing this can cause segfaults in the underlying openbabel C++.
    # The encoded values must be kept: encode() returns a new object and
    # leaves the original untouched.
    if not IS_PY3:
        in_format = in_format.encode("ascii")
        out_format = out_format.encode("ascii")
        # data may already be parsed JSON, which has no encode().
        if isinstance(data, basestring):
            data = data.encode("ascii", "replace")

    # If it's a json string, load it. NOTE: This is a custom chemical format
    if in_format == "json" and isinstance(data, str if IS_PY3 else basestring):
        data = json.loads(data)

    # These use the open babel library to interconvert, with additions for json
    if in_format == "json":
        mol = json_to_pybel(data)
    else:
        mol = pybel.readstring(in_format, data)

    # Infer structure in cases where the input format has no specification
    # or the specified structure is small
    if not mol.OBMol.HasNonZeroCoords() or len(mol.atoms) < 50:
        mol.make3D(steps=500)
    mol.OBMol.Center()

    if add_h:
        mol.addh()

    if out_format == "json":
        return dumps(pybel_to_json(mol))
    return mol.write(out_format)
def write_input_file(par, name, file_name):
    """Write the input file for one job.

    The settings are taken from ``par``. The multiplicity and atom count
    come from the job's SMILES after hydrogens are added; the charge is
    always written as 0.

    :param par: parameter object providing the settings, ``jobs`` and
        ``structures``.
    :param name: job name; key into ``par.jobs`` and ``par.structures``.
    :param file_name: path of the file to write.
    """
    # The with-block guarantees the file is flushed and closed, even if the
    # SMILES cannot be parsed halfway through.
    with open(file_name, 'w+') as f:
        f.write('title = \'%s\'\n\n' % name)
        f.write('method = \'%s\'\n' % par.method)
        f.write('basis = \'%s\'\n' % par.basis)
        f.write('qc = \'%s\'\n' % par.qc)
        f.write('conformer_search = %i\n' % par.conformer_search)
        f.write('reaction_search = %i\n' % par.reaction_search)
        f.write('barrier_threshold = %.1f\n' % par.barrier_threshold)
        families = ','.join(["'%s'" % fi for fi in par.jobs[name][1]])
        f.write('families = [%s]\n' % families)
        f.write('ga = %i\n' % par.ga)
        f.write('ngen = %i\n' % par.ngen)
        f.write('ppn = %i\n\n' % par.ppn)

        smi = par.jobs[name][0]
        obmol = pybel.readstring('smi', smi)
        obmol.OBMol.AddHydrogens()

        charge = 0
        f.write('charge = %i\n' % charge)
        f.write('mult = %i\n' % obmol.spin)
        f.write('natom = %i\n' % len(obmol.atoms))
        f.write('smiles = \'%s\'\n' % smi)

        # A starting structure is optional.
        if name in par.structures:
            f.write('structure = %s\n\n' % par.structures[name])
(i, j, k, l)) dbfile = [] fin = open( inpath + '/Neutral' + '/%s/%s/%s/%s.st' % (i, j, k, l), 'r') for s in fin: ss = s.replace('\n', '').split('\t') dbfile.append(ss) fin.close() totalcount += len(dbfile) print('Reading complete') for s in dbfile: smiles = s[3] idx = s[4] fullidx = 'Neutral_%s_%s_%s_%s_%s' % (i, j, k, l, idx) mol = pybel.readstring('smi', smiles) mol.addh() model = pt.PybelModel_To_Fragmenter(mol) subcount += 1 if subcount > 10000: subcount = 0 fout.close() subindex += 1 fout = open( outpath + '/fraginput_%s.txt' % (subindex), 'w') #fout=open(outpath+'/%s/fraginput_%s.txt'%(batch,subindex),'w'); fout.write('%s\n' % fullidx) print(fullidx) fout.write( '%s,%s,%s,%s,%s' %
def select_molecules(sdf_dir, out_dir, begin=0, end=1000_000):
    '''
    Filter molecules in sdf.gz files by charge, n_heavy, element, components
    Then write the selected molecules in sdf files

    A molecule is kept when all PubChem fields read below are present and
    parse, its total charge is 0, its formula has at most 19 heavy atoms,
    it contains C plus at least one of H/F/Cl/Br and nothing outside
    C/H/O/N/F/Cl/Br, its SMILES has a single component, and the SMILES
    reproduces the recorded formula and charge.

    :param sdf_dir: directory holding the ``.sdf.gz`` input files
    :param out_dir: directory receiving one ``CHONFClBr-NNNN.sdf`` per input
    :param begin: index of the first input file to process
    :param end: index one past the last input file to process
    :return: None
    '''
    halogens_or_h = {'H', 'F', 'Cl', 'Br'}
    allowed = {'C', 'H', 'O', 'N', 'F', 'Cl', 'Br'}
    # Fields that are not used below but must exist for a record to be kept.
    required_only = ('PUBCHEM_IUPAC_NAME', 'PUBCHEM_IUPAC_INCHI',
                     'PUBCHEM_CACTVS_SUBSKEYS')

    sdf_list = [x for x in os.listdir(sdf_dir) if x.endswith('.sdf.gz')]
    print(len(sdf_list))

    for i, sdf in enumerate(sdf_list[begin:end]):
        sys.stdout.write('\r\t%i / %i' % (i + begin, len(sdf_list)))
        sys.stdout.flush()

        # NOTE(review): the output number is relative to `begin`, so runs
        # with different `begin` values reuse the same file names - confirm.
        sdf_out = pybel.Outputfile(
            'sdf', os.path.join(out_dir, 'CHONFClBr-%04i.sdf' % (i + 1)))
        try:
            for m in pybel.readfile('sdf', os.path.join(sdf_dir, sdf),
                                    opt={'P': None}):
                try:
                    cid = int(m.data['PUBCHEM_COMPOUND_CID'])
                    formula = m.data['PUBCHEM_MOLECULAR_FORMULA']
                    smiles = m.data['PUBCHEM_OPENEYE_ISO_SMILES']
                    charge = int(m.data['PUBCHEM_TOTAL_CHARGE'])
                    for key in required_only:
                        m.data[key]
                    float(m.data['PUBCHEM_MOLECULAR_WEIGHT'])
                    int(m.data['PUBCHEM_HEAVY_ATOM_COUNT'])
                except (KeyError, ValueError):
                    # Missing or malformed field: skip this record.
                    continue

                # Ignore ion
                if charge != 0:
                    continue

                f = Formula(formula)
                # Ignore large molecule
                if f.n_heavy > 19:
                    continue

                # Limit element
                atom_set = set(f.atomdict.keys())
                if ('C' not in atom_set
                        or not (atom_set & halogens_or_h)
                        or not atom_set <= allowed):
                    continue

                # Kick out mixture
                if smiles.find('.') > -1:
                    continue

                try:
                    mol = pybel.readstring('smi', smiles)
                except IOError:
                    # One unreadable SMILES should not abort the whole run.
                    print('SMILES parse Error:', cid)
                    continue
                if mol.formula != formula or mol.charge != charge:
                    print('SMILES formula Error:', cid)
                    continue

                sdf_out.write(m)
        finally:
            # Close the output file even if reading fails part-way.
            sdf_out.close()
def testReadingBenzyne(self):
    """Check that benzyne is read correctly"""
    benzyne = pybel.readstring("smi", "c1cccc#c1")
    written = benzyne.write("smi").rstrip()
    self.assertEqual("C1=CC=CC#C1", written)
def convert_inchi_to_formula(inchi_string):
    """
    Return the molecular formula for an InChI string.

    Depends on/uses openbabel.
    """
    # Cast to str first: some callers hand over unicode (for example
    # Django model fields).
    molecule = pybel.readstring('inchi', str(inchi_string))
    return molecule.formula
# reading input file with the desription of the desired new fragment fname = args.infile print "Reading description of the new fragment from the file ", fname fp = open(fname, "r") fragname = fp.readline().strip() corestring = fp.readline().strip() fragstring = [] s = fp.readline() while s: if (len(s.strip()) > 0): fragstring.append(s.strip()) s = fp.readline() fp.close() core = pybel.readstring("smi", corestring) coreSMARTS = pybel.Smarts(corestring) # checking for self-consistency for f in fragstring: fmol = pybel.readstring("smi", f) res = coreSMARTS.findall(fmol) if (len(res) == 0): sys.exit("ERROR: cannot find core " + corestring + " in the structure " + f) for i in xrange(1, fmol.OBMol.NumAtoms() + 1): atm = fmol.OBMol.GetAtom(i) if (i not in res[0]) and (atm.IsHydrogen() == False): bonded = 0 for at in openbabel.OBAtomAtomIter(fmol.OBMol.GetAtom(i)): if at.GetIndex() + 1 in res[0]:
def test_make3d(self):
    """make3d() must leave the wrapped molecule with 3D coordinates."""
    obmol = pb.readstring("smi", "CCCC").OBMol
    wrapped = BabelMolAdaptor(obmol)
    wrapped.make3d()
    # The adaptor works on the same OBMol, so it can be inspected directly.
    self.assertEqual(obmol.GetDimension(), 3)
Ac_CollisionEnergyRecord, MS_FocusedIon, Ac_IonType in cursor: print(EntryID); spectrum=MSSpectrum(); if Ac_MassSpecIonMode=='P': spectrum.parameters['mode']=1; else: spectrum.parameters['mode']=-1; if Ac_MassSpecType=='MS' or Ac_MassSpecType=='MS1': spectrum.parameters['level']=1; elif Ac_MassSpecType=='MS2': spectrum.parameters['level']=2; elif Ac_MassSpecType=='MS3': spectrum.parameters['level']=3; elif Ac_MassSpecType=='MS4': spectrum.parameters['level']=4; mol=pybel.readstring('smi',str(Ch_SMILES)); mol.addh(); charge=mol.charge; Ch_ExactMass=mol.exactmass; spectrum.parameters['dbsource']=DB_Source; spectrum.parameters['formula']=Ch_Formula; spectrum.parameters['exactmass']=Ch_ExactMass; spectrum.parameters['charge']=charge; spectrum.parameters['smiles']=Ch_SMILES; spectrum.parameters['inchi']=Ch_InChi[3:]; sinchi=Ch_InChi[3:].split('/'); shortinchi=sinchi[0]; for j in range(1,len(sinchi)): if sinchi[j][0]=='c' or sinchi[j][0]=='h': shortinchi+='/'+sinchi[j];
def extractdata(folder):
    """Collect results from the gzipped log files in ``folder``.

    The SMILES list is read from ``<folder>/<basename of folder>.txt``; the
    number in each ``*.gz`` file name selects its SMILES. For every log that
    contains excitation energies, one tab-separated row is written to
    ``<folder>/zindo.txt`` and, except for one excluded SMILES, the geometry
    is written to ``<folder>/<number>.xyz``.

    :param folder: directory with the SMILES list and the ``*.gz`` logs.
    """
    listing = os.path.join(folder, os.path.basename(folder) + ".txt")
    with open(listing, "r") as handle:
        smiles = [x.rstrip() for x in handle.readlines()]
    print(smiles)

    # The file number is the part of the file name before the first dot,
    # wherever the folder sits in the directory tree.
    getnum = lambda x: int(os.path.basename(x).split(".")[0])
    convert = 1.0 / utils.convertor(1, "eV", "cm-1")
    excluded = "c(s1)c(SN=N2)c2c1c(s1)c(SN=N2)c2c1c(s1)c(SN=N2)c2c1c(s1)c(SN=N2)c2c1c(s1)c(SN=N2)c2c1c(s1)c(SN=N2)c2c1"

    with open(os.path.join(folder, "zindo.txt"), "w") as archivefile:
        # NOTE(review): the "H**O (eV)" column title looks mangled
        # (presumably "HOMO (eV)") - confirm before changing the output.
        archivefile.write("\t".join([
            "File ID", "SMILES", "H**O (eV)", "LUMO (eV)", "Trans (eV)",
            "Osc", "..."
        ]) + "\n")

        logs = sorted(glob.glob(os.path.join(folder, "*.gz")), key=getnum)
        for filename in logs:
            number = getnum(filename)
            smile = smiles[number]
            with gzip.open(filename, "r") as gz:
                text = gz.read()
            if text.find("Excitation energies and oscillator strength") < 0:
                continue

            # Skip everything up to and including the "Initial command"
            # line; only the remainder is parsed.
            lines = iter(text.split("\n"))
            for line in lines:
                if line.startswith(" Initial command"):
                    break
            text = StringIO.StringIO("\n".join(list(lines)))

            logfile = ccopen(text)
            data = logfile.parse()
            assert (len(data.homos) == 1)

            homo = data.homos[0]
            homo_energy = data.moenergies[0][homo]
            lumo_energy = data.moenergies[0][homo + 1]
            archivefile.write("%d\t%s\t%f\t%f" % (number, smile,
                                                  homo_energy, lumo_energy))
            for energy, osc in zip(data.etenergies, data.etoscs):
                archivefile.write("\t%f\t%f" % (energy * convert, osc))
            archivefile.write("\n")

            if smile != excluded:
                mol = pybel.readstring('g09', text.getvalue())
                mol.write("xyz", os.path.join(folder, "%d.xyz" % number),
                          overwrite=True)

    print("%s: Created zindo.txt, plus various xyz files." % folder)
def draw_molecule(self, context, center=(0, 0, 0), show_bonds=True,
                  join=True):
    """Draw the molecule given by the scene's SMILES text.

    The SMILES is read from ``context.scene.molecule.smile_format`` and
    given 3D coordinates. One sphere is created per atom and, when
    ``show_bonds`` is true, one cylinder per bond order unit.

    :param context: Blender context supplying the SMILES text.
    :param center: offset added to every atom position.
    :param show_bonds: draw bonds; when false, atom spheres are enlarged.
    :param join: join all created shapes into a single object.
    :returns: ``{'FINISHED'}``.
    """
    smile_text = context.scene.molecule.smile_format
    molecule = pybel.readstring("smi", smile_text)
    molecule.make3D()

    shapes = []
    # Spheres are 2.5 times larger when no bonds are drawn.
    scale = 1 if show_bonds else 2.5

    # Template sphere; every atom gets a copy of it.
    bpy.ops.mesh.primitive_uv_sphere_add()
    sphere = bpy.context.object

    # Initialize bond material if it's going to be used.
    if show_bonds:
        bpy.data.materials.new(name='bond')
        bpy.data.materials['bond'].diffuse_color = atom_data['bond'][
            'color']
        bpy.data.materials['bond'].specular_intensity = 0.2
        bpy.ops.mesh.primitive_cylinder_add()
        cylinder = bpy.context.object
        cylinder.active_material = bpy.data.materials['bond']

    for atom in molecule.atoms:
        element = atom.type
        if element not in atom_data:
            element = 'undefined'

        # Create the element's material on first use.
        if element not in bpy.data.materials:
            bpy.data.materials.new(name=element)
            bpy.data.materials[element].diffuse_color = \
                atom_data[element]['color']
            bpy.data.materials[element].specular_intensity = 0.2

        atom_sphere = sphere.copy()
        atom_sphere.data = sphere.data.copy()
        atom_sphere.location = [l + c for l, c in zip(atom.coords, center)]
        atom_sphere.dimensions = [
            atom_data[element]['radius'] * scale * 2
        ] * 3
        atom_sphere.active_material = bpy.data.materials[element]
        bpy.context.scene.collection.objects.link(atom_sphere)
        shapes.append(atom_sphere)

    # Sideways offsets, in bond radii, of the cylinders for each bond order.
    offsets_by_order = {1: (0,), 2: (1.4, -1.4), 3: (0, 2.2, -2.2)}
    bond_radius = atom_data['bond']['radius']

    for bond in (openbabel.OBMolBondIter(molecule.OBMol)
                 if show_bonds else []):
        start = molecule.atoms[bond.GetBeginAtom().GetIndex()].coords
        end = molecule.atoms[bond.GetEndAtom().GetIndex()].coords
        diff = [c2 - c1 for c2, c1 in zip(start, end)]
        cent = [(c2 + c1) / 2 for c2, c1 in zip(start, end)]
        mag = sum([(c2 - c1)**2 for c1, c2 in zip(start, end)])**0.5

        v_axis = Vector(diff).normalized()
        v_obj = Vector((0, 0, 1))
        v_rot = v_obj.cross(v_axis)

        # This check prevents gimbal lock (ie. weird behavior when v_axis is
        # close to (0, 0, 1))
        if v_rot.length > 0.01:
            v_rot = v_rot.normalized()
            axis_angle = [acos(v_obj.dot(v_axis))] + list(v_rot)
        else:
            v_rot = Vector((1, 0, 0))
            axis_angle = [0] * 4

        order = bond.GetBondOrder()
        if order not in range(1, 4):
            sys.stderr.write(
                "Improper number of bonds! Defaulting to 1.\n")
            # Reset the local value; assigning to bond.GetBondOrder would
            # only overwrite the method and leave `order` unchanged.
            order = 1

        trans = [[factor * bond_radius * x for x in v_rot]
                 for factor in offsets_by_order[order]]

        for offset in trans:
            bond_cylinder = cylinder.copy()
            bond_cylinder.data = cylinder.data.copy()
            bond_cylinder.dimensions = [bond_radius * scale * 2] * 2 + [mag]
            bond_cylinder.location = [
                c + scale * v for c, v in zip(cent, offset)
            ]
            bond_cylinder.rotation_mode = 'AXIS_ANGLE'
            bond_cylinder.rotation_axis_angle = axis_angle
            bpy.context.scene.collection.objects.link(bond_cylinder)
            shapes.append(bond_cylinder)

    # Remove the template objects.
    sphere.select_set(True)
    if show_bonds:
        cylinder.select_set(True)
    bpy.ops.object.delete()

    # Select everything that was created and smooth it.
    for shape in shapes:
        shape.select_set(True)
    bpy.context.view_layer.objects.active = shapes[0]
    bpy.ops.object.shade_smooth()

    if join:
        bpy.ops.object.join()
        bpy.ops.object.origin_set(type='ORIGIN_GEOMETRY', center='MEDIAN')

    # NOTE(review): scene.update() and scene.cursor_location belong to an
    # older Blender API than select_set()/view_layer used above - confirm
    # the targeted Blender version.
    bpy.context.scene.update()
    obj = bpy.context.selected_objects
    obj[0].name = smile_text
    obj[0].location = bpy.context.scene.cursor_location
    return {'FINISHED'}
source = "MetaCyc" if (re.search("^[CR]\d{5}$", external_id)): source = "KEGG" for struct_stage in sorted( Structures_Dict[struct_type][external_id].keys()): file_string = "_".join((source, struct_type, struct_stage)) for structure in sorted(Structures_Dict[struct_type][external_id] [struct_stage].keys()): mol = None mol_source = "" try: if (struct_type == 'InChI'): mol = AllChem.MolFromInchi(structure) if (mol is None or external_id == 'FAD'): mol = pybel.readstring("inchi", structure) if (mol): mol_source = "OpenBabel" else: mol_source = "RDKit" elif (struct_type == 'SMILE'): mol = AllChem.MolFromSmiles(structure) if (mol == None): mol = pybel.readstring("smiles", structure) if (mol): mol_source = "OpenBabel" else: mol_source = "RDKit" except Exception as e: pass
def _save_failed_docking(uniquestring, smiles, molname, status):
    """Store a Docking record that only carries a failure status."""
    dock = Docking(uniquestring=uniquestring, smiles=smiles,
                   molname=molname, status=status)
    dock.save()
    return "Error"


def adddocking(uniquestring, smiles, molname):
    """Register a docking job for a molecule and queue its calculation.

    The SMILES is parsed, hydrogens are added with the arguments
    ``(True, True, 7.4)``, descriptors and a 2D molfile are generated, and
    the resulting Docking record is saved and handed to ``dockingseq``.

    :param uniquestring: identifier of the job.
    :param smiles: SMILES of the molecule to dock.
    :param molname: molecule name as a windows-1252 encoded byte string.
    :returns: the id of the saved Docking record, or ``"Error"`` when the
        SMILES cannot be parsed or the molecular weight exceeds 800 (a
        record with an explanatory status is saved in both cases).
    """
    molname = molname.decode("windows-1252").encode('utf-8', 'ignore')

    try:
        mol = pybel.readstring("smi", str(smiles))
    except IOError:
        return _save_failed_docking(uniquestring, smiles, molname,
                                    "Something went wrong..")

    if mol.molwt > 800:
        # Prevent people from docking too big compounds
        return _save_failed_docking(
            uniquestring, smiles, molname,
            "Molecular weight too big, calculation aborted..")

    mol.OBMol.AddHydrogens(True, True, 7.4)
    smiles = mol.write(format='smi')
    descs = mol.calcdesc()

    # generate 2D coordinates, needs openbabel
    obConversion = openbabel.OBConversion()
    obConversion.SetInAndOutFormats("smi", "mdl")
    obmol = openbabel.OBMol()
    obConversion.ReadString(obmol, smiles)
    gen2d = openbabel.OBOp.FindType("gen2d")
    gen2d.Do(obmol)
    MDL = obConversion.WriteString(obmol)
    # Newlines are stored as the two characters backslash + n.
    molfile = MDL.replace("\n", r"\n")

    # Get number of rotatable bonds
    smarts = pybel.Smarts(
        r"[!$([NH]!@C(=O))&!D1&!$(*#*)]\&!@[!$([NH]!@C(=O))&!D1&!$(*#*)]")
    nrb = len(smarts.findall(mol))

    # Run the PAINS detection once and reuse its result.
    pains = detect_pains(mol)
    if pains == "":
        pains = "Not found"

    dock = Docking(uniquestring=uniquestring, smiles=smiles,
                   molname=molname, molfile=molfile,
                   CMW=descs["MW"], HBA=descs["HBA1"], HBD=descs["HBD"],
                   logP=descs["logP"], tpsa=descs["TPSA"], nrb=nrb,
                   pains=pains, status="Calculating...", results="")
    dock.save()
    dockingseq.delay(dock)
    return dock.id
def localopt(mol_with_cat_mopin, _):
    """Optimize using uff built in pybel.

    The second argument is accepted but not used.
    """
    molecule = pybel.readstring('mopin', mol_with_cat_mopin)
    molecule.localopt('uff')
    return molecule
def run():
    """This method is run by typing `blender-chemicals` into a terminal.

    Parses the command line, reads the molecule (from a file if the input
    names a readable file, otherwise treating the input as a string),
    converts it with ``process`` and either prints the JSON
    (``--convert-only``) or launches Blender on ``draw.py`` with it.

    Exits with status 1 on any failure (Open Babel missing, unsupported
    format, unreadable molecule, Blender not found) so that shells and
    scripts can detect the error; exits with status 0 after a successful
    ``--convert-only`` run.
    """
    parser = argparse.ArgumentParser(description="Imports chemicals into "
                                     "Blender with Open Babel.")
    parser.add_argument('input', help="The file or smiles string to draw.")
    parser.add_argument('--format', type=str, default='auto', help="The "
                        "chemical format of the input file. Defaults to "
                        "'auto', which uses the file extension.")
    parser.add_argument('--convert-only', action='store_true', help="Converts "
                        "the input into a simplified JSON format and prints "
                        "to stdout. Does not draw.")
    parser.add_argument('--space-filling', action='store_true', help="Draws "
                        "a space-filling (instead of ball-and-stick) "
                        "representation.")
    parser.add_argument('--no-join', dest='join', action='store_false',
                        help="Skips joining the atoms/bonds into a single "
                        "mesh. Use if you want to individually edit atoms in "
                        "Blender, but note it will impair performance.")
    parser.add_argument('--no-hydrogens', dest='hydrogens',
                        action='store_false', help="Avoids drawing hydrogens.")
    parser.add_argument('--no-generate-coords', dest='generate_coords',
                        action='store_false', help="Skips generating 3D "
                        "coordinates.")
    parser.add_argument('--no-infer-bonds', dest='infer_bonds',
                        action='store_false', help="Skips inferring bonds.")
    args = parser.parse_args()

    # If the input cannot be opened as a file, fall back to treating it as
    # the molecule string itself.
    try:
        with open(args.input) as in_file:
            data = in_file.read()
        is_file = True
    except IOError:
        data = args.input
        is_file = False

    if args.format == 'auto':
        chemformat = os.path.splitext(args.input)[1][1:] if is_file else 'smi'
    else:
        chemformat = args.format
    prefix = "Inferred" if args.format == 'auto' else "Supplied"

    if not pybel.informats:
        sys.stderr.write("Open babel not properly installed. Exiting.\n")
        sys.exit(1)
    if chemformat not in pybel.informats:
        formats = ', '.join(pybel.informats.keys())
        sys.stderr.write(
            ("{} format '{}' not in available open babel formats."
             "\n\nSupported formats:\n{}\n").format(prefix, chemformat,
                                                    formats))
        sys.exit(1)

    try:
        mol = pybel.readstring(chemformat, data)
    except OSError:
        debug = ((" - Read input as file." if is_file else
                  " - Inferred input as string, not file.") +
                 "\n - {} format of '{}'.".format(prefix, chemformat))
        sys.stderr.write("Could not read molecule.\n\nDebug:\n" + debug + "\n")
        sys.exit(1)

    json_mol = process(mol, args.hydrogens, args.generate_coords,
                       args.infer_bonds, args.convert_only)
    if args.convert_only:
        print(json_mol)
        sys.exit()

    mac_path = '/Applications/blender.app/Contents/MacOS/./blender'
    if shutil.which('blender') is not None:
        blender = 'blender'
    elif os.path.isfile(mac_path):
        blender = mac_path
    else:
        sys.stderr.write("Could not find installed copy of Blender. Either "
                         "make sure it's on your path or copy the contents of "
                         "`drawer.py` into a running blender instance.\n")
        sys.exit(1)

    root = os.path.normpath(os.path.dirname(__file__))
    script = os.path.join(root, 'draw.py')
    command = [blender, '--python', script, '--', json_mol]
    if args.space_filling:
        command.append('--space-filling')
    if not args.join:
        command.append('--no-join')
    # Blender is started detached with its output discarded; it is not
    # waited on.
    subprocess.Popen(command, stdout=subprocess.DEVNULL,
                     stderr=subprocess.DEVNULL)
def add_hydrogen(self):
    """Check that BabelMolAdaptor.add_hydrogen fills in implicit hydrogens.

    The SMILES "CCCC" (butane) is read without explicit hydrogens, so the
    molecule starts with its 4 carbon atoms; after ``add_hydrogen`` the
    adaptor's pymatgen molecule is expected to hold all 14 sites of C4H10.
    """
    mol_0d = pb.readstring("smi", "CCCC").OBMol
    # Four heavy atoms: the previous expectation of 2 did not match the
    # SMILES being parsed (nor the 14 sites asserted below).
    self.assertEqual(len(pb.Molecule(mol_0d).atoms), 4)
    adaptor = BabelMolAdaptor(mol_0d)
    adaptor.add_hydrogen()
    self.assertEqual(len(adaptor.pymatgen_mol.sites), 14)
def eqn_interr(num_eqn, naked_list_eqn, rindx, rstoi, pindx, pstoi,
               chem_scheme_markers, reac_coef, spec_namelist, spec_name,
               spec_smil, spec_list, Pybel_objects, nreac, nprod, comp_num,
               phase):
    """Parse chemical-scheme equation lines into index/stoichiometry arrays.

    For every equation line the reactant and product names and their
    stoichiometries are extracted, the rate coefficient expression is
    converted through the ``formatting`` helpers, and any component name
    not seen before is looked up in ``spec_name``/``spec_smil`` and given a
    new index plus a pybel object.

    ``reac_coef``, ``spec_namelist``, ``spec_list`` and ``Pybel_objects``
    are appended to in place, and ``nreac``/``nprod`` are written in place.
    ``rindx``, ``rstoi``, ``pindx`` and ``pstoi`` may be replaced by wider
    arrays (via ``np.append``), so callers must use the returned ones.

    The process is terminated with ``sys.exit`` when a component name is
    missing from ``spec_name`` or when a converted rate expression still
    contains ``'EXP'``.

    Returns:
        Tuple ``(rindx, rstoi, pindx, pstoi, reac_coef, spec_namelist,
        spec_list, Pybel_objects, nreac, nprod, comp_num)``.
    """
    # inputs: ----------------------------------------------------------------------------
    # num_eqn - number of equations (scalar)
    # naked_list_eqn - equations in strings
    # rindx - to hold indices of reactants
    # rstoi - to hold stoichiometries of reactants
    # pindx - to hold indices of products
    # pstoi - to hold stoichiometries of products
    # chem_scheme_markers - markers for separating sections of the chemical scheme
    # reac_coef - to hold reaction rate coefficients
    # spec_namelist - name strings of components present in the scheme (not SMILES)
    # spec_name - name string of components in xml file (not SMILES)
    # spec_smil - SMILES from xml file
    # spec_list - SMILES of components present in scheme
    # Pybel_objects - list containing pybel objects
    # nreac - to hold number of reactions per equation
    # nprod - number of products per equation
    # comp_num - number of unique components in reactions across all phases
    # phase - marker for the phase being considered: 0 for gas, 1 for particulates
    #         (not referenced in this function body)
    # ------------------------------------------------------------------------------------

    max_no_reac = 0.0  # log maximum number of reactants in a reaction
    max_no_prod = 0.0  # log maximum number of products in a reaction

    # Loop through equations line by line and extract the required information
    for eqn_step in range(num_eqn):

        line = naked_list_eqn[eqn_step]  # extract this line

        # work out whether equation or reaction rate coefficient part comes first
        eqn_start = str('.*\\' + chem_scheme_markers[10])
        rrc_start = str('.*\\' + chem_scheme_markers[9])
        # get index of these markers, note span is the property of the match object that
        # gives the location of the marker
        eqn_start_indx = (re.match(eqn_start, line)).span()[1]
        rrc_start_indx = (re.match(rrc_start, line)).span()[1]

        if eqn_start_indx > rrc_start_indx:
            eqn_sec = 1  # equation is second part
        else:
            eqn_sec = 0  # equation is first part

        # split the line into 2 parts: equation and rate coefficient
        # . means match with anything except a new line character., when followed by a *
        # means match zero or more times (so now we match with all characters in the line
        # except for new line characters, so final part is stating the character(s) we
        # are specifically looking for, \\ ensures the marker is recognised
        if eqn_sec == 1:
            eqn_markers = str('\\' + chem_scheme_markers[10] + '.*\\' +
                              chem_scheme_markers[11])
        else:
            # end of equation part is start of reaction rate coefficient part
            eqn_markers = str('\\' + chem_scheme_markers[10] + '.*\\' +
                              chem_scheme_markers[9])

        # extract the equation as a string ([0] extracts the equation section and
        # [1:-1] removes the bounding markers)
        eqn = re.findall(eqn_markers, line)[0][1:-1].strip()

        eqn_split = eqn.split()
        eqmark_pos = eqn_split.index('=')
        # with stoich number; rule out the photon
        reactants = [
            i for i in eqn_split[:eqmark_pos] if i != '+' and i != 'hv'
        ]
        products = [t for t in eqn_split[eqmark_pos + 1:]
                    if t != '+']  # with stoich number

        # record maximum number of reactants across all equations
        max_no_reac = np.maximum(len(reactants), max_no_reac)
        # record maximum number of products across all equations
        max_no_prod = np.maximum(len(products), max_no_prod)

        # append columns if needed
        while max_no_reac > np.minimum(rindx.shape[1], rstoi.shape[1]):
            rindx = np.append(rindx, (np.zeros((num_eqn, 1))).astype(int),
                              axis=1)
            rstoi = np.append(rstoi, (np.zeros((num_eqn, 1))), axis=1)
        while max_no_prod > np.minimum(pindx.shape[1], pstoi.shape[1]):
            pindx = np.append(pindx, (np.zeros((num_eqn, 1))).astype(int),
                              axis=1)
            pstoi = np.append(pstoi, (np.zeros((num_eqn, 1))), axis=1)

        # .* means occurs anywhere in line and, first \ means second \ can be interpreted
        # and second \ ensures recognition of marker
        rate_coeff_start_mark = str('\\' + chem_scheme_markers[9])

        # . means match with anything except a new line character, when followed by a *
        # means match zero or more times (so now we match with all characters in the line
        # except for new line characters, \\ ensures the marker
        # is recognised
        if eqn_sec == 1:
            # end of reaction rate coefficient part is start of equation part
            rate_coeff_end_mark = str('.*\\' + chem_scheme_markers[10])
        else:
            # end of reaction rate coefficient part is end of line
            rate_coeff_end_mark = str('.*\\' + chem_scheme_markers[11])

        # rate coefficient starts and end punctuation
        rate_regex = str(rate_coeff_start_mark + rate_coeff_end_mark)
        # rate coefficient expression in a string
        rate_ex = re.findall(rate_regex, line)[0][1:-1].strip()

        # convert fortran-type scientific notation to python type
        rate_ex = formatting.SN_conversion(rate_ex)
        # convert the rate coefficient expressions into Python readable commands
        rate_ex = formatting.convert_rate_mcm(rate_ex)
        # an 'EXP' left over after conversion is treated as fatal: the
        # expression is printed and the process exits
        if (rate_ex.find('EXP') != -1):
            print(rate_ex)
            sys.exit()

        # store the reaction rate coefficient for this equation
        # (/s once any inputs applied)
        reac_coef.append(rate_ex)

        # extract the stoichiometric number of the species in current equation
        reactant_step = 0
        product_step = 0
        # leading decimal or integer prefix of a term, e.g. '0.5' in '0.5NO2';
        # matches the empty string when the term has no numeric prefix
        stoich_regex = r"^\d*\.\d*|^\d*"
        numr = len(reactants)  # number of reactants in this equation (unused below)

        # left hand side of equations (losses)
        for reactant in reactants:

            if (re.findall(stoich_regex, reactant)[0] != ''):
                stoich_num = float(re.findall(stoich_regex, reactant)[0])
                # name with no stoich number
                name_only = re.sub(stoich_regex, '', reactant)
            elif (re.findall(stoich_regex, reactant)[0] == ''):
                stoich_num = 1.0
                name_only = reactant

            # store stoichiometry
            rstoi[eqn_step, reactant_step] = stoich_num

            if name_only not in spec_namelist:  # if new component encountered
                spec_namelist.append(
                    name_only)  # add to chemical scheme name list

                # convert MCM chemical names to SMILES
                if name_only in spec_name:
                    # index where xml file name matches reaction component name
                    name_indx = spec_name.index(name_only)
                    name_SMILE = spec_smil[name_indx]  # SMILES of component
                else:
                    sys.exit(
                        str('Error: inside eqn_parser, chemical scheme name ' +
                            str(name_only) + ' not found in xml file'))

                spec_list.append(name_SMILE)  # list SMILE names
                name_indx = comp_num  # allocate index to this species
                # Generate pybel
                Pybel_object = pybel.readstring('smi', name_SMILE)
                # append to Pybel object list
                Pybel_objects.append(Pybel_object)

                comp_num += 1  # number of unique species

            else:  # if it's a species already encountered it will be in spec_list
                # existing index
                name_indx = spec_namelist.index(name_only)

            # store reactant index
            # check if index already present - i.e. component appears more than once
            if sum(rindx[eqn_step, 0:reactant_step] == int(name_indx)) > 0:
                # get pre-existing index of this component
                exist_indx = np.where(
                    rindx[eqn_step, 0:reactant_step] == (int(name_indx)))
                # add to pre-existing stoichiometry
                rstoi[eqn_step, exist_indx] += rstoi[eqn_step, reactant_step]
                rstoi[eqn_step,
                      reactant_step] = 0  # remove stoichiometry added above
                # ignore this duplicate reactant: cancels the increment below
                # so the same column is reused for the next reactant
                reactant_step -= 1
            else:
                rindx[eqn_step, reactant_step] = int(name_indx)

            reactant_step += 1

        # number of reactants in this equation
        nreac[eqn_step] = int(reactant_step)

        # right hand side of equations (gains)
        for product in products:

            if (re.findall(stoich_regex, product)[0] != ''):
                stoich_num = float(re.findall(stoich_regex, product)[0])
                name_only = re.sub(stoich_regex, '',
                                   product)  # name with no stoich number
            elif (re.findall(stoich_regex, product)[0] == ''):
                stoich_num = 1.0
                name_only = product

            # store stoichiometry
            pstoi[eqn_step, product_step] = stoich_num

            if name_only not in spec_namelist:  # if new component encountered
                spec_namelist.append(name_only)

                # convert MCM chemical names to SMILES
                # index where xml file name matches reaction component name
                if name_only in spec_name:
                    name_indx = spec_name.index(name_only)
                    name_SMILE = spec_smil[name_indx]
                else:
                    sys.exit(
                        str('Error: inside eqn_parser, chemical scheme name ' +
                            str(name_only) + ' not found in xml file'))

                spec_list.append(
                    name_SMILE)  # list SMILE string of parsed species
                name_indx = comp_num  # allocate index to this species
                # Generate pybel
                Pybel_object = pybel.readstring('smi', name_SMILE)
                # append to Pybel object list
                Pybel_objects.append(Pybel_object)

                comp_num += 1  # number of unique species

            else:  # if it's a species already encountered
                # index of component already listed
                name_indx = spec_namelist.index(name_only)

            # store product index
            # check if index already present - i.e. component appears more than once
            if sum(pindx[eqn_step, 0:product_step] == int(name_indx)) > 0:
                exist_indx = np.where(pindx[eqn_step, 0:product_step] == (int(
                    name_indx)))  # get pre-existing index of this component
                # add to pre-existing stoichiometry
                pstoi[eqn_step, exist_indx] += pstoi[eqn_step, product_step]
                pstoi[eqn_step,
                      product_step] = 0  # remove stoichiometry added above
                # ignore this duplicate product: cancels the increment below
                product_step -= 1
            else:
                pindx[eqn_step, product_step] = int(name_indx)

            product_step += 1

        # number of products in this equation
        nprod[eqn_step] = int(product_step)

    return (rindx, rstoi, pindx, pstoi, reac_coef, spec_namelist, spec_list,
            Pybel_objects, nreac, nprod, comp_num)
(0.48640239E-1 * temp) + (0.41764768E-4 * (temp**2.0E0)) - (0.14452093E-7 * (temp**3.0E0)) + (0.65459673E1 * numpy.log(temp))) y_density_array.append(1000.0E0) #Append density of water to array [kg/m3] y_mw.append(18.0E0) #Append mw of water to array [g/mol] sat_vp.append(numpy.log10(sat_vap_water * 9.86923E-6)) #Convert Pa to atm Delta_H.append(40.66) Latent_heat_gas.append( Lv_water_vapour ) #Water vapour, taken from Paul Connolly's parcel model ACPIM num_species += 1 #We need to increase the number of species to account for water in the gas phase # Now also account for any change in species considered in condensed phase based on those that are ignored num_species_condensed = len(y_density_array) #Update the Pybel object libraries key = pybel.readstring('smi', 'O') Pybel_object_dict.update({'O': key}) #Pybel_object_activity.update({key:Water_Abun}) species_dict2array.update({'H2O': num_species - 1}) include_index.append(num_species - 1) #pdb.set_trace() ignore_index_fortran = numpy.append(ignore_index_fortran, 0.0) #pdb.set_trace() #------------------------------------------------------------------------------------- # 6) Now calculate the additional properties that dictate gas-to-particle partitioning [inc water] #------------------------------------------------------------------------------------- property_dict2 = Property_calculation.Pure_component2( num_species_condensed, y_mw, R_gas, temp) alpha_d_org = property_dict2['alpha_d_org'] DStar_org = property_dict2['DStar_org'] mean_them_vel = property_dict2['mean_them_vel']
"Pharm2D2point": CalculatePharm2D2pointFingerprint, "Pharm2D3point": CalculatePharm2D3pointFingerprint, "PubChem": CalculatePubChemFingerprint, "GhoseCrippen": CalculateGhoseCrippenFingerprint, } ################################################################ if __name__ == "__main__": print("-" * 10 + "START" + "-" * 10) ms = [ Chem.MolFromSmiles("CCOC=N"), Chem.MolFromSmiles("NC1=NC(=CC=N1)N1C=CC2=C1C=C(O)C=C2"), ] m2 = [pybel.readstring("smi", "CCOC=N"), pybel.readstring("smi", "CCO")] res1 = CalculateECFP4Fingerprint(ms[0]) print(res1) print("-" * 25) res2 = CalculateECFP4Fingerprint(ms[1]) print(res2) print("-" * 25) mol = pybel.readstring("smi", "CCOC=N") res3 = CalculateFP3Fingerprint(mol) print(res3) print("-" * 25) mol = Chem.MolFromSmiles("O=C1NC(=O)NC(=O)C1(C(C)C)CC=C") res4 = CalculatePharm2D2pointFingerprint(mol)[0] print(res4) print("-" * 25)
def dissim_run(org, ec, neg, k, pos=None, zinc=False, zinc_tol_l=1,
               zinc_tol_r=1, vl=None, simfp=fptr.integer_sim,
               target_bits=None, screen=None):
    """Rank ZINC candidates by dissimilarity to the known isozyme ligands.

    Candidates are pulled with ``dbq.zinc_pull``, optionally filtered by a
    SMARTS screen, and scored against the centroid of the positive and
    negative training fingerprints. Two SDF reports are written through
    ``dw.generate_report``: one sorted by similarity to the centroid and
    one sorted by a proximity-weighted ``-log2`` of that similarity.

    Args:
        org, ec: Passed to ``bi`` and used in the output file names.
        neg: Source handed to ``add_from_sdf`` as negative data.
        k: Key under which the data is stored on the isozyme object.
        pos: Optional source of positive data.
        zinc: Accepted for signature compatibility; not read here.
        zinc_tol_l, zinc_tol_r: Forwarded to ``dbq.zinc_pull`` and used in
            the output file names.
        vl: Forwarded to ``dw.generate_report`` as ``vendors_list``.
        simfp: Similarity function forwarded to ``dw.avg_proximity``.
        target_bits: Forwarded to ``dbq.zinc_pull``.
        screen: Optional SMARTS filter; one pattern or two patterns
            separated by ``'|'`` (a candidate is kept if any matches).

    Raises:
        IOError: If ``screen`` contains more than two SMARTS patterns.
    """
    # Collects isozyme data into the Isozyme class.
    a = bi(org, ec)
    # Return value is not needed here; the call is kept in case it
    # populates state on the isozyme object.
    a.analyze_reactions()
    if pos:
        a.add_from_sdf(pos, k, pos=True)
    a.add_from_sdf(neg, k, pos=False)

    # Pull potential test data from ZINC.
    res_ = [(page["smiles"], fptr.integer_fp(str(page["smiles"])),
             page["vendors"], page["_id"])
            for page in dbq.zinc_pull(target_bits,
                                      a.mass_avg[k],
                                      a.mass_std[k],
                                      zinc_tol_l=zinc_tol_l,
                                      zinc_tol_r=zinc_tol_r)
            if u'R' not in page["smiles"] and 'vendors' in page]
    res_s = [rr for rr in res_ if rr[1] is not None]

    if screen is not None:
        patt = [pybel.Smarts(smarts) for smarts in screen.split('|')]
        if len(patt) > 2:
            raise IOError(
                'dissim_run only supports OR filters for two SMARTS queries at this time.'
            )

        def _passes_screen(smiles):
            # Parse once and accept on the first matching pattern; works
            # for a single pattern as well as for two.
            candidate = pybel.readstring('smi', str(smiles))
            return any(len(p.findall(candidate)) > 0 for p in patt)

        res = [rr for rr in res_s if _passes_screen(rr[0])]
    else:
        res = res_s

    x_pos_array = np.vstack(tuple([t[1] for t in a.pos[k]]))
    x_neg_array = np.vstack(tuple([t[1] for t in a.neg[k]]))
    x_array = np.vstack((x_pos_array, x_neg_array))
    centroid = np.mean(x_array, axis=0)

    test_a = np.vstack(tuple([np.array(x[1]) for x in res if x[1] is not None]))
    tc_u = dw.avg_proximity(test_a, test_a, f=simfp)

    xis_a = [(x[0], fptr.integer_sim(centroid, x[1]), 1, x[2], x[3])
             for x in res if x[1] is not None]
    xis_b = [(x[0], tc_u[i] * (-math.log(fptr.integer_sim(centroid, x[1]), 2)),
              1, x[2], x[3])
             for i, x in enumerate(res) if x[1] is not None]

    # Shared values for both report file names.
    name_args = (org, ec.replace('.', '_'),
                 str(zinc_tol_l).replace('.', '_'),
                 str(zinc_tol_r).replace('.', '_'))
    dw.generate_report(sorted(xis_a, key=lambda y: y[1]),
                       vendors_list=vl,
                       outfile="%s_ec%s_dissim_zinc%s%s.sdf" % name_args)
    dw.generate_report(sorted(xis_b, key=lambda y: y[1]),
                       vendors_list=vl,
                       outfile="%s_ec%s_dissimcentral_zinc%s%s.sdf" % name_args)
def generateSvg(inchi, filename):
    """Render an InChI string to an SVG file unless the file already exists.

    Nothing is read or written when ``filename`` is already present on
    disk. Always returns ``None``.
    """
    if not os.path.exists(filename):
        molecule = pybel.readstring('inchi', inchi)
        molecule.write('svg', filename=filename)
def al_run(org, ec, neg, k, beta=1, pos=None, ent=False, kernel='rbf',
           degree=3, zinc=True, zinc_tol_l=1, zinc_tol_r=1, greedy=False,
           vl=None, simfp=fptr.integer_sim, C=5, target_bits=None,
           screen=None):
    """Score candidate compounds with an SVM and write ranked reports.

    Candidates come from ZINC (``zinc=True``) or KEGG, are classified by
    ``machines.svm_clf`` against the positive/negative data stored under
    ``k``, and are weighted by ``dw.weight`` using either entropy
    (``ent=True``) or hyperplane distance together with the average
    proximity of each candidate to the others. An SDF report is written
    via ``dw.generate_report`` and the raw scores go to a ``.txt`` file
    with the same stem, one score per line.

    Args:
        org, ec: Passed to ``bi`` and used in the output file names.
        neg: Source handed to ``add_from_sdf`` as negative data.
        k: Key under which the data is stored on the isozyme object.
        beta: Forwarded to ``dw.weight``.
        pos: Optional source of positive data.
        ent: Use entropy instead of hyperplane distance for weighting.
        kernel, degree, C: Forwarded to ``machines.svm_clf``.
        zinc: Pull candidates from ZINC if true, otherwise from KEGG.
        zinc_tol_l, zinc_tol_r: Forwarded to ``dbq.zinc_pull``.
        greedy: Multiply each weight by the first element of its label.
        vl: Forwarded to ``dw.generate_report`` as ``vendors_list``.
        simfp: Similarity function forwarded to ``dw.avg_proximity``.
        target_bits: Forwarded to the database pull.
        screen: Optional SMARTS filter (ZINC branch only); one pattern or
            two patterns separated by ``'|'``.

    Raises:
        IOError: If ``screen`` contains more than two SMARTS patterns.
    """
    # Collects isozyme data into the Isozyme class.
    a = bi(org, ec)
    if pos:
        a.add_from_sdf(pos, k, pos=True)
    a.add_from_sdf(neg, k, pos=False)

    # Two branches here; one pulls potential test data from ZINC, another
    # pulls from KEGG.
    if zinc:
        res_ = [(page["smiles"], fptr.integer_fp(str(page["smiles"])),
                 page["vendors"], page["_id"])
                for page in dbq.zinc_pull(target_bits,
                                          a.mass_avg[k],
                                          a.mass_std[k],
                                          zinc_tol_l=zinc_tol_l,
                                          zinc_tol_r=zinc_tol_r)
                if u'R' not in page["smiles"] and 'vendors' in page]
        res_s = [rr for rr in res_ if rr[1] is not None]
        if screen is not None:
            patt = [pybel.Smarts(smarts) for smarts in screen.split('|')]
            if len(patt) > 2:
                raise IOError(
                    'al_run only supports OR filters for two SMARTS queries at this time.'
                )

            def _passes_screen(smiles):
                # Parse once and accept on the first matching pattern;
                # works for a single pattern as well as for two.
                candidate = pybel.readstring('smi', str(smiles))
                return any(len(p.findall(candidate)) > 0 for p in patt)

            res = [rr for rr in res_s if _passes_screen(rr[0])]
        else:
            res = res_s
    else:
        res = []
        for page in dbq.kegg_pull(target_bits):
            if u'R' in page["SMILES"]:
                continue
            fp = fptr.integer_fp(str(page["SMILES"]))
            # Test the raw fingerprint: wrapping it in np.array first
            # would make the "is not None" check always true.
            if fp is None:
                continue
            res.append((page["SMILES"], np.array(fp)))

    labels = machines.svm_clf(a.pos[k],
                              a.neg[k],
                              res,
                              kernel=kernel,
                              degree=degree,
                              ent=ent,
                              C=C)
    test_a = np.vstack(
        tuple([
            np.array(x[1]) for x in res
            if x[1] is not None and len(x[1]) == 313
        ]))
    tc_u = dw.avg_proximity(test_a, test_a, f=simfp)

    # Each label is a (class label, entropy-or-distance) pair.
    measure = dw.entropy if ent else dw.hyper_distance
    if greedy:
        xis = [
            l * dw.weight(measure(v), tc_u[i], beta=beta)
            for i, (l, v) in enumerate(labels)
        ]
    else:
        xis = [
            dw.weight(measure(v), tc_u[i], beta=beta)
            for i, (l, v) in enumerate(labels)
        ]

    if zinc:
        stem = "%s_ec%s_beta%s_%s_zinc%s%s_C%s" % (
            org, ec.replace('.', '_'), str(beta), kernel,
            str(zinc_tol_l).replace('.', '_'),
            str(zinc_tol_r).replace('.', '_'), str(C))
        dw.generate_report(sorted(
            zip([s for s, fp, vend, z in res if fp is not None], xis,
                [lab[0] for lab in labels],
                [vend for s, fp, vend, z in res if fp is not None],
                [z for s, fp, vend, z in res if fp is not None]),
            key=lambda y: y[1],
            reverse=True),
                           vendors_list=vl,
                           outfile=stem + ".sdf")
    else:
        stem = "%s_ec%s_beta%s_%s" % (org, ec.replace('.', '_'), str(beta),
                                      kernel)
        dw.generate_report(sorted(zip([s for s, fp in res], xis,
                                      [lab[0] for lab in labels]),
                                  key=lambda y: y[1],
                                  reverse=True),
                           outfile=stem + ".sdf",
                           zinc=False)

    # Context manager guarantees the score file is closed even if a write
    # fails part-way through.
    with open(stem + ".txt", 'w') as f:
        for score in xis:
            f.write(str(score) + '\n')
def testKekulizationOfcn(self):
    """The aromatic SMILES 'cn' must round-trip to the Kekule form 'C=N'.

    Earlier versions did not read 'cn' the way Daylight would.
    """
    written = pybel.readstring("smi", "cn").write("smi")
    self.assertEqual("C=N", written.rstrip())
except: try: df = pd.read_csv(file_path, sep=' ', dtype={ '#smiles': str, 'zinc_id': str }) smile_arr = smile_arr + df["#smiles"].tolist() id_arr = id_arr + df["zinc_id"].tolist() except: print("warning!!:", file_path) else: continue data = {"smile": smile_arr, "name": id_arr} df_out = pd.DataFrame(data) df_out = df_out.drop_duplicates("smile") df_out.to_csv("ZINC_UNIQUE_SMILE.csv", index=False) drug = df_out["smile"] canonical_smiles = [ pybel.readstring("smi", smile).write("can").rstrip() for smile in drug ] data = {"canonical_smile": canonical_smiles, "name": df_out['name']} df_out = pd.DataFrame(data) df_out = df_out.drop_duplicates("canonical_smile") df_out.to_csv("ZINC_UNIQUE_canonical.csv", index=False)
def enhance_structure_dict(structure_dict):
    """Augment every molecule entry with derived geometric and bond data.

    Args:
        structure_dict: Output of :func:`make_structure_dict`.

    Returns:
        dict: The very same dictionary, updated in place. Each molecule
        gains array-valued ``positions``, pairwise ``distances``, the
        ``angle`` to its two nearest atoms (in units of pi),
        ``bond_orders`` with the derived ``top_bonds`` / ``bond_ids`` /
        ``long_symbols`` / ``sublabel_atom`` tags, and per-atom values
        read from pybel.

    Caution:
        Importing torch in the same process as this runs may end in a
        segmentation fault (blame pybel or rdkit).
    """
    import pybel

    # (output key, pybel atom attribute) pairs, in the order they are stored
    pybel_fields = (
        ('charges', 'partialcharge'),
        ('spins', 'spin'),
        ('heavyvalences', 'heavyvalence'),
        ('heterovalences', 'heterovalence'),
        ('valences', 'valence'),
        ('hyb_types', 'type'),
    )

    for molecule_name, molecule in structure_dict.items():
        # Cartesian coordinates, shape (N, 3)
        coords = np.array(molecule['positions'])
        n_atom = coords.shape[0]
        molecule['positions'] = coords

        # (N, N) matrix of interatomic distances
        tiled = np.tile(coords, (n_atom, 1, 1))
        dist = np.linalg.norm(tiled - np.transpose(tiled, (1, 0, 2)), axis=-1)
        molecule['distances'] = dist

        # (N,) angle spanned by the two nearest neighbours of each atom
        order = np.argsort(dist, axis=-1)
        origin = coords[order[:, 0], :]
        to_first = coords[order[:, 1], :] - origin
        to_second = coords[order[:, 2], :] - origin
        norms = (np.linalg.norm(to_first, axis=1) *
                 np.linalg.norm(to_second, axis=1))
        cos = np.sum(to_first * to_second, axis=1) / norms
        molecule['angle'] = np.arccos(np.clip(cos, -1.0, 1.0)) / np.pi

        # (N, N) bond orders, 0 where there is no chemical bond; a few
        # molecules rely on manual corrections
        molecule['bond_orders'] = np.zeros((n_atom, n_atom))
        atomic_numbers = [atomic_num_dict[s] for s in molecule['symbols']]
        if molecule_name not in manual_bond_order_dict:
            rd_mol = x2m.xyz2mol(atomic_numbers, 0, coords, True, True)
            orders = molecule['bond_orders']
            for bond in rd_mol.GetBonds():
                begin = bond.GetBeginAtomIdx()
                end = bond.GetEndAtomIdx()
                value = bond_order_dict[bond.GetBondType()]
                orders[begin, end] = value
                orders[end, begin] = value
        else:
            molecule['bond_orders'] = np.array(
                manual_bond_order_dict[molecule_name], dtype=float)

        # Tagging helpers:
        #   top_bonds: the (up to) four largest bond orders of each atom
        #   bond_ids:  [sum of top bond orders, number of bonded
        #               neighbours, largest order, second largest order]
        top = np.sort(molecule['bond_orders'], axis=-1)[:, -1:-5:-1]
        molecule['top_bonds'] = top
        molecule['bond_ids'] = np.hstack((
            top.sum(axis=-1)[:, np.newaxis],
            np.sum(top > 1e-3, axis=-1)[:, np.newaxis],
            top[:, :2],
        ))

        # long_symbols: element symbol joined with its bond_ids row
        long_symbols = []
        for i in range(n_atom):
            pieces = [molecule['symbols'][i]]
            pieces += [str(x) for x in molecule['bond_ids'][i]]
            long_symbols.append('_'.join(pieces))
        molecule['long_symbols'] = long_symbols

        # sublabel_atom: long symbol followed by the sorted symbols of the
        # atoms it is bonded to
        sublabels = []
        for atom_index in range(n_atom):
            bonded = sorted(
                molecule['symbols'][j]
                for j in molecule['bond_orders'][atom_index].nonzero()[0])
            sublabels.append(
                '-'.join([molecule['long_symbols'][atom_index]] + bonded))
        molecule['sublabel_atom'] = sublabels

        # pybel-derived per-atom values, read from an xyz string built
        # from the symbols and coordinates above
        atom_lines = []
        for i in range(n_atom):
            atom_lines.append(' '.join([
                str(molecule['symbols'][i]),
                str(molecule['positions'][i, 0]),
                str(molecule['positions'][i, 1]),
                str(molecule['positions'][i, 2]),
            ]))
        xyz = str(n_atom) + '\n\n' + '\n'.join(atom_lines)
        ob_mol = pybel.readstring('xyz', xyz)
        for key, attr in pybel_fields:
            molecule[key] = [
                getattr(ob_mol.atoms[i], attr) for i in range(n_atom)
            ]

    return structure_dict
def testNonexistentAtom(self):
    """CopySubstructure must report failure for this bit vector.

    The three-atom molecule "ICBr" is copied using the vector built by
    ``createBitVec(10, (9, ))``; the call is expected to return a falsy
    value.
    """
    source = pybel.readstring("smi", "ICBr")
    selection = self.createBitVec(10, (9, ))
    target = ob.OBMol()
    self.assertFalse(source.OBMol.CopySubstructure(target, selection))
def get_inchi_molecule(accession):
    """Parse ``accession`` as an InChI string and return the pybel molecule."""
    molecule = pybel.readstring("inchi", accession)
    return molecule
import csv
from collections import Counter

# Manually convert .db file to text file. Data file is a file which contains the output databases which have been
# converted to text files and merged into one large file.
dataFile1 = sys.argv[1]

all_monomer_pairs = []
# The context manager closes the input file once every line has been read.
with open(dataFile1) as dataSet1:
    for line in dataSet1:
        # Pull SMILES: the text between the first pair of double quotes, up
        # to the first "_", holds the two monomers separated by "~".
        smiles = line.split("\"")[1].split("_")[0].split("~")

        # Convert SMILES to canonical SMILES
        mol_1 = pybel.readstring("smi", smiles[0])
        canmol1 = mol_1.write("can").split("\t")[0]
        mol_2 = pybel.readstring("smi", smiles[1])
        canmol2 = mol_2.write("can").split("\t")[0]

        # Make a set containing the 2 canonical SMILES and save the set to a list of all monomer pairs
        all_monomer_pairs.append({canmol1, canmol2})

# Count every unordered pair in a single pass; frozensets are hashable, so
# {A, B} and {B, A} land in the same bucket.
_pair_counter = Counter(frozenset(pair) for pair in all_monomer_pairs)

# Generates a list of unique pairs with the number of occurrences of that pair
unique_data = [list(pair) for pair in _pair_counter]
monomer_pair_counts = [[pair, _pair_counter[frozenset(pair)]]
                       for pair in unique_data]

sorted_pairs = sorted(monomer_pair_counts, key=lambda tup: tup[1], reverse=True)
ff.ConjugateGradients(250, 1.0e-3) ff.WeightedRotorSearch(250, 5) ff.WeightedRotorSearch(250, 10) ff.ConjugateGradients(100, 1.0e-5) ff.GetCoordinates(mol.OBMol) if __name__ == "__main__": # iterate through all the files, all the molecules in the files and optimize for argument in sys.argv[1:]: with open(argument) as f: for line in f: ikey, smi = line.split() try: mol = pybel.readstring("smi", smi) except IOError: continue mol = cirpy.Molecule(ikey, ['inchikey']) filename = "library/%s/%s/%s.mol2" % (ikey[0], ikey[1], ikey) if not os.path.isfile(filename): if mol.twirl_url is not None: mol.download(filename, 'mol2', True) else: globalopt(mol) mkpath('library/%s/%s' % (ikey[0], ikey[1])) mol.write("mol2", filename)
def eqn_interr(num_eqn, eqn_list, aqeqn_list, chem_scheme_markers, comp_name,
               comp_smil, num_sb, wall_on):
    """Parse gas- and aqueous-phase equation strings into index arrays.

    Each equation line is split into its equation part and its reaction rate
    coefficient part using the markers in chem_scheme_markers.  Reactant and
    product indices, stoichiometries, Jacobian helper arrays and rate
    coefficient expressions are then recorded, first for the gas-phase
    equations and then, reusing the same local names, for the aqueous-phase
    equations.

    Parameters:
        num_eqn: number of equations; element 0 is used for the gas phase and
            element 1 for the aqueous phase.
        eqn_list: gas-phase equations in list of strings.
        aqeqn_list: aqueous-phase equations in list of strings.
        chem_scheme_markers: markers for separating sections of the chemical
            scheme; elements 9 (start of rate coefficient), 10 (start of
            equation) and 11 (end marker) are used here.
        comp_name: name strings of components in xml file (not SMILES).
        comp_smil: SMILES from xml file, aligned with comp_name.
        num_sb: number of size bins (not referenced in this function).
        wall_on: marker for whether to include wall partitioning (not
            referenced in this function).

    Returns:
        A tuple holding the gas-phase results (names ending in _g), then the
        aqueous-phase results (names ending in _aq), then comp_namelist,
        comp_list, Pybel_objects, comp_num and RO_indx.

    The process is terminated with sys.exit() if a converted rate coefficient
    expression still contains 'EXP', or if an aqueous-phase component name is
    not in comp_name.
    """
    # preparatory part ----------------------------------------------------
    # matrix to record indices of reactants (cols) in each equation (rows)
    rindx = np.zeros((num_eqn[0], 1)).astype(int)
    # matrix of indices to arrange reactant concentrations when
    # reaction rate coefficient calculated; -9999 marks an unused slot
    y_arr = (np.ones((num_eqn[0], 1)).astype(int)) * -9999
    # array to arrange reaction rates so they align with reactant stoichiometries
    rr_arr = np.empty((0))
    # same but for products
    rr_arr_p = np.empty((0))
    # index array for extracting required reactant concentrations for the
    # reaction rate coefficient calculation
    y_rind = np.empty((0))
    # index array for identifying products when assigning gains from reactions
    y_pind = np.empty((0))
    # matrix to record indices of products (cols) in each equation (rows)
    pindx = np.zeros((num_eqn[0], 1)).astype(int)
    # matrix to record stoichiometries of reactants (cols) in each equation (rows)
    rstoi = np.zeros((num_eqn[0], 1))
    jac_stoi = np.zeros((num_eqn[0], 1))
    # 1D array to record stoichiometries of reactants per equation
    rstoi_flat = np.empty((0))
    # 1D array to record stoichiometries of products per equation
    pstoi_flat = np.empty((0))
    # matrix to record stoichiometries of products (cols) in each equation (rows)
    pstoi = np.zeros((num_eqn[0], 1))
    # arrays to store number of reactants and products in gas-phase equations
    nreac = np.empty(num_eqn[0], dtype=np.int8)
    nprod = np.empty(num_eqn[0], dtype=np.int8)
    # colptrs for sparse matrix
    # NOTE(review): reac_col and prod_col are not read again in this function
    reac_col = np.empty(num_eqn[0], dtype=np.int8)
    prod_col = np.empty(num_eqn[0], dtype=np.int8)
    # list for equation reaction rate coefficients
    reac_coef = []
    # matrix containing index of components who are denominators in the
    # calculation of equation derivatives in the Jacobian
    jac_den_indx = np.zeros((num_eqn[0], 1))
    # total number of Jacobian elements per equation
    njac = np.zeros((num_eqn[0], 1))
    # indices of Jacobian to affect per equation (rows)
    jac_indx = np.zeros((num_eqn[0], 1))
    # a new list for the name strings of components presented in the scheme (not SMILES)
    comp_namelist = []
    comp_list = []  # list for the SMILE strings of components present in the chemical scheme
    # list of Pybel objects of components in chemical scheme
    Pybel_objects = []
    comp_num = 0  # count the number of unique components in the chemical scheme
    RO_indx = []  # empty list for holding indices of alkoxy components
    # ---------------------------------------------------------------------
    max_no_reac = 0.  # log maximum number of reactants in a reaction
    max_no_prod = 0.  # log maximum number of products in a reaction

    # loop through gas-phase equations line by line and extract the required information
    for eqn_step in range(num_eqn[0]):
        line = eqn_list[eqn_step]  # extract this line
        # work out whether equation or reaction rate coefficient part comes first
        eqn_start = str('.*\\' + chem_scheme_markers[10])
        rrc_start = str('.*\\' + chem_scheme_markers[9])
        # get index of these markers, note span is the property of the match object that
        # gives the location of the marker
        eqn_start_indx = (re.match(eqn_start, line)).span()[1]
        rrc_start_indx = (re.match(rrc_start, line)).span()[1]
        if (eqn_start_indx > rrc_start_indx):
            eqn_sec = 1  # equation is second part
        else:
            eqn_sec = 0  # equation is first part
        # split the line into 2 parts: equation and rate coefficient
        # . means match with anything except a new line character, when followed by a *
        # means match zero or more times (so now we match with all characters in the line
        # except for new line characters, so final part is stating the character(s) we
        # are specifically looking for, \\ ensures the marker is recognised
        if eqn_sec == 1:
            eqn_markers = str('\\' + chem_scheme_markers[10] + '.*\\' +
                              chem_scheme_markers[11])
        else:  # end of equation part is start of reaction rate coefficient part
            eqn_markers = str('\\' + chem_scheme_markers[10] + '.*\\' +
                              chem_scheme_markers[9])
        # extract the equation as a string ([0] extracts the equation section and
        # [1:-1] removes the bounding markers)
        eqn = re.findall(eqn_markers, line)[0][1:-1].strip()
        eqn_split = eqn.split()
        eqmark_pos = eqn_split.index('=')
        # reactants with stoichiometry number and omit any photon
        reactants = [
            i for i in eqn_split[:eqmark_pos] if i != '+' and i != 'hv'
        ]
        # products with stoichiometry number
        products = [t for t in eqn_split[eqmark_pos + 1:] if t != '+']
        # record maximum number of reactants across all equations
        max_no_reac = np.maximum(len(reactants), max_no_reac)
        # record maximum number of products across all equations
        max_no_prod = np.maximum(len(products), max_no_prod)
        # append columns if needed because maximum number of reactants increases
        while (max_no_reac > np.minimum(rindx.shape[1], rstoi.shape[1])):
            rindx = np.append(rindx, (np.zeros((num_eqn[0], 1))).astype(int),
                              axis=1)
            rstoi = np.append(rstoi, (np.zeros((num_eqn[0], 1))), axis=1)
            y_arr = np.append(y_arr, (np.ones(
                (num_eqn[0], 1)) * -9999).astype(int), axis=1)
            # already stored flat indices were computed with the previous
            # (smaller) maximum, so shift each stored entry by its row number
            # NOTE(review): y_arr_fixer is tiled to int(max_no_reac) columns,
            # which only matches y_arr if the maximum grew by exactly one
            # column - confirm larger jumps cannot occur
            y_arr_fixer = ((np.arange(0, num_eqn[0],
                                      dtype='int')).reshape(-1, 1))
            y_arr_fixer = np.tile(y_arr_fixer, (1, int(max_no_reac)))
            y_arr[y_arr != -9999] = y_arr[y_arr != -9999] + y_arr_fixer[
                y_arr != -9999]
        # append columns if needed because maximum number of products increases
        while (max_no_prod > np.minimum(pindx.shape[1], pstoi.shape[1])):
            pindx = np.append(pindx, (np.zeros((num_eqn[0], 1))).astype(int),
                              axis=1)
            pstoi = np.append(pstoi, (np.zeros((num_eqn[0], 1))), axis=1)
        # widen the Jacobian helper arrays until they can hold
        # (number of reactants) * (number of reactants + number of products) entries
        while ((len(reactants)**2.0 + len(reactants) * len(products)) >
               jac_indx.shape[1]):
            jac_indx = np.append(jac_indx, (np.zeros((num_eqn[0], 1))), axis=1)
            jac_den_indx = np.append(jac_den_indx, (np.zeros(
                (num_eqn[0], 1))), axis=1)
            jac_stoi = np.append(jac_stoi, (np.zeros((num_eqn[0], 1))), axis=1)
        # .* means occurs anywhere in line and, first \ means second \ can be interpreted
        # and second \ ensures recognition of marker
        rate_coeff_start_mark = str('\\' + chem_scheme_markers[9])
        # . means match with anything except a new line character, when followed by a *
        # means match zero or more times (so now we match with all characters in the line
        # except for new line characters, \\ ensures the marker
        # is recognised
        if eqn_sec == 1:
            # end of reaction rate coefficient part is start of equation part
            rate_coeff_end_mark = str('.*\\' + chem_scheme_markers[10])
        else:  # end of reaction rate coefficient part is end of line
            rate_coeff_end_mark = str('.*\\' + chem_scheme_markers[11])
        # rate coefficient starts and end punctuation
        rate_regex = str(rate_coeff_start_mark + rate_coeff_end_mark)
        # rate coefficient expression in a string
        rate_ex = re.findall(rate_regex, line)[0][1:-1].strip()
        # convert fortran-type scientific notation to python type
        rate_ex = formatting.SN_conversion(rate_ex)
        # convert the rate coefficient expressions into Python readable commands
        rate_ex = formatting.convert_rate_mcm(rate_ex)
        # an 'EXP' left over after conversion is treated as an error
        if (rate_ex.find('EXP') != -1):
            print('Error in reaction rate coefficient expression: ', rate_ex)
            sys.exit()
        # store the reaction rate coefficient for this equation
        # (/s once any inputs applied)
        reac_coef.append(rate_ex)
        # extract the stoichiometric number of the component in current equation
        reactant_step = 0
        product_step = 0
        # leading decimal or integer digits, if any, are the stoichiometry
        stoich_regex = r"^\d*\.\d*|^\d*"
        # NOTE(review): numr is not read again in this function
        numr = len(reactants)  # number of reactants in this equation
        # left hand side of equations (losses)
        for reactant in reactants:
            if (re.findall(stoich_regex, reactant)[0] != ''):
                stoich_num = float(re.findall(stoich_regex, reactant)[0])
                # name with no stoich number
                name_only = re.sub(stoich_regex, '', reactant)
            elif (re.findall(stoich_regex, reactant)[0] == ''):
                stoich_num = 1.
                name_only = reactant
            # store stoichiometry
            rstoi[eqn_step, reactant_step] = stoich_num
            jac_stoi[eqn_step, reactant_step] = -1 * stoich_num
            if name_only not in comp_namelist:  # if new component encountered
                comp_namelist.append(
                    name_only)  # add to chemical scheme name list
                # convert MCM chemical names to SMILES
                # index where xml file name matches reaction component name
                # NOTE(review): unlike the aqueous-phase loop there is no
                # membership check here, so list.index raises ValueError for
                # a name missing from comp_name
                name_indx = comp_name.index(name_only)
                name_SMILE = comp_smil[name_indx]  # SMILES of component
                comp_list.append(name_SMILE)  # list SMILE names
                name_indx = comp_num  # allocate index to this species
                # generate pybel object
                Pybel_object = pybel.readstring('smi', name_SMILE)
                # append to Pybel object list
                Pybel_objects.append(Pybel_object)
                # check if alkoxy radical present in this component and that component is organic
                if ('[O]' in name_SMILE):
                    # NOTE(review): both operands of this `or` are identical;
                    # possibly one was meant to be lower-case 'c' - confirm
                    if ('C' in name_SMILE or 'C' in name_SMILE):
                        # NOTE(review): the original comment calls 'C[O]'
                        # carbon monoxide - confirm this is the intended SMILES
                        if (name_SMILE != 'C[O]'):  # ensure it's not carbon monoxide
                            # if it is an organic alkoxy radical add its index to list
                            RO_indx.append(comp_num)
                comp_num += 1  # number of unique species
            else:  # if it is a component already encountered it will be in comp_list
                # existing index
                name_indx = comp_namelist.index(name_only)
            # store reactant index
            # check if index already present - i.e. component appears more than once
            if sum(rindx[eqn_step, 0:reactant_step] == int(name_indx)) > 0:
                # get existing index of this component
                exist_indx = (np.where(
                    rindx[eqn_step, 0:reactant_step] == (int(name_indx))))[0]
                # add to existing stoichiometry
                rstoi[eqn_step, exist_indx] += rstoi[eqn_step, reactant_step]
                jac_stoi[eqn_step,
                         exist_indx] += -1 * rstoi[eqn_step, reactant_step]
                # remove stoichiometry added above
                rstoi[eqn_step, reactant_step] = 0
                jac_stoi[eqn_step, reactant_step] = 0
                # cancels the increment below so the slot is reused
                reactant_step -= 1  # ignore this duplicate
            else:
                rindx[eqn_step, reactant_step] = int(name_indx)
                y_arr[eqn_step, reactant_step] = int((eqn_step * max_no_reac) +
                                                     reactant_step)
                y_rind = np.append(y_rind, int(name_indx))
                rr_arr = np.append(rr_arr, int(eqn_step))
            reactant_step += 1
        # number of reactants in this equation
        nreac[eqn_step] = int(reactant_step)
        # record 1D array of stoichiometries per equation
        rstoi_flat = np.append(rstoi_flat, rstoi[eqn_step,
                                                 0:int(reactant_step)])
        # right hand side of equations (gains)
        for product in products:
            if (re.findall(stoich_regex, product)[0] != ''):
                stoich_num = float(re.findall(stoich_regex, product)[0])
                name_only = re.sub(stoich_regex, '',
                                   product)  # name with no stoich number
            elif (re.findall(stoich_regex, product)[0] == ''):
                stoich_num = 1.
                name_only = product
            # store stoichiometry; in jac_stoi the products follow the reactants
            pstoi[eqn_step, product_step] = stoich_num
            jac_stoi[eqn_step, reactant_step + product_step] = 1 * stoich_num
            if name_only not in comp_namelist:  # if new component encountered
                comp_namelist.append(name_only)
                # convert MCM chemical names to SMILES
                # index where xml file name matches reaction component name
                name_indx = comp_name.index(name_only)
                name_SMILE = comp_smil[name_indx]
                comp_list.append(
                    name_SMILE)  # list SMILE string of parsed species
                name_indx = comp_num  # allocate index to this species
                # Generate pybel
                Pybel_object = pybel.readstring('smi', name_SMILE)
                # append to Pybel object list
                Pybel_objects.append(Pybel_object)
                # check if alkoxy radical present in this component and that component is organic
                if ('[O]' in name_SMILE):
                    # NOTE(review): both operands of this `or` are identical - confirm
                    if ('C' in name_SMILE or 'C' in name_SMILE):
                        if (name_SMILE != 'C[O]'):  # ensure it's not carbon monoxide
                            # if it is an organic alkoxy radical add its index to list
                            RO_indx.append(comp_num)
                comp_num += 1  # number of unique species
            else:  # if it's a species already encountered
                # index of component already listed
                name_indx = comp_namelist.index(name_only)
            # store product index
            # check if index already present - i.e. component appears more than once
            if sum(pindx[eqn_step, 0:product_step] == int(name_indx)) > 0:
                # get existing index of this component
                exist_indx = (np.where(
                    pindx[eqn_step, 0:product_step] == (int(name_indx))))[0]
                # add to existing stoichiometry
                pstoi[eqn_step, exist_indx] += pstoi[eqn_step, product_step]
                jac_stoi[eqn_step, reactant_step +
                         exist_indx] += 1 * pstoi[eqn_step, product_step]
                # remove stoichiometry added above
                pstoi[eqn_step, product_step] = 0
                jac_stoi[eqn_step, reactant_step + product_step] = 0
                # cancels the increment below so the slot is reused
                product_step -= 1  # ignore this duplicate
            else:
                pindx[eqn_step, product_step] = int(name_indx)
                rr_arr_p = np.append(rr_arr_p, int(eqn_step))
                y_pind = np.append(y_pind, int(name_indx))
            product_step += 1
        # number of products in this equation
        nprod[eqn_step] = int(product_step)
        # record 1D array of stoichiometries per equation
        pstoi_flat = np.append(pstoi_flat, pstoi[eqn_step,
                                                 0:int(product_step)])
        # now that total number of components (reactants and products)
        # in an equation is known, replicate the reactant indices over all
        # components
        tot_comp = nreac[eqn_step] + nprod[eqn_step]
        for i in range(nreac[eqn_step]):
            jac_den_indx[eqn_step,
                         i * tot_comp:(i + 1) * tot_comp] = rindx[eqn_step, i]
            # also replicate the stoichiometries for every reactant
            if (i > 0):
                jac_stoi[eqn_step, i * tot_comp:(i + 1) *
                         tot_comp] = jac_stoi[eqn_step, 0:tot_comp]
        # number of Jacobian elements affected by this equation
        njac[eqn_step, 0] = tot_comp * nreac[eqn_step]

    # remove fillers and flatten index for arranging concentrations
    # ready for reaction rate coefficient calculation
    y_arr_g = y_arr[y_arr != -9999]
    y_rind_g = y_rind.astype(int)  # ensure integer type
    uni_y_rind_g = (np.unique(y_rind)).astype(int)  # unique index of reactants
    y_pind_g = y_pind.astype(int)  # ensure integer type
    uni_y_pind_g = (np.unique(y_pind)).astype(int)  # unique index of products
    rr_arr_g = rr_arr.astype(int)  # ensure integer type
    rr_arr_p_g = rr_arr_p.astype(int)  # ensure integer type
    # colptrs for sparse matrix of the change to reactants per equation
    reac_col_g = np.cumsum(nreac) - nreac
    # colptrs for sparse matrix of the change to products per equation
    prod_col_g = np.cumsum(nprod) - nprod
    if (len(reac_col_g) > 0):  # if gas-phase reaction present
        # include final columns
        reac_col_g = np.append(reac_col_g, reac_col_g[-1] + nreac[-1])
        prod_col_g = np.append(prod_col_g, prod_col_g[-1] + nprod[-1])
    # tag other gas-phase arrays
    rindx_g = rindx
    pindx_g = pindx
    rstoi_g = rstoi
    pstoi_g = pstoi
    jac_stoi_g = jac_stoi
    rstoi_flat_g = rstoi_flat
    pstoi_flat_g = pstoi_flat
    nreac_g = nreac
    nprod_g = nprod
    reac_coef_g = reac_coef
    jac_den_indx_g = jac_den_indx.astype(int)
    njac_g = njac.astype(int)
    jac_indx_g = jac_indx
    jac_indx_g = jac_indx_g.astype(int)

    # same for aqueous-phase reactions ----------------------------------
    # the working names below are rebound to fresh objects; the *_g names
    # above keep referring to the gas-phase results
    # preparatory part ----------------------------------------------------
    # matrix to record indices of reactants (cols) in each equation (rows);
    # filled with -2 rather than 0 as in the gas-phase part
    rindx = (np.ones((num_eqn[1], 1)) * -2).astype(int)
    # matrix of indices to arrange reactant concentrations when
    # reaction rate coefficient calculated
    y_arr = (np.ones((num_eqn[1], 1)).astype(int)) * -9999
    # array to arrange reaction rates so they align with reactant stoichiometries
    rr_arr = np.empty((0))
    # same but for products
    rr_arr_p = np.empty((0))
    # index array for extracting required reactant concentrations for the
    # reaction rate coefficient calculation
    y_rind = np.empty((0))
    # index array for identifying products when assigning gains from reactions
    y_pind = np.empty((0))
    # matrix to record indices of products (cols) in each equation (rows)
    pindx = np.zeros((num_eqn[1], 1)).astype(int)
    # matrix to record stoichiometries of reactants (cols) in each equation (rows)
    rstoi = np.zeros((num_eqn[1], 1))
    jac_stoi = np.zeros((num_eqn[1], 1))
    # 1D array to record stoichiometries of reactants per equation
    rstoi_flat = np.empty((0))
    # 1D array to record stoichiometries of products per equation
    pstoi_flat = np.empty((0))
    # matrix to record stoichiometries of products (cols) in each equation (rows)
    pstoi = np.zeros((num_eqn[1], 1))
    # arrays to store number of reactants and products of equations
    nreac = np.empty(num_eqn[1], dtype=np.int8)
    nprod = np.empty(num_eqn[1], dtype=np.int8)
    # list for equation reaction rate coefficients
    reac_coef = []
    # matrix containing index of components who are denominators in the
    # calculation of equation derivatives in the Jacobian
    jac_den_indx = np.zeros((num_eqn[1], 1))
    # total number of Jacobian elements per equation
    njac = np.zeros((num_eqn[1], 1))
    # indices of Jacobian to affect per equation (rows)
    jac_indx = np.zeros((num_eqn[1], 1))
    # ---------------------------------------------------------------------
    max_no_reac = 0.  # log maximum number of reactants in a reaction
    max_no_prod = 0.  # log maximum number of products in a reaction

    # loop through aqueous-phase equations line by line and extract the required information
    for eqn_step in range(num_eqn[1]):
        line = aqeqn_list[eqn_step]  # extract this line
        # work out whether equation or reaction rate coefficient part comes first
        eqn_start = str('.*\\' + chem_scheme_markers[10])
        rrc_start = str('.*\\' + chem_scheme_markers[9])
        # get index of these markers, note span is the property of the match object that
        # gives the location of the marker
        eqn_start_indx = (re.match(eqn_start, line)).span()[1]
        rrc_start_indx = (re.match(rrc_start, line)).span()[1]
        if eqn_start_indx > rrc_start_indx:
            eqn_sec = 1  # equation is second part
        else:
            eqn_sec = 0  # equation is first part
        # split the line into 2 parts: equation and rate coefficient
        # . means match with anything except a new line character, when followed by a *
        # means match zero or more times (so now we match with all characters in the line
        # except for new line characters, so final part is stating the character(s) we
        # are specifically looking for, \\ ensures the marker is recognised
        if eqn_sec == 1:
            eqn_markers = str('\\' + chem_scheme_markers[10] + '.*\\' +
                              chem_scheme_markers[11])
        else:  # end of equation part is start of reaction rate coefficient part
            eqn_markers = str('\\' + chem_scheme_markers[10] + '.*\\' +
                              chem_scheme_markers[9])
        # extract the equation as a string ([0] extracts the equation section and
        # [1:-1] removes the bounding markers)
        eqn = re.findall(eqn_markers, line)[0][1:-1].strip()
        eqn_split = eqn.split()
        eqmark_pos = eqn_split.index('=')
        # with stoich number; rule out the photon
        reactants = [
            i for i in eqn_split[:eqmark_pos] if i != '+' and i != 'hv'
        ]
        products = [t for t in eqn_split[eqmark_pos + 1:]
                    if t != '+']  # with stoich number
        # record maximum number of reactants across all equations
        max_no_reac = np.maximum(len(reactants), max_no_reac)
        # record maximum number of products across all equations
        max_no_prod = np.maximum(len(products), max_no_prod)
        # append columns if needed
        while max_no_reac > np.minimum(rindx.shape[1], rstoi.shape[1]):
            rindx = np.append(rindx, (np.ones(
                (num_eqn[1], 1)) * -2).astype(int), axis=1)
            rstoi = np.append(rstoi, (np.zeros((num_eqn[1], 1))), axis=1)
            y_arr = np.append(y_arr, (np.ones(
                (num_eqn[1], 1)) * -9999).astype(int), axis=1)
            # shift already stored flat indices by their row number
            # NOTE(review): same one-column-at-a-time assumption as in the
            # gas-phase loop - confirm
            y_arr_fixer = ((np.arange(0, num_eqn[1],
                                      dtype='int')).reshape(-1, 1))
            y_arr_fixer = np.tile(y_arr_fixer, (1, int(max_no_reac)))
            y_arr[y_arr != -9999] = y_arr[y_arr != -9999] + y_arr_fixer[
                y_arr != -9999]
        while max_no_prod > np.minimum(pindx.shape[1], pstoi.shape[1]):
            pindx = np.append(pindx, (np.zeros((num_eqn[1], 1))).astype(int),
                              axis=1)
            pstoi = np.append(pstoi, (np.zeros((num_eqn[1], 1))), axis=1)
        # widen the Jacobian helper arrays as in the gas-phase loop
        while ((len(reactants)**2.0 + len(reactants) * len(products)) >
               jac_indx.shape[1]):
            jac_indx = np.append(jac_indx, (np.zeros((num_eqn[1], 1))), axis=1)
            jac_den_indx = np.append(jac_den_indx, (np.zeros(
                (num_eqn[1], 1))), axis=1)
            jac_stoi = np.append(jac_stoi, (np.zeros((num_eqn[1], 1))), axis=1)
        # .* means occurs anywhere in line and, first \ means second \ can be interpreted
        # and second \ ensures recognition of marker
        rate_coeff_start_mark = str('\\' + chem_scheme_markers[9])
        # . means match with anything except a new line character, when followed by a *
        # means match zero or more times (so now we match with all characters in the line
        # except for new line characters, \\ ensures the marker
        # is recognised
        if eqn_sec == 1:
            # end of reaction rate coefficient part is start of equation part
            rate_coeff_end_mark = str('.*\\' + chem_scheme_markers[10])
        else:  # end of reaction rate coefficient part is end of line
            rate_coeff_end_mark = str('.*\\' + chem_scheme_markers[11])
        # rate coefficient starts and end punctuation
        rate_regex = str(rate_coeff_start_mark + rate_coeff_end_mark)
        # rate coefficient expression in a string
        rate_ex = re.findall(rate_regex, line)[0][1:-1].strip()
        # convert fortran-type scientific notation to python type
        rate_ex = formatting.SN_conversion(rate_ex)
        # convert the rate coefficient expressions into Python readable commands
        rate_ex = formatting.convert_rate_mcm(rate_ex)
        # an 'EXP' left over after conversion is treated as an error
        if (rate_ex.find('EXP') != -1):
            print('Error in reaction rate coefficient expression: ', rate_ex)
            sys.exit()
        # store the reaction rate coefficient for this equation
        # (/s once any inputs applied)
        reac_coef.append(rate_ex)
        # extract the stoichiometric number of the component in current equation
        reactant_step = 0
        product_step = 0
        stoich_regex = r"^\d*\.\d*|^\d*"
        # NOTE(review): numr is not read again in this function
        numr = len(reactants)  # number of reactants in this equation
        # left hand side of equations (losses)
        for reactant in reactants:
            if (re.findall(stoich_regex, reactant)[0] != ''):
                stoich_num = float(re.findall(stoich_regex, reactant)[0])
                # name with no stoich number
                name_only = re.sub(stoich_regex, '', reactant)
            elif (re.findall(stoich_regex, reactant)[0] == ''):
                stoich_num = 1.0
                name_only = reactant
            # store stoichiometry
            rstoi[eqn_step, reactant_step] = stoich_num
            jac_stoi[eqn_step, reactant_step] = -1 * stoich_num
            if name_only not in comp_namelist:  # if new component encountered
                comp_namelist.append(
                    name_only)  # add to chemical scheme name list
                # convert MCM chemical names to SMILES
                if name_only in comp_name:
                    # index where xml file name matches reaction component name
                    name_indx = comp_name.index(name_only)
                    name_SMILE = comp_smil[name_indx]  # SMILES of component
                else:
                    # unknown component name: report and stop
                    print(
                        str('Error: inside eqn_parser, chemical scheme name ' +
                            str(name_only) + ' not found in xml file'))
                    sys.exit()
                comp_list.append(name_SMILE)  # list SMILE names
                name_indx = comp_num  # allocate index to this species
                # Generate pybel
                Pybel_object = pybel.readstring('smi', name_SMILE)
                # append to Pybel object list
                Pybel_objects.append(Pybel_object)
                # check if alkoxy radical present in this component and that component is organic
                if ('[O]' in name_SMILE):
                    # NOTE(review): both operands of this `or` are identical - confirm
                    if ('C' in name_SMILE or 'C' in name_SMILE):
                        if (name_SMILE != 'C[O]'):  # ensure it's not carbon monoxide
                            # if it is an organic alkoxy radical add its index to list
                            RO_indx.append(comp_num)
                comp_num += 1  # number of unique species
            else:  # if it's a species already encountered it will be in comp_list
                # existing index
                name_indx = comp_namelist.index(name_only)
            # store reactant index
            # check if index already present - i.e. component appears more than once
            # as a reactant in this reaction
            if sum(rindx[eqn_step, 0:reactant_step] == int(name_indx)) > 0:
                # get existing index of this component
                exist_indx = (np.where(
                    rindx[eqn_step, 0:reactant_step] == (int(name_indx))))[0]
                # add to existing stoichiometry
                rstoi[eqn_step, exist_indx] += rstoi[eqn_step, reactant_step]
                jac_stoi[eqn_step,
                         exist_indx] += -1 * rstoi[eqn_step, reactant_step]
                # remove stoichiometry added above
                rstoi[eqn_step, reactant_step] = 0
                jac_stoi[eqn_step, reactant_step] = 0
                # cancels the increment below so the slot is reused
                reactant_step -= 1  # ignore this duplicate
            else:
                rindx[eqn_step, reactant_step] = int(name_indx)
                y_arr[eqn_step, reactant_step] = int((eqn_step * max_no_reac) +
                                                     reactant_step)
                y_rind = np.append(y_rind, int(name_indx))
                rr_arr = np.append(rr_arr, int(eqn_step))
            reactant_step += 1
        # number of reactants in this equation
        nreac[eqn_step] = int(reactant_step)
        # record 1D array of stoichiometries per equation
        rstoi_flat = np.append(rstoi_flat, rstoi[eqn_step,
                                                 0:int(reactant_step)])
        # right hand side of equations (gains)
        for product in products:
            if (re.findall(stoich_regex, product)[0] != ''):
                stoich_num = float(re.findall(stoich_regex, product)[0])
                name_only = re.sub(stoich_regex, '',
                                   product)  # name with no stoich number
            elif (re.findall(stoich_regex, product)[0] == ''):
                stoich_num = 1.0
                name_only = product
            # store stoichiometry; in jac_stoi the products follow the reactants
            pstoi[eqn_step, product_step] = stoich_num
            jac_stoi[eqn_step, reactant_step + product_step] = 1 * stoich_num
            if name_only not in comp_namelist:  # if new component encountered
                comp_namelist.append(name_only)
                # convert MCM chemical names to SMILES
                # index where xml file name matches reaction component name
                if name_only in comp_name:
                    name_indx = comp_name.index(name_only)
                    name_SMILE = comp_smil[name_indx]
                else:
                    # unknown component name: report and stop
                    print('Error: inside eqn_interr, chemical scheme name ' +
                          str(name_only) + ' not found in xml file')
                    sys.exit()
                comp_list.append(
                    name_SMILE)  # list SMILE string of parsed species
                name_indx = comp_num  # allocate index to this species
                # generate pybel object
                Pybel_object = pybel.readstring('smi', name_SMILE)
                # append to Pybel object list
                Pybel_objects.append(Pybel_object)
                # check if alkoxy radical present in this component and that component is organic
                if ('[O]' in name_SMILE):
                    # NOTE(review): both operands of this `or` are identical - confirm
                    if ('C' in name_SMILE or 'C' in name_SMILE):
                        if (name_SMILE != 'C[O]'):  # ensure it's not carbon monoxide
                            # if it is an organic alkoxy radical add its index to list
                            RO_indx.append(comp_num)
                comp_num += 1  # number of unique species
            else:  # if it's a species already encountered
                # index of component already listed
                name_indx = comp_namelist.index(name_only)
            # store product index
            # check if index already present - i.e. component appears more than once
            if sum(pindx[eqn_step, 0:product_step] == int(name_indx)) > 0:
                # get existing index of this component
                exist_indx = (np.where(
                    pindx[eqn_step, 0:product_step] == (int(name_indx))))[0]
                # add to existing stoichiometry
                pstoi[eqn_step, exist_indx] += pstoi[eqn_step, product_step]
                jac_stoi[eqn_step, reactant_step +
                         exist_indx] += 1 * pstoi[eqn_step, product_step]
                # remove stoichiometry added above
                pstoi[eqn_step, product_step] = 0
                jac_stoi[eqn_step, reactant_step + product_step] = 0
                # cancels the increment below so the slot is reused
                product_step -= 1  # ignore this duplicate
            else:
                pindx[eqn_step, product_step] = int(name_indx)
                rr_arr_p = np.append(rr_arr_p, int(eqn_step))
                y_pind = np.append(y_pind, int(name_indx))
            product_step += 1
        # number of products in this equation
        nprod[eqn_step] = int(product_step)
        # record 1D array of stoichiometries per equation
        pstoi_flat = np.append(pstoi_flat, pstoi[eqn_step,
                                                 0:int(product_step)])
        # now that total number of components (reactants and products)
        # in an equation is known, replicate the reactant indices over all
        # components
        tot_comp = nreac[eqn_step] + nprod[eqn_step]
        for i in range(nreac[eqn_step]):
            jac_den_indx[eqn_step,
                         i * tot_comp:(i + 1) * tot_comp] = rindx[eqn_step, i]
            # also replicate the stoichiometries for every reactant
            if (i > 0):
                jac_stoi[eqn_step, i * tot_comp:(i + 1) *
                         tot_comp] = jac_stoi[eqn_step, 0:tot_comp]
        # number of Jacobian elements affected by this equation
        njac[eqn_step, 0] = tot_comp * nreac[eqn_step]

    # account for gas-phase in Jacobian denominator index
    # NOTE(review): the flattened source does not show whether this offset is
    # applied once after the loop (as laid out here) or per equation - confirm
    jac_den_indx += (comp_num + 2)
    # remove fillers and flatten index for arranging concentrations ready for
    # reaction rate coefficient calculation
    y_arr_aq = y_arr[y_arr != -9999]  # remove fillers
    y_rind_aq = y_rind.astype(int)  # ensure integer type
    uni_y_rind_aq = (np.unique(y_rind)).astype(
        int)  # unique index of reactants
    y_pind_aq = y_pind.astype(int)  # ensure integer type
    uni_y_pind_aq = (np.unique(y_pind)).astype(int)  # unique index of products
    rr_arr_aq = rr_arr.astype(int)  # ensure integer type
    rr_arr_p_aq = rr_arr_p.astype(int)  # ensure integer type
    # colptrs for sparse matrix of the change to reactants per equation
    reac_col_aq = np.cumsum(nreac) - nreac
    # colptrs for sparse matrix of the change to products per equation
    prod_col_aq = np.cumsum(nprod) - nprod
    if (len(reac_col_aq) > 0):  # if aqueous-phase reaction present
        # include final columns
        reac_col_aq = np.append(reac_col_aq, reac_col_aq[-1] + nreac[-1])
        prod_col_aq = np.append(prod_col_aq, prod_col_aq[-1] + nprod[-1])
    # tag other aqueous-phase arrays
    rindx_aq = rindx
    pindx_aq = pindx
    rstoi_aq = rstoi
    pstoi_aq = pstoi
    jac_stoi_aq = jac_stoi
    rstoi_flat_aq = rstoi_flat
    pstoi_flat_aq = pstoi_flat
    nreac_aq = nreac
    nprod_aq = nprod
    reac_coef_aq = reac_coef
    jac_den_indx_aq = jac_den_indx.astype(int)
    njac_aq = njac.astype(int)
    jac_indx_aq = jac_indx
    jac_indx_aq = jac_indx_aq.astype(int)

    return (rindx_g, rstoi_g, pindx_g, pstoi_g, reac_coef_g, nreac_g, nprod_g,
            jac_stoi_g, jac_den_indx_g, njac_g, jac_indx_g, y_arr_g, y_rind_g,
            uni_y_rind_g, y_pind_g, uni_y_pind_g, reac_col_g, prod_col_g,
            rstoi_flat_g, pstoi_flat_g, rr_arr_g, rr_arr_p_g, rindx_aq,
            rstoi_aq, pindx_aq, pstoi_aq, reac_coef_aq, nreac_aq, nprod_aq,
            jac_stoi_aq, jac_den_indx_aq, njac_aq, jac_indx_aq, y_arr_aq,
            y_rind_aq, uni_y_rind_aq, y_pind_aq, uni_y_pind_aq, reac_col_aq,
            prod_col_aq, rstoi_flat_aq, pstoi_flat_aq, rr_arr_aq, rr_arr_p_aq,
            comp_namelist, comp_list, Pybel_objects, comp_num, RO_indx)
if __name__=="__main__":
    # One pass per SDF data set; idx also selects the matching mol2 directory.
    for idx, sdf_dataset in enumerate(DATA_SETS):
        logp_dataset = {}
        # read the molecules in the sdf files
        database = pybel.readfile('sdf', sdf_dataset)
        for sd_record in database:
            mol_id = sd_record.data['MOLECULEID']
            file_path = mol2_file_path[idx] + mol_id+'.mol2'
            # NOTE(review): this result is replaced further down before it is
            # ever read; the call is kept in case get_coords has side effects
            molecule_coords = get_coords(file_path)

            # sd_record.data.keys() gives all the properties; the structure
            # itself is rebuilt from the record's SMILES property
            molecule = pybel.readstring("smi", sd_record.data['SMILES'])
            # add hydrogen
            molecule.OBMol.AddHydrogens()
            # build 3D coordinates, then minimize the energy
            molecule.make3D(forcefield="gaff", steps=STEPS)
            molecule.localopt(forcefield="gaff", steps=STEPS)

            # get the coordinates, one entry per atom
            molecule_coords = [atom.coords for atom in molecule.atoms]

            # save in the data set
            measured_logp = float(sd_record.data['logPow {measured}'])
            coords_array = np.array(molecule_coords)
            logp_dataset[mol_id] = {'logp':measured_logp,
                                    'coords':coords_array}

            molecule.write("pdb", f"{databae_path}/pdbs/{sd_record.data['MOLECULEID']}.pdb")
[comp_smil, comp_name] = xml_interr.xml_interr(str(cwd + '/example_xml.xml')) # convert chemical scheme component names into SMILEs comp_smiles = [] # holder for name in comp_names[0:-2]: # omit H20 and core at end of comp_names comp_smiles.append(comp_smil[comp_name.index(name)]) SOA0 = 0. for i in range(1, num_sb): # calculate SOA (*1.0E-12 to convert from g/cc (air) to ug/m3 (air)) SOA0 += (((y[:, num_comp*i:num_comp*(i+1)-2]/si.N_A)*y_mw[0:-2]*1.0e12).sum(axis=1)) Pybel_objects = [] # holder for Pybel object names for i in range(num_comp-2): # component loop # generate pybel object Pybel_objects.append(pybel.readstring('smi', comp_smiles[i])) # point to umansysprop folder sys.path.insert(1, (cwd + '/umansysprop')) # address for updated version from umansysprop import boiling_points from umansysprop import vapour_pressures from umansysprop import liquid_densities NA = si.Avogadro # Avogadro's number (molecules/mol) # vapour pressures of components, excluding water and core at end Psat = np.zeros((1, num_comp-2)) TEMP = 298.15 # temperature (K) for i in range(num_comp-2): # component loop