def main():
    """Score every SMILES of a CSV file against a FastROCS shape database.

    The database path, input CSV and output CSV are taken from the object
    returned by ``getargs()`` (attributes ``d``, ``i`` and ``o``).  For each
    entry of the input's ``smiles`` column, the TanimotoCombo of the first
    score returned by ``GetSortedScores`` is stored in a new ``fastroc``
    column; NaN is stored when building or scoring the query raises, or when
    no score is returned.  The resulting table is written to the output CSV.

    Returns:
        0, including when no supported GPU is available.
    """
    if not oefastrocs.OEFastROCSIsGPUReady():
        oechem.OEThrow.Info("No supported GPU available!")
        return 0

    args = getargs()
    dbname = args.d

    # read in database
    ifs = oechem.oemolistream()
    if not ifs.open(dbname):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    print("Opening database file %s ..." % dbname)
    timer = oechem.OEWallTimer()

    # NOTE(review): opts is configured but never handed to GetSortedScores
    # below, which receives the literal 1 instead -- confirm which is intended.
    opts = oefastrocs.OEShapeDatabaseOptions()
    opts.SetLimit(1)

    dbase = oefastrocs.OEShapeDatabase()
    moldb = oechem.OEMolDatabase()
    if not moldb.Open(ifs):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    dots = oechem.OEThreadedDots(10000, 200, "conformers")
    if not dbase.Open(moldb, dots):
        oechem.OEThrow.Fatal("Unable to initialize OEShapeDatabase on '%s'" % dbname)

    dots.Total()
    print("%f seconds to load database" % timer.Elapsed())

    df = pd.read_csv(args.i)
    res = []
    for smile in tqdm(df.loc[:, 'smiles'].tolist()):
        # exactly one value is recorded per input row; NaN marks "no result"
        best = np.nan
        try:
            q = FromString(smile)[0]
            for score in dbase.GetSortedScores(q, 1):
                best = score.GetTanimotoCombo()
                break
        except KeyboardInterrupt:
            print("caught")
            # SystemExit is raised directly: the exit() helper is injected by
            # the site module and is not guaranteed to exist
            raise SystemExit
        except Exception:
            # a query that cannot be built or scored must not abort the run
            best = np.nan
        res.append(best)

    df['fastroc'] = res
    # head must be called; printing the bound method shows no rows
    print(df.head())
    df.to_csv(args.o, sep=',', index=False)
    return 0
def __init__(self, dbname, cutoff, shapeOnly):
    """
    Load the molecules of *dbname* and initialise the clustering state.

    dbname - path of the molecule file to read
    cutoff - stored unchanged as self.cutoff
    shapeOnly - when true, the database and the default options use
                OEShapeDatabaseType_Shape; otherwise
                OEShapeDatabaseType_Default
    """
    self.cutoff = cutoff

    # one score type drives both the default options and the database
    if shapeOnly:
        scoreType = oefastrocs.OEShapeDatabaseType_Shape
    else:
        scoreType = oefastrocs.OEShapeDatabaseType_Default

    self.defaultOptions = oefastrocs.OEShapeDatabaseOptions()
    self.defaultOptions.SetScoreType(scoreType)
    self.shapedb = oefastrocs.OEShapeDatabase(scoreType)

    self.dbmols = []
    volumeByIdx = []

    # read in database
    molStream = oechem.oemolistream()
    if not molStream.open(dbname):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    untitled = 0
    for mol in molStream.GetOEGraphMols():
        # give molecules without a title a generated, numbered one
        if not mol.GetTitle():
            mol.SetTitle("Untitled" + str(untitled))
            untitled += 1

        molIdx = self.shapedb.AddMol(oechem.OEMol(mol))

        # fall back to computing the self shape when the cached value is 0.0
        selfShape = oeshape.OEGetCachedSelfShape(mol)
        if selfShape == 0.0:
            selfShape = oeshape.OESelfShape(mol)
        volumeByIdx.append((selfShape, molIdx))

        compressed = oechem.OEGraphMol(mol, oechem.OEMolBaseType_OEDBMol)
        compressed.Compress()
        self.dbmols.append(compressed)

    total = len(volumeByIdx)

    # the molecule with the median volume becomes the first query
    ordered = sorted(volumeByIdx)
    self.nextClusterHeadIdx = ordered[total // 2][1]

    self.remainingMolecules = total
    self.tanimotos = [0.0] * total
    self.scoreGetter = GetScoreGetter(shapeOnly)
def __init__(self, shapedb, querymolstr, nhits, iformat, oformat, errorLevel, **kwargs):
    """
    Prepare a thread that runs a single query. Nothing is searched here;
    the query doesn't execute until start is called.

    shapedb - database to run the query against
    querymolstr, nhits - see MCMolShapeDatabase.GetBestOverlays
    iformat, oformat, errorLevel - stored on the instance unchanged
    kwargs - optional shapeOnly, tversky, altStarts and randStarts,
             each defaulting to False
    """
    Thread.__init__(self)

    # plain inputs
    self.shapedb = shapedb
    self.querymolstr = querymolstr
    self.iformat = iformat
    self.oformat = oformat
    self.errorLevel = errorLevel
    self.lock = Lock()

    # optional switches
    self.shapeOnly = kwargs.pop('shapeOnly', False)
    self.tversky = kwargs.pop('tversky', False)
    self.altStarts = kwargs.pop('altStarts', False)
    self.randStarts = kwargs.pop('randStarts', False)

    self.scoretype = GetDatabaseType(self.shapeOnly)
    self.simFuncType = GetSimFuncType(self.tversky)

    # the tracer gets 100 histogram bins for shape-only runs, 200 otherwise
    histBins = 100 if self.shapeOnly else 200
    self.tracer = oefastrocs.OEDBTracer(histBins)

    searchOptions = oefastrocs.OEShapeDatabaseOptions()
    searchOptions.SetTracer(self.tracer)
    searchOptions.SetLimit(nhits)
    searchOptions.SetScoreType(self.scoretype)
    searchOptions.SetSimFunc(self.simFuncType)
    if self.altStarts:
        searchOptions.SetInitialOrientation(GetStartType(self.altStarts))
    if self.randStarts:
        searchOptions.SetNumRandomStarts(self.randStarts)
    self.options = searchOptions
def GetCluster(self, query):
    """
    Generator over the database molecules that cluster with *query*.

    Each score whose molecule has not been removed yet is examined. When the
    score is above self.cutoff, the tuple (self._removeMolecule(idx), score)
    is yielded. Otherwise self.tanimotos[idx] is raised to that score if it
    is larger. Once all scores are consumed, self.nextClusterHeadIdx is set
    to the index holding the smallest stored tanimoto among the molecules
    that were not yielded in this call (None when there were none).
    """
    searchOpts = oefastrocs.OEShapeDatabaseOptions(self.defaultOptions)
    progress = oechem.OEDots(10000, 200, "molecules searched")

    # (tanimoto, index) of the least similar remaining molecule seen so far
    lowest = (sys.float_info.max, None)

    for hit in self.shapedb.GetScores(query, searchOpts):
        molIdx = hit.GetMolIdx()

        # molecules already placed in a cluster are skipped entirely
        if self.dbmols[molIdx] is None:
            continue

        if self.cutoff < self.scoreGetter(hit):
            yield self._removeMolecule(molIdx), hit
        else:
            self.tanimotos[molIdx] = max(self.tanimotos[molIdx], self.scoreGetter(hit))
            candidate = (self.tanimotos[molIdx], molIdx)
            # tuple comparison: lower tanimoto first, then lower index
            if candidate < lowest:
                lowest = candidate

        progress.Update()

    progress.Total()
    self.nextClusterHeadIdx = lowest[1]
def main(argv=[__name__]):
    """Write an all-against-all shape distance matrix as CSV.

    *argv* is parsed with OEInterface using the module-level InterfaceData.
    Molecules are read from ``-dbase``; each one is scored, conformer by
    conformer, against the molecules read before it, keeping the best score
    per pair.  The file named by ``-matrix`` receives a first row with the
    molecule titles, then one row per molecule that starts with its 1-based
    row number followed by the distances.  A distance is
    ``maxScore - bestScore`` off the diagonal (maxScore is 1.0 with
    ``-shapeOnly`` and 2.0 otherwise) and 0.0 on the diagonal.

    Returns:
        0 on completion.
    """
    itf = oechem.OEInterface(InterfaceData, argv)

    ifs = oechem.oemolistream()
    if not ifs.open(itf.GetString("-dbase")):
        oechem.OEThrow.Fatal("Unable to open %s for reading" % itf.GetString("-dbase"))

    # the flag cannot change while running, so read it once instead of once
    # per matrix cell
    shapeOnly = itf.GetBool("-shapeOnly")
    if shapeOnly:
        getter = oefastrocs.OEShapeDatabaseScore.GetShapeTanimoto
        dbtype = oefastrocs.OEShapeDatabaseType_Shape
        maxScore = 1.0
    else:
        getter = oefastrocs.OEShapeDatabaseScore.GetTanimotoCombo
        dbtype = oefastrocs.OEShapeDatabaseType_Default
        maxScore = 2.0

    # The output is opened exactly once, before the search, so an unwritable
    # path still fails early, and it is closed deterministically.  Opening
    # the same path through a second, unclosed handle risks stale buffered
    # data being flushed over the matrix.  newline='' is what the csv module
    # requires of files it writes to.
    with open(itf.GetString("-matrix"), 'w', newline='') as matrixFile:
        shapedb = oefastrocs.OEShapeDatabase(dbtype)
        options = oefastrocs.OEShapeDatabaseOptions()
        options.SetScoreType(dbtype)

        # lmat[k] holds the best scores of molecule k against molecules
        # 0..k-1; the first molecule has nothing to compare with
        lmat = [[]]
        titles = []
        for mol in ifs.GetOEMols():
            if titles:
                bestscores = [0.0] * len(titles)
                for conf in mol.GetConfs():
                    for score in shapedb.GetScores(conf, options):
                        midx = score.GetMolIdx()
                        bestscores[midx] = max(bestscores[midx], getter(score))
                lmat.append(bestscores)

            shapedb.AddMol(mol)

            title = mol.GetTitle()
            if not title:
                title = str(len(titles) + 1)
            titles.append(title)

        # write csv file
        csvwriter = csv.writer(matrixFile)
        csvwriter.writerow(titles)
        nrows = len(titles)
        for i in range(nrows):
            row = [i + 1]
            for j in range(nrows):
                # only the lower triangle is stored, so mirror the lookup
                if j > i:
                    val = maxScore - lmat[j][i]
                elif j < i:
                    val = maxScore - lmat[i][j]
                else:
                    val = 0.0
                row.append("%.3f" % val)
            csvwriter.writerow(row)

    return 0
def main(argv=[__name__]):
    """Search a shape database with every conformer of each query molecule.

    Command-line arguments are parsed with argparse: a database file, one or
    more query files, and the optional ``--nHits``, ``--cutoff`` and
    ``--tversky`` switches.  For every molecule of every query file, the hits
    of all its conformers are written to ``<title>_results.<ext>`` (the
    molecule's position in its file replaces an empty title), where <ext> is
    the query file's extension.  Each hit is tagged with the SD data
    QueryConfidx, ShapeTanimoto, ColorTanimoto and TanimotoCombo and is
    transformed with ``score.Transform`` before being written.

    NOTE(review): ``parse_args()`` reads ``sys.argv``; the *argv* parameter
    is not consulted and is kept only so existing callers keep working.

    Returns:
        0, including when no supported GPU is available.
    """
    parser = argparse.ArgumentParser()

    # positional arguments retaining backward compatibility
    parser.add_argument(
        'database',
        help='File containing the database molecules to be search '
             '(format not restricted to *.oeb).')
    parser.add_argument(
        'query', default=[], nargs='+',
        help='File containing the query molecule(s) to be search '
             '(format not restricted to *.oeb).')
    parser.add_argument(
        '--nHits', dest='nHits', type=int, default=100,
        help='Number of hits to return (default = 100).')
    # SUPPRESS keeps the attribute off the namespace unless the switch is
    # given, which is what the hasattr() checks below rely on
    parser.add_argument(
        '--cutoff', dest='cutoff', type=float, default=argparse.SUPPRESS,
        help='Specify a cutoff criteria for scores.')
    parser.add_argument(
        '--tversky', dest='tversky', action='store_true', default=argparse.SUPPRESS,
        help='Switch to Tversky similarity scoring (default = Tanimoto).')

    args = parser.parse_args()
    dbname = args.database

    if not oefastrocs.OEFastROCSIsGPUReady():
        oechem.OEThrow.Info("No supported GPU available!")
        return 0

    # set options
    opts = oefastrocs.OEShapeDatabaseOptions()
    opts.SetLimit(args.nHits)
    print("Number of hits set to %u" % opts.GetLimit())
    if hasattr(args, 'cutoff'):
        opts.SetCutoff(args.cutoff)
        print("Cutoff set to %f" % args.cutoff)
    if hasattr(args, 'tversky'):
        # pass the similarity-function constant, not the parsed boolean
        opts.SetSimFunc(oefastrocs.OEShapeSimFuncType_Tversky)
        print("Tversky similarity scoring set.")

    # read in database
    ifs = oechem.oemolistream()
    if not ifs.open(dbname):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    print("\nOpening database file %s ..." % dbname)
    timer = oechem.OEWallTimer()
    dbase = oefastrocs.OEShapeDatabase()
    moldb = oechem.OEMolDatabase()
    if not moldb.Open(ifs):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    dots = oechem.OEThreadedDots(10000, 200, "conformers")
    if not dbase.Open(moldb, dots):
        oechem.OEThrow.Fatal("Unable to initialize OEShapeDatabase on '%s'" % dbname)

    dots.Total()
    print("%f seconds to load database\n" % timer.Elapsed())

    for qfname in args.query:
        # read in query
        qfs = oechem.oemolistream()
        if not qfs.open(qfname):
            oechem.OEThrow.Fatal("Unable to open '%s'" % qfname)

        # probe for at least one readable molecule, then start over so the
        # loop below sees every molecule of the file
        mcmol = oechem.OEMol()
        if not oechem.OEReadMolecule(qfs, mcmol):
            oechem.OEThrow.Fatal("Unable to read query from '%s'" % qfname)
        qfs.rewind()

        ext = oechem.OEGetFileExtension(qfname)

        qmolidx = 0
        while oechem.OEReadMolecule(qfs, mcmol):
            # write out to file name based on molecule title
            ofs = oechem.oemolostream()
            moltitle = mcmol.GetTitle()
            if not moltitle:
                moltitle = str(qmolidx)
            ofname = moltitle + "_results." + ext
            if not ofs.open(ofname):
                # report the file that failed; argv[4] need not even exist
                oechem.OEThrow.Fatal("Unable to open '%s'" % ofname)

            print("Searching for %s of %s (%s conformers)" % (moltitle, qfname, mcmol.NumConfs()))

            qconfidx = 0
            for conf in mcmol.GetConfs():
                for score in dbase.GetSortedScores(conf, opts):
                    dbmol = oechem.OEMol()
                    dbmolidx = score.GetMolIdx()
                    if not moldb.GetMolecule(dbmol, dbmolidx):
                        print("Unable to retrieve molecule '%u' from the database" % dbmolidx)
                        continue

                    mol = oechem.OEGraphMol(
                        dbmol.GetConf(oechem.OEHasConfIdx(score.GetConfIdx())))

                    oechem.OESetSDData(mol, "QueryConfidx", "%s" % qconfidx)
                    oechem.OESetSDData(mol, "ShapeTanimoto", "%.4f" % score.GetShapeTanimoto())
                    oechem.OESetSDData(mol, "ColorTanimoto", "%.4f" % score.GetColorTanimoto())
                    oechem.OESetSDData(mol, "TanimotoCombo", "%.4f" % score.GetTanimotoCombo())
                    score.Transform(mol)

                    oechem.OEWriteMolecule(ofs, mol)
                qconfidx += 1

            # close explicitly so the file is complete before it is announced
            ofs.close()
            print("%s conformers processed" % qconfidx)
            print("Wrote results to %s\n" % ofname)
            qmolidx += 1

    return 0
def main(argv=[__name__]):
    """Search a shape database with the first molecule of each query file.

    ``argv[1]`` is the database file and ``argv[2:]`` are the query files.
    For every query, a file ``<base>_results.<ext>`` is written that holds
    the query followed by up to five hits.  Each hit is tagged with the SD
    data ShapeTanimoto, ColorTanimoto and TanimotoCombo and is transformed
    with ``score.Transform`` before being written.

    Returns:
        0, including after the usage message and when no supported GPU is
        available.
    """
    if len(argv) < 3:
        oechem.OEThrow.Usage("%s <database> [<queries> ... ]" % argv[0])
        return 0

    # check system
    if not oefastrocs.OEFastROCSIsGPUReady():
        oechem.OEThrow.Info("No supported GPU available!")
        return 0

    # read in database
    dbname = argv[1]
    print("Opening database file %s ..." % dbname)

    dbase = oefastrocs.OEShapeDatabase()
    moldb = oechem.OEMolDatabase()

    if not moldb.Open(dbname):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    dots = oechem.OEThreadedDots(10000, 200, "conformers")
    if not dbase.Open(moldb, dots):
        oechem.OEThrow.Fatal("Unable to initialize OEShapeDatabase on '%s'" % dbname)

    # customize search options
    opts = oefastrocs.OEShapeDatabaseOptions()
    opts.SetLimit(5)

    for qfname in argv[2:]:
        # read in query
        qfs = oechem.oemolistream()
        if not qfs.open(qfname):
            oechem.OEThrow.Fatal("Unable to open '%s'" % qfname)

        query = oechem.OEGraphMol()
        if not oechem.OEReadMolecule(qfs, query):
            oechem.OEThrow.Fatal("Unable to read query from '%s'" % qfname)

        # strip ".<ext>" from the query name to build the output name
        ext = oechem.OEGetFileExtension(qfname)
        base = qfname[:-(len(ext) + 1)]

        # write out everything to a similarly named file
        ofs = oechem.oemolostream()
        ofname = base + "_results." + ext
        if not ofs.open(ofname):
            # report the file that failed; argv[4] need not even exist
            oechem.OEThrow.Fatal("Unable to open '%s'" % ofname)
        oechem.OEWriteMolecule(ofs, query)

        print("Searching for %s" % qfname)
        for score in dbase.GetSortedScores(query, opts):
            print("Score for mol %u(conf %u) %f shape %f color" % (score.GetMolIdx(),
                                                                   score.GetConfIdx(),
                                                                   score.GetShapeTanimoto(),
                                                                   score.GetColorTanimoto()))
            dbmol = oechem.OEMol()
            molidx = score.GetMolIdx()
            if not moldb.GetMolecule(dbmol, molidx):
                print("Unable to retrieve molecule '%u' from the database" % molidx)
                continue

            mol = oechem.OEGraphMol(dbmol.GetConf(oechem.OEHasConfIdx(score.GetConfIdx())))
            oechem.OESetSDData(mol, "ShapeTanimoto", "%.4f" % score.GetShapeTanimoto())
            oechem.OESetSDData(mol, "ColorTanimoto", "%.4f" % score.GetColorTanimoto())
            oechem.OESetSDData(mol, "TanimotoCombo", "%.4f" % score.GetTanimotoCombo())
            score.Transform(mol)

            oechem.OEWriteMolecule(ofs, mol)

        # close explicitly so the file is complete before it is announced
        ofs.close()
        print("Wrote results to %s" % ofname)

    return 0
def main(argv=[__name__]):
    """Search a shape database using a user-defined start position.

    ``argv[1]`` is the database file and ``argv[2:]`` are the query files.
    The search options use OEFastROCSOrientation_UserInertialStarts with a
    hit limit of five.  For every query, the coordinates of the atom stored
    under key 1 of ``query.GetCoords()`` are registered as the single user
    start, and a file ``<base>_user_results.<ext>`` is written that holds the
    query followed by the hits.  Each hit is tagged with the SD data
    ShapeTanimoto, ColorTanimoto and TanimotoCombo and is transformed with
    ``score.Transform`` before being written.

    Returns:
        0, including after the usage message and when no supported GPU is
        available.
    """
    if len(argv) < 3:
        oechem.OEThrow.Usage("%s <database> [<queries> ... ]" % argv[0])
        return 0

    # check system
    if not oefastrocs.OEFastROCSIsGPUReady():
        oechem.OEThrow.Info("No supported GPU available!")
        return 0

    # read in database
    dbname = argv[1]
    print("Opening database file %s ..." % dbname)

    dbase = oefastrocs.OEShapeDatabase()
    moldb = oechem.OEMolDatabase()

    if not moldb.Open(dbname):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    dots = oechem.OEThreadedDots(10000, 200, "conformers")
    if not dbase.Open(moldb, dots):
        oechem.OEThrow.Fatal("Unable to initialize OEShapeDatabase on '%s'" % dbname)

    # customize search options
    opts = oefastrocs.OEShapeDatabaseOptions()
    opts.SetInitialOrientation(oefastrocs.OEFastROCSOrientation_UserInertialStarts)
    opts.SetLimit(5)

    for qfname in argv[2:]:
        # read in query
        qfs = oechem.oemolistream()
        if not qfs.open(qfname):
            oechem.OEThrow.Fatal("Unable to open '%s'" % qfname)

        query = oechem.OEGraphMol()
        if not oechem.OEReadMolecule(qfs, query):
            oechem.OEThrow.Fatal("Unable to read query from '%s'" % qfname)

        # strip ".<ext>" from the query name to build the output name
        ext = oechem.OEGetFileExtension(qfname)
        base = qfname[:-(len(ext) + 1)]

        # write out everything to a similarly named file
        ofs = oechem.oemolostream()
        ofname = base + "_user_results." + ext
        if not ofs.open(ofname):
            # report the file that failed; argv[4] need not even exist
            oechem.OEThrow.Fatal("Unable to open '%s'" % ofname)
        oechem.OEWriteMolecule(ofs, query)

        # flatten the chosen atom's coordinates into the vector that
        # SetUserStarts takes
        startsCoords = oechem.OEFloatVector()
        atomIdx = 1
        xyz = query.GetCoords()[atomIdx]
        for x in xyz:
            startsCoords.append(x)
        if len(startsCoords) % 3 != 0:
            oechem.OEThrow.Fatal("Something went wrong whilst reading in user-starts coordinates")

        opts.SetUserStarts(oechem.OEFloatVector(startsCoords), len(startsCoords) // 3)

        opts.SetMaxOverlays(opts.GetNumInertialStarts() * opts.GetNumUserStarts())

        if opts.GetInitialOrientation() == oefastrocs.OEFastROCSOrientation_UserInertialStarts:
            numStarts = opts.GetNumUserStarts()
            print("This example will use %u starts" % numStarts)

        print("Searching for %s" % qfname)
        for score in dbase.GetSortedScores(query, opts):
            print("Score for mol %u(conf %u) %f shape %f color" % (score.GetMolIdx(),
                                                                   score.GetConfIdx(),
                                                                   score.GetShapeTanimoto(),
                                                                   score.GetColorTanimoto()))
            dbmol = oechem.OEMol()
            molidx = score.GetMolIdx()
            if not moldb.GetMolecule(dbmol, molidx):
                print("Unable to retrieve molecule '%u' from the database" % molidx)
                continue

            mol = oechem.OEGraphMol(dbmol.GetConf(oechem.OEHasConfIdx(score.GetConfIdx())))
            oechem.OESetSDData(mol, "ShapeTanimoto", "%.4f" % score.GetShapeTanimoto())
            oechem.OESetSDData(mol, "ColorTanimoto", "%.4f" % score.GetColorTanimoto())
            oechem.OESetSDData(mol, "TanimotoCombo", "%.4f" % score.GetTanimotoCombo())
            score.Transform(mol)

            oechem.OEWriteMolecule(ofs, mol)

        # close explicitly so the file is complete before it is announced
        ofs.close()
        print("Wrote results to %s" % ofname)

    return 0
def main(argv=[__name__]):
    """Search a shape database with Tversky scoring.

    ``argv[1]`` is the database file and ``argv[2:]`` are the query files.
    The hit limit is set to the number of molecules in the database.  For
    every query, the hits are written to ``<base>_results.<ext>``, each
    tagged with the SD data ShapeTversky, ColorTversky and TverskyCombo and
    transformed with ``score.Transform`` before being written.

    Returns:
        0, including after the usage message.
    """
    if len(argv) < 3:
        oechem.OEThrow.Usage("%s <database> [<queries> ... ]" % argv[0])
        # never fall through to argv[1] when the arguments are missing
        return 0

    dbname = argv[1]

    # read in database
    ifs = oechem.oemolistream()
    if not ifs.open(dbname):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    print("Opening database file %s ..." % dbname)
    timer = oechem.OEWallTimer()
    dbase = oefastrocs.OEShapeDatabase()
    moldb = oechem.OEMolDatabase()
    if not moldb.Open(ifs):
        oechem.OEThrow.Fatal("Unable to open '%s'" % dbname)

    dots = oechem.OEThreadedDots(10000, 200, "conformers")
    if not dbase.Open(moldb, dots):
        oechem.OEThrow.Fatal("Unable to initialize OEShapeDatabase on '%s'" % dbname)

    dots.Total()
    print("%s seconds to load database" % timer.Elapsed())

    opts = oefastrocs.OEShapeDatabaseOptions()
    opts.SetSimFunc(oefastrocs.OEShapeSimFuncType_Tversky)
    # ask for as many hits as the database has molecules
    numHits = moldb.NumMols()
    opts.SetLimit(numHits)

    for qfname in argv[2:]:
        # read in query
        qfs = oechem.oemolistream()
        if not qfs.open(qfname):
            # name the query file, not the database in argv[1]
            oechem.OEThrow.Fatal("Unable to open '%s'" % qfname)

        query = oechem.OEGraphMol()
        if not oechem.OEReadMolecule(qfs, query):
            oechem.OEThrow.Fatal("Unable to read query from '%s'" % qfname)

        # strip ".<ext>" from the query name to build the output name
        ext = oechem.OEGetFileExtension(qfname)
        base = qfname[:-(len(ext) + 1)]

        # write out everything to a similarly named file
        ofs = oechem.oemolostream()
        ofname = base + "_results." + ext
        if not ofs.open(ofname):
            # report the file that failed; argv[4] need not even exist
            oechem.OEThrow.Fatal("Unable to open '%s'" % ofname)

        print("Searching for %s" % qfname)
        for score in dbase.GetSortedScores(query, opts):
            dbmol = oechem.OEMol()
            molidx = score.GetMolIdx()
            if not moldb.GetMolecule(dbmol, molidx):
                print("Unable to retrieve molecule '%u' from the database" % molidx)
                continue

            mol = oechem.OEGraphMol(dbmol.GetConf(oechem.OEHasConfIdx(score.GetConfIdx())))
            oechem.OESetSDData(mol, "ShapeTversky", "%.4f" % score.GetShapeTversky())
            oechem.OESetSDData(mol, "ColorTversky", "%.4f" % score.GetColorTversky())
            oechem.OESetSDData(mol, "TverskyCombo", "%.4f" % score.GetTverskyCombo())
            score.Transform(mol)

            oechem.OEWriteMolecule(ofs, mol)

        # close explicitly so the file is complete before it is announced
        ofs.close()
        print("Wrote results to %s" % ofname)

    return 0