def GetDataSetInfo(self, **kwargs):
    """ Looks up column names and types for our stored database query.

    A connection is opened on self.dbName / self.tableName and the stored
    join, what and where values are forwarded to GetColumnNamesAndTypes().

    **Arguments**

      - kwargs: accepted, but not used by this method

    **Returns**

      whatever DbConnect.GetColumnNamesAndTypes() returns for the stored
      query (no data set object is constructed here)

    """
    return DbConnect(self.dbName, self.tableName).GetColumnNamesAndTypes(
        join=self.dbJoin, what=self.dbWhat, where=self.dbWhere)
def _confirm(self, tblName):
    """ Asserts that table _tblName_ matches our expected column layout.

    The column names reported by the database are compared
    case-insensitively against self.colHeads; unless RDConfig.useSqlLite is
    set, the reported column types are also compared against self.colTypes.

    **Arguments**

      - tblName: name of the table (in self.dbName) to inspect

    **Raises**

      AssertionError if the number of columns, a column head or a column
      type does not match

    """
    found = DbConnect(self.dbName, tblName).GetColumnNamesAndTypes()
    assert len(found) == len(self.colHeads), 'bad number of columns'

    foundNames = [entry[0] for entry in found]
    for pos, name in enumerate(foundNames):
        assert name.upper() == self.colHeads[pos].upper(), 'bad column head'

    # doesn't seem to be any column type info available with sqlite, so the
    # type comparison only happens for the other backends
    if not RDConfig.useSqlLite:
        foundTypes = [entry[1] for entry in found]
        for pos, typ in enumerate(foundTypes):
            assert typ == self.colTypes[pos], 'bad column type'
def RunSearch(options, queryFilename):
    """ Runs a similarity, substructure and/or property search against the
    molecule database described by _options_ and writes the results.

    The work is done in these stages:

      1. pick the fingerprint builder, similarity metric and fingerprint
         table from options.similarityType / options.similarityMetric
      2. build options.queryMol from options.smilesQuery or
         options.smartsQuery (if either is set)
      3. open the requested output streams ('-' means sys.stdout)
      4. read probe molecules from _queryFilename_ (if provided)
      5. collect the ids of molecules matching the substructure and/or
         property query (if any)
      6. if there are probes, find each probe's neighbors and write the
         neighbor table; otherwise write the names of the matching molecules
      7. optionally write the hit molecules as SD and/or SMILES

    **Arguments**

      - options: the options object; it is read throughout and two
        attributes are written: queryMol (when a SMILES/SMARTS query is
        given) and fpColName (filled with a default for some similarity
        types)

      - queryFilename: name of a file with probe molecules (format given by
        options.molFormat); a false value skips the similarity search

    **Returns**

      None

    **Notes**

      - sys.exit() is called if the SMILES/SMARTS query cannot be parsed or
        the query file cannot be opened.

      - Several SQL statements are assembled with '%(name)s' % locals(), so
        the local names idCol, idTyp, where, join, molDbName, molIdName,
        fpColName, fpTableName, propQuery and cnText must not be renamed.

      - options.propQuery is spliced into the SQL text as-is.
        NOTE(review): this assumes the property query comes from a trusted
        source - confirm against the callers.

    """
    if options.similarityType == 'AtomPairs':
        fpBuilder = FingerprintUtils.BuildAtomPairFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.pairDbName)
        fpTableName = options.pairTableName
        fpColName = options.pairColName
    elif options.similarityType == 'TopologicalTorsions':
        fpBuilder = FingerprintUtils.BuildTorsionsFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.torsionsDbName)
        fpTableName = options.torsionsTableName
        fpColName = options.torsionsColName
    elif options.similarityType == 'RDK':
        fpBuilder = FingerprintUtils.BuildRDKitFP
        simMetric = DataStructs.FingerprintSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.fpTableName
        if not options.fpColName:
            options.fpColName = 'rdkfp'
        fpColName = options.fpColName
    elif options.similarityType == 'Pharm2D':
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.pharm2DTableName
        if not options.fpColName:
            options.fpColName = 'pharm2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = BuildSigFactory(options)
    elif options.similarityType == 'Gobbi2D':
        from rdkit.Chem.Pharm2D import Gobbi_Pharm2D
        fpBuilder = FingerprintUtils.BuildPharm2DFP
        simMetric = DataStructs.TanimotoSimilarity
        dbName = os.path.join(options.dbDir, options.fpDbName)
        fpTableName = options.gobbi2DTableName
        if not options.fpColName:
            options.fpColName = 'gobbi2dfp'
        fpColName = options.fpColName
        FingerprintUtils.sigFactory = Gobbi_Pharm2D.factory
    elif options.similarityType == 'Morgan':
        fpBuilder = FingerprintUtils.BuildMorganFP
        simMetric = DataStructs.DiceSimilarity
        dbName = os.path.join(options.dbDir, options.morganFpDbName)
        fpTableName = options.morganFpTableName
        fpColName = options.morganFpColName

    # an explicitly requested metric overrides the per-fingerprint default
    extraArgs = {}
    if options.similarityMetric == 'tanimoto':
        simMetric = DataStructs.TanimotoSimilarity
    elif options.similarityMetric == 'dice':
        simMetric = DataStructs.DiceSimilarity
    elif options.similarityMetric == 'tversky':
        simMetric = DataStructs.TverskySimilarity
        extraArgs['tverskyA'] = options.tverskyA
        extraArgs['tverskyB'] = options.tverskyB

    if options.smilesQuery:
        mol = Chem.MolFromSmiles(options.smilesQuery)
        if not mol:
            logger.error('could not build query molecule from smiles "%s"' %
                         options.smilesQuery)
            sys.exit(-1)
        options.queryMol = mol
    elif options.smartsQuery:
        mol = Chem.MolFromSmarts(options.smartsQuery)
        if not mol:
            logger.error('could not build query molecule from smarts "%s"' %
                         options.smartsQuery)
            sys.exit(-1)
        options.queryMol = mol

    if options.outF == '-':
        outF = sys.stdout
    elif options.outF == '':
        outF = None
    else:
        outF = open(options.outF, 'w+')

    molsOut = False
    if options.sdfOut:
        molsOut = True
        if options.sdfOut == '-':
            sdfOut = sys.stdout
        else:
            sdfOut = open(options.sdfOut, 'w+')
    else:
        sdfOut = None
    if options.smilesOut:
        molsOut = True
        if options.smilesOut == '-':
            smilesOut = sys.stdout
        else:
            smilesOut = open(options.smilesOut, 'w+')
    else:
        smilesOut = None

    if queryFilename:
        # the file is only opened to verify that it is readable; close it
        # right away instead of leaking the handle
        try:
            with open(queryFilename, 'r'):
                pass
        except IOError:
            logger.error('could not open query file %s' % queryFilename)
            sys.exit(1)

        if options.molFormat == 'smiles':
            func = GetMolsFromSmilesFile
        elif options.molFormat == 'sdf':
            func = GetMolsFromSDFile

        if not options.silent:
            msg = 'Reading query molecules'
            if fpBuilder:
                msg += ' and generating fingerprints'
            logger.info(msg)
        probes = []
        nms = []
        for i, (nm, _smi, mol) in enumerate(
                func(queryFilename, None, options.nameProp), 1):
            nms.append(nm)
            if not mol:
                logger.error('query molecule %d could not be built' % (i))
                # keep a placeholder so that probes stays aligned with nms
                probes.append((None, None))
                continue
            if fpBuilder:
                probes.append((mol, fpBuilder(mol)))
            else:
                probes.append((mol, None))
            if not options.silent and not i % 1000:
                logger.info(" done %d" % i)
    else:
        probes = None

    conn = None
    ids = None
    molDbName = os.path.join(options.dbDir, options.molDbName)
    molIdName = options.molIdName
    mConn = DbConnect(molDbName)
    cns = [(x.lower(), y)
           for x, y in mConn.GetColumnNamesAndTypes('molecules')]
    # the first column of the molecules table is used as the id column
    idCol, idTyp = cns[0]
    if options.propQuery or options.queryMol:
        conn = DbConnect(molDbName)
        curs = conn.GetCursor()
        if options.queryMol:
            if not options.silent:
                logger.info('Doing substructure query')
            if options.propQuery:
                where = 'where %s' % options.propQuery
            else:
                where = ''
            if not options.silent:
                # nToDo is only needed for progress messages
                curs.execute('select count(*) from molecules %(where)s' %
                             locals())
                nToDo = curs.fetchone()[0]

            join = ''
            doSubstructFPs = False
            fpDbName = os.path.join(options.dbDir, options.fpDbName)
            if os.path.exists(fpDbName) and not options.negateQuery:
                curs.execute("attach database '%s' as fpdb" % (fpDbName))
                try:
                    curs.execute('select * from fpdb.%s limit 1' %
                                 options.layeredTableName)
                except Exception:
                    # the layered fingerprint table is not usable, so fall
                    # back to testing every molecule without a prescreen
                    pass
                else:
                    doSubstructFPs = True
                    join = 'join fpdb.%s using (%s)' % (
                        options.layeredTableName, idCol)
                    query = LayeredOptions.GetQueryText(options.queryMol)
                    if query:
                        if not where:
                            where = 'where'
                        else:
                            where += ' and'
                        where += ' ' + query

            cmd = ('select %(idCol)s,molpkl from molecules %(join)s %(where)s'
                   % locals())
            curs.execute(cmd)
            row = curs.fetchone()
            nDone = 0
            ids = []
            while row:
                molId, molpkl = row
                if not options.zipMols:
                    m = _molFromPkl(molpkl)
                else:
                    m = Chem.Mol(zlib.decompress(molpkl))
                matched = m.HasSubstructMatch(options.queryMol)
                if options.negateQuery:
                    matched = not matched
                if matched:
                    ids.append(molId)
                nDone += 1
                if not options.silent and not nDone % 500:
                    if not doSubstructFPs:
                        logger.info(
                            ' searched %d (of %d) molecules; %d hits so far' %
                            (nDone, nToDo, len(ids)))
                    else:
                        logger.info(
                            ' searched through %d molecules; %d hits so far' %
                            (nDone, len(ids)))
                row = curs.fetchone()
            if not options.silent and doSubstructFPs and nToDo:
                # molecules never fetched were removed by the fingerprint join
                nFiltered = nToDo - nDone
                logger.info(
                    ' Fingerprint screenout rate: %d of %d (%%%.2f)' %
                    (nFiltered, nToDo, 100. * nFiltered / nToDo))
        elif options.propQuery:
            if not options.silent:
                logger.info('Doing property query')
            # only the text before the first ';' is used
            propQuery = options.propQuery.split(';')[0]
            curs.execute(
                'select %(idCol)s from molecules where %(propQuery)s' %
                locals())
            ids = [x[0] for x in curs.fetchall()]
        if not options.silent:
            logger.info('Found %d molecules matching the query' % (len(ids)))

    t1 = time.time()
    if probes:
        if not options.silent:
            logger.info('Finding Neighbors')
        conn = DbConnect(dbName)
        cns = conn.GetColumnNames(fpTableName)
        curs = conn.GetCursor()

        if ids:
            # restrict the fingerprint pool to the hits of the earlier query
            ids = [(x, ) for x in ids]
            curs.execute(
                'create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                locals())
            curs.executemany('insert into _tmpTbl values (?)', ids)
            join = 'join _tmpTbl using (%(idCol)s)' % locals()
        else:
            join = ''

        if cns[0].lower() != idCol.lower():
            # backwards compatibility to the days when mol tables had a guid
            # and the fps tables did not:
            curs.execute("attach database '%(molDbName)s' as mols" % locals())
            curs.execute(
                ' select %(idCol)s,%(fpColName)s from %(fpTableName)s join'
                ' (select %(idCol)s,%(molIdName)s from mols.molecules %(join)s)'
                ' using (%(molIdName)s) ' % (locals()))
        else:
            curs.execute(
                'select %(idCol)s,%(fpColName)s from %(fpTableName)s %(join)s'
                % locals())

        def poolFromCurs(curs, similarityMethod):
            # lazily yields (id, fingerprint) pairs from the open cursor
            row = curs.fetchone()
            while row:
                fpId, pkl = row
                yield (fpId, DepickleFP(pkl, similarityMethod))
                row = curs.fetchone()

        topNLists = GetNeighborLists(
            probes, options.topN,
            poolFromCurs(curs, options.similarityType), simMetric=simMetric,
            simThresh=options.simThresh, **extraArgs)
        uniqIds = set()
        nbrLists = {}
        for i, nm in enumerate(nms):
            topNLists[i].reverse()
            scores = topNLists[i].GetPts()
            nbrNames = topNLists[i].GetExtras()
            nbrs = []
            for j, nbrGuid in enumerate(nbrNames):
                # a None entry ends this probe's neighbor list
                if nbrGuid is None:
                    break
                uniqIds.add(nbrGuid)
                nbrs.append((nbrGuid, scores[j]))
            nbrLists[(i, nm)] = nbrs
        t2 = time.time()
        if not options.silent:
            logger.info('The search took %.1f seconds' % (t2 - t1))

        if not options.silent:
            logger.info('Creating output')

        curs = mConn.GetCursor()
        ids = [(x, ) for x in uniqIds]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                     locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())
        nmDict = {guid: str(molId) for guid, molId in curs.fetchall()}

        ks = sorted(nbrLists)
        if not options.transpose:
            # one line per probe: name, then (neighbor, similarity) pairs
            for i, nm in ks:
                nbrs = nbrLists[(i, nm)]
                nbrTxt = options.outputDelim.join([nm] + [
                    '%s%s%.3f' % (nmDict[nbrId], options.outputDelim, score)
                    for nbrId, score in nbrs
                ])
                if outF:
                    print(nbrTxt, file=outF)
        else:
            # one column pair per probe, one row per neighbor rank
            labels = [
                '%s%sSimilarity' % (x[1], options.outputDelim) for x in ks
            ]
            if outF:
                print(options.outputDelim.join(labels), file=outF)
            for i in range(options.topN):
                outL = []
                for idx, nm in ks:
                    nbrs = nbrLists[(idx, nm)]
                    if i < len(nbrs):
                        nbrId, score = nbrs[i]
                        outL.append(nmDict[nbrId])
                        outL.append('%.3f' % score)
                    else:
                        # this probe has fewer than topN neighbors: emit
                        # empty cells so that the columns stay aligned
                        outL.extend(('', ''))
                if outF:
                    print(options.outputDelim.join(outL), file=outF)
    else:
        if not options.silent:
            logger.info('Creating output')
        curs = mConn.GetCursor()
        # ids is still None if no query was run; treat that as "no hits"
        ids = [(x, ) for x in set(ids or ())]
        curs.execute('create temporary table _tmpTbl (%(idCol)s %(idTyp)s)' %
                     locals())
        curs.executemany('insert into _tmpTbl values (?)', ids)
        curs.execute(
            'select %(idCol)s,%(molIdName)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())
        nmDict = {guid: str(molId) for guid, molId in curs.fetchall()}
        if outF:
            print('\n'.join(nmDict.values()), file=outF)

    if molsOut and ids:
        # fetch every column, with molpkl moved to the end so that the
        # columns between the first and the last are the properties
        cns = [x.lower() for x in mConn.GetColumnNames('molecules')]
        if cns[-1] != 'molpkl':
            cns.remove('molpkl')
            cns.append('molpkl')

        curs = mConn.GetCursor()
        # _tmpTbl was created on mConn by whichever output branch ran above
        cnText = ','.join(cns)
        curs.execute(
            'select %(cnText)s from molecules join _tmpTbl using (%(idCol)s)'
            % locals())

        row = curs.fetchone()
        while row:
            row = list(row)
            # NOTE(review): unlike the substructure search, this does not
            # look at options.zipMols - confirm that _molFromPkl copes with
            # compressed pickles
            m = _molFromPkl(row[-1])
            guid = row[0]
            nm = nmDict[guid]
            if sdfOut:
                m.SetProp('_Name', nm)
                print(Chem.MolToMolBlock(m), file=sdfOut)
                for i in range(1, len(cns) - 1):
                    pn = cns[i]
                    pv = str(row[i])
                    print('> <%s>\n%s\n' % (pn, pv), file=sdfOut)
                print('$$$$', file=sdfOut)
            if smilesOut:
                smi = Chem.MolToSmiles(m, options.chiralSmiles)
                print('%s %s' % (smi, str(row[1])), file=smilesOut)
            row = curs.fetchone()

    # flush and release the files opened above (never sys.stdout)
    for stream in (outF, sdfOut, smilesOut):
        if stream is not None and stream is not sys.stdout:
            stream.close()

    if not options.silent:
        logger.info('Done!')