def similarity(self, smiles1, smiles2):
    """Compare two SMILES strings with several fingerprint similarity metrics.

    Labelled dummy atoms such as ``[*:1]`` are replaced by ``[H]`` before
    parsing, and both molecules are fingerprinted with
    ``Chem.RDKFingerprint``.

    Args:
        smiles1: SMILES string of the first molecule.
        smiles2: SMILES string of the second molecule.

    Returns:
        A dict mapping the metric name ("Tanimoto", "Dice", "Cosine",
        "Sokal", "Russel", "Kulczynski", "McConnaughey") to the similarity
        value, or ``None`` when either input is empty or cannot be parsed.
    """
    if not smiles1 or not smiles2:
        return None

    dummy_atom = r'\[\*:[0-9]*\]'
    mol1 = Chem.MolFromSmiles(re.sub(dummy_atom, "[H]", smiles1))
    mol2 = Chem.MolFromSmiles(re.sub(dummy_atom, "[H]", smiles2))
    # MolFromSmiles reports a parse failure by returning None; treat that
    # like the empty-input case instead of crashing in RDKFingerprint.
    if mol1 is None or mol2 is None:
        return None

    fp1 = Chem.RDKFingerprint(mol1)
    fp2 = Chem.RDKFingerprint(mol2)

    metrics = (
        ("Tanimoto", DataStructs.TanimotoSimilarity),
        ("Dice", DataStructs.DiceSimilarity),
        ("Cosine", DataStructs.CosineSimilarity),
        ("Sokal", DataStructs.SokalSimilarity),
        ("Russel", DataStructs.RusselSimilarity),
        ("Kulczynski", DataStructs.KulczynskiSimilarity),
        ("McConnaughey", DataStructs.McConnaugheySimilarity),
    )
    return {
        name: DataStructs.FingerprintSimilarity(fp1, fp2, metric=metric)
        for name, metric in metrics
    }
def test5Dice(self):
    """Dice similarity of IntSparseIntVects: self-similarity and a symmetric pair."""
    # A vector compared with itself must score 1.0.
    same = ds.IntSparseIntVect(5)
    same[4] = 4
    same[0] = 2
    same[3] = 1
    self.assertTrue(feq(ds.DiceSimilarity(same, same), 1.0))

    first = ds.IntSparseIntVect(5)
    for idx, val in ((0, 2), (2, 1), (3, 4), (4, 6)):
        first[idx] = val
    second = ds.IntSparseIntVect(5)
    for idx, val in ((1, 2), (2, 3), (3, 4), (4, 4)):
        second[idx] = val

    # The score must not depend on the argument order.
    expected = 18.0 / 26.
    self.assertTrue(feq(ds.DiceSimilarity(first, second), expected))
    self.assertTrue(feq(ds.DiceSimilarity(second, first), expected))
def get_fitness(genes, target):
    """Score a candidate SMILES against a target SMILES.

    Both strings are parsed with ``Chem.MolFromSmiles`` and fingerprinted
    with ``FingerprintMols.FingerprintMol``.

    Returns:
        A ``(similarity, None)`` tuple, where ``similarity`` comes from
        ``DataStructs.FingerprintSimilarity`` with its default metric.
    """
    target_mol = Chem.MolFromSmiles(target)
    genes_mol = Chem.MolFromSmiles(genes)
    target_fp = FingerprintMols.FingerprintMol(target_mol)
    genes_fp = FingerprintMols.FingerprintMol(genes_mol)
    score = DataStructs.FingerprintSimilarity(target_fp, genes_fp)
    return score, None
def eval_similarity(fp_list, dim, evaluator):
    """Return mean and standard deviation of all pairwise scores.

    Args:
        fp_list: sequence of per-molecule entries; ``entry[dim]`` is the
            fingerprint passed to ``evaluator``.
        dim: index selecting which fingerprint of each entry to compare.
        evaluator: callable taking two fingerprints and returning a score.

    Returns:
        ``(mean, std)`` over every unordered pair ``i < j``.
    """
    count = len(fp_list)
    scores = np.array([
        evaluator(fp_list[a][dim], fp_list[b][dim])
        for a in range(count - 1)
        for b in range(a + 1, count)
    ])
    return np.mean(scores), np.std(scores)


if __name__ == '__main__':
    f = sys.argv[1]
    # One fingerprint generator per column of the fingerprint table.
    fp_func_list = [
        lambda x: AllChem.GetMorganFingerprint(x, 2),
        MACCSkeys.GenMACCSKeys,
        Pairs.GetAtomPairFingerprint,
        FingerprintMols.FingerprintMol,
    ]
    # The scoring function paired with each generator above, by position.
    evaluators = [
        DataStructs.DiceSimilarity,
        DataStructs.FingerprintSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.FingerprintSimilarity,
    ]
    fp_list = get_fp_list(f, fp_func_list)
    for i, evaluator in enumerate(evaluators):
        m, s = eval_similarity(fp_list, i, evaluator)
        # Report the mean as a distance (1 - similarity) next to the spread.
        print(1 - m, s)
try: a=[rec[1][0],make_fingerprint(Chem.MolFromInchi(rec[1][9]))] fps_lincs.append(a) print(rec[1][0]) except Exception: continue #%% calculate similarity matches=[] for rec_2 in enumerate(fps_list2):#rec_q in enumerate(fps_lincs): best_similarity=0 best_names=[] for rec_1 in enumerate(fps_list1):#rec_db in enumerate(fps_dundee): similarity=DataStructs.FingerprintSimilarity( rec_2[1][1], rec_1[1][1], DataStructs.TanimotoSimilarity ) #print(qi,di) if similarity > best_similarity: best_similarity = similarity best_names =[rec_1[1][0]] elif similarity==best_similarity: best_names.append([rec_1[1][0]]) print(rec_2[0]) for m in best_names: matches.append([rec_2[1][0],m,best_similarity]) #%% matches_df=pd.DataFrame(matches)
for filename in glob.glob('models/*.pkl'):
    # Strip the 'models/' prefix and the '.pkl' suffix from the path.
    uniprot = filename[7:-4]
    querynn = []
    cur = conn.cursor()
    cur.execute("SELECT stdsmiles FROM actives WHERE UNIPROT = '" + uniprot + "';")
    # Fingerprints of the SMILES rows returned for this id.
    pool = [calcNormalFingerprints(row[0]) for row in cur]
    with open(filename, 'rb') as fid:
        firstcols.append([u_name[uniprot], uniprot])
        bnb = cPickle.load(fid)
    #for each wombat compound get nn
    # Number of neighbours to average: 10% of the model's class-1 count.
    ncount = int(round(bnb.class_count_[1] * 0.1))
    sim_array = [DataStructs.BulkTanimotoSimilarity(f, pool) for f in fp]
    # Average of the ncount largest similarities per query fingerprint.
    # NOTE(review): when ncount is 0 the slice [-0:] selects the whole array.
    sims = [np.average(np.sort(sim)[-ncount:]) for sim in sim_array]
    matrix.append(sims)
    count += 1
    #update precent finished
    percent = (float(count) / float(t_count)) * 100
    sys.stdout.write(' Performing NN search on Query Molecule: %3d%%\r' % percent)
    sys.stdout.flush()

matrix = np.concatenate((firstcols, matrix), axis=1)
headings = ['Name', 'Uniprot'] + ["C" + str(i + 1) for i in range(len(fp))]
file.write('\t'.join(headings))
def partialSimilarity(atomID):
    """Return the Tversky similarity (weights 0, 1) between ``subsFp`` and
    a fingerprint holding only the bits listed in ``aBits[atomID]``."""
    # Start from an empty 1024-bit vector and switch on this atom's bits.
    atom_fp = DataStructs.ExplicitBitVect(1024)
    atom_fp.SetBitsFromList(aBits[atomID])
    score = DataStructs.TverskySimilarity(subsFp, atom_fp, 0, 1)
    return score
# Collect (name, smiles, property, fingerprint) for every readable molecule.
for i, mol in enumerate(suppl):
    if mol:
        smi = Chem.MolToSmiles(mol, True)
        nm = mol.GetProp(nameField)
        activity = float(mol.GetProp(propField))
        fp = GetMolFingerprint(mol, maxPathLength)
        data.append((nm, smi, activity, fp))

logger.info(' got %d molecules' % len(data))
logger.info('calculating pairs')
# Keep every index pair whose Dice similarity exceeds the threshold.
pairs = []
n_mols = len(data)
for i in range(n_mols):
    fp_i = data[i][-1]
    pairs.extend(
        (i, j) for j in range(i + 1, n_mols)
        if DataStructs.DiceSimilarity(fp_i, data[j][-1]) > similarityThreshold)
    if (i + 1) % 100 == 0:
        logger.info('Done %d molecules' % (i + 1))
logger.info(' got %d reasonable pairs' % len(pairs))

logger.info('creating output file')
print >> outF, 'nameA|nameB|nameAB|smilesA|smilesB|smilesAB|actA|actB|dAct|dist|disparity'
for i, j in pairs:
    # Order the pair so the larger property value comes first; dAct is then
    # never negative.
    if data[i][2] < data[j][2]:
        i, j = j, i
    nmi, smii, propi, fpi = data[i]
    nmj, smij, propj, fpj = data[j]
    dAct = propi - propj
    dist = 1. - DataStructs.DiceSimilarity(fpi, fpj)
def test10BulkOps3(self):
    """Bulk similarity calls on a numpy object array match the pairwise calls.

    Ten random 10000-bit vectors are stored in a numpy array; every
    ``Bulk*Similarity`` result is compared element-wise with the
    corresponding single-pair function.
    """
    nbits = 10000
    bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
    for bvi in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            bv.SetBit(random.randrange(0, nbits))
        bvs[bvi] = bv

    # (bulk function, pairwise function) pairs that take no extra arguments.
    paired = (
        (DataStructs.BulkTanimotoSimilarity, DataStructs.TanimotoSimilarity),
        (DataStructs.BulkDiceSimilarity, DataStructs.DiceSimilarity),
        (DataStructs.BulkAllBitSimilarity, DataStructs.AllBitSimilarity),
        (DataStructs.BulkOnBitSimilarity, DataStructs.OnBitSimilarity),
        (DataStructs.BulkRogotGoldbergSimilarity,
         DataStructs.RogotGoldbergSimilarity),
    )
    for bulk, single in paired:
        sims = bulk(bvs[0], bvs)
        for i in range(len(bvs)):
            # assertTrue replaces the failUnless alias, which newer unittest
            # versions no longer provide.
            self.assertTrue(feq(single(bvs[0], bvs[i]), sims[i]))

    # Tversky with weights (1, 1) is also checked against Tanimoto, and with
    # weights (.5, .5) against Dice.
    tversky_cases = (
        (1, 1, DataStructs.TanimotoSimilarity),
        (.5, .5, DataStructs.DiceSimilarity),
    )
    for a, b, equivalent in tversky_cases:
        sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, a, b)
        for i in range(len(bvs)):
            sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], a, b)
            self.assertTrue(feq(sim, sims[i]))
            sim = equivalent(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))
def __compute_diversity(mol, fps):
    """Return the mean Tanimoto distance between ``mol`` and ``fps``.

    ``mol`` is fingerprinted as a 2048-bit Morgan bit vector of radius 4 and
    compared with every fingerprint in ``fps`` via
    ``BulkTanimotoSimilarity(..., returnDistance=True)``.
    """
    query_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(
        query_fp, fps, returnDistance=True)
    return np.mean(distances)
def test7MultiFPBReaderContains(self):
    """GetContainingNeighbors on a four-file MultiFPBReader, with and without threads."""
    basen = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs', 'testData')
    mfpbr = DataStructs.MultiFPBReader()
    reader_files = (
        "zinc_random200.1.patt.fpb",
        "zinc_random200.2.patt.fpb",
        "zinc_random200.3.patt.fpb",
        "zinc_random200.4.patt.fpb",
    )
    # AddReader is expected to return the reader count after each addition.
    for n_readers, fname in enumerate(reader_files, 1):
        reader = DataStructs.FPBReader(os.path.join(basen, fname))
        self.assertEqual(mfpbr.AddReader(reader), n_readers)
    mfpbr.Init()
    self.assertEqual(mfpbr.GetNumBits(), 1024)
    self.assertEqual(len(mfpbr), 4)

    fps = ("40081010824820021000500010110410003000402b20285000a4040240010030050000"
           "080001420040009000003d04086007080c03b31d920004220400074008098010206080"
           "00488001080000c64002a00080000200024c2000602410049200340820200002400010"
           "02200106090401056801080182006088101000088a0048")
    ebv = DataStructs.CreateFromFPSText(fps)
    query = DataStructs.BitVectToBinaryText(ebv)

    # Expected (first, second) entries of each returned neighbor, in order.
    expected = [(160, 0), (163, 0), (170, 0), (180, 2), (182, 3), (185, 0),
                (189, 0), (192, 3), (193, 0)]

    def check(nbrs):
        self.assertEqual(len(nbrs), 9)
        for nbr, (first, second) in zip(nbrs, expected):
            self.assertEqual(nbr[0], first)
            self.assertEqual(nbr[1], second)

    check(mfpbr.GetContainingNeighbors(query))
    check(mfpbr.GetContainingNeighbors(query, numThreads=4))
def get_prep_value(self, value):
    """Convert an ExplicitBitVect to the value handed to the db driver.

    Values of any other type are returned unchanged.
    """
    if not isinstance(value, ExplicitBitVect):
        return value
    # Serialise the bit vector and wrap the result in a memoryview.
    binary = DataStructs.BitVectToBinaryText(value)
    return six.memoryview(binary)
def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
    """Follow the property gradient in latent space under a similarity cutoff.

    The molecule is encoded, ``num_iter`` gradient steps of size ``lr`` are
    taken on ``self.propNN`` starting from the latent mean, and a binary
    search over the visited latent vectors picks one whose decoded molecule
    has Morgan (radius 2) Tanimoto similarity of at least ``sim_cutoff`` to
    the input.

    Args:
        smiles: SMILES string of the starting molecule.
        sim_cutoff: minimum Tanimoto similarity the result must keep.
        lr: step size applied to the gradient.
        num_iter: number of gradient steps to take.

    Returns:
        ``(new_smiles, sim)`` when a decoded molecule meets the cutoff,
        otherwise ``(smiles, 1.0)``.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])

    mol = Chem.MolFromSmiles(smiles)
    fp1 = AllChem.GetMorganFingerprint(mol, 2)

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.
    mean = torch.cat([tree_mean, mol_mean], dim=1)
    # NOTE(review): log_var is computed but never read below.
    log_var = torch.cat([tree_log_var, mol_log_var], dim=1)
    cur_vec = create_var(mean.data, True)

    # Gradient steps on the predicted property; remember every vector visited.
    visited = []
    for step in range(num_iter):
        prop_val = self.propNN(cur_vec).squeeze()
        grad = torch.autograd.grad(prop_val, cur_vec)[0]
        cur_vec = cur_vec.data + lr * grad.data
        cur_vec = create_var(cur_vec, True)
        visited.append(cur_vec)

    # With no steps taken there is nothing to decode; keep the input.
    if not visited:
        return smiles, 1.0

    # Binary search over the visited steps.
    l, r = 0, num_iter - 1
    while l < r - 1:
        # Floor division keeps the index an int on Python 2 and Python 3;
        # true division would make visited[mid] fail with a float index.
        mid = (l + r) // 2
        new_vec = visited[mid]
        tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
        new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
        if new_smiles is None:
            r = mid - 1
            continue

        new_mol = Chem.MolFromSmiles(new_smiles)
        fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        if sim < sim_cutoff:
            r = mid - 1
        else:
            l = mid

    tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
    new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
    if new_smiles is None:
        return smiles, 1.0

    # Re-check the chosen step: l may never have been tested inside the loop.
    new_mol = Chem.MolFromSmiles(new_smiles)
    fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
    sim = DataStructs.TanimotoSimilarity(fp1, fp2)
    if sim >= sim_cutoff:
        return new_smiles, sim
    return smiles, 1.0
def from_db_value(self, value, expression, connection, context):
    """Build a bit vector from the stored binary value; ``None`` passes through."""
    if value is not None:
        return DataStructs.CreateFromBinaryText(bytes(value))
    return value
# sort canonical SMILES by similarity
print("\n".join(f7(canonical_SMILES)), end="")
import sys
# NOTE(review): the script exits here, so nothing below this call runs.
sys.exit()

canonical_SMILES = sorted(set(canonical_SMILES))  # remove duplicates
N = len(canonical_SMILES)
fingerprints = [
    FingerprintMols.FingerprintMol(Chem.MolFromSmiles(s))
    for s in canonical_SMILES
]
# Pairwise similarity matrix; the diagonal is -1 so that a molecule is never
# picked as its own nearest neighbour.
matrice = [
    [
        -1 if i == j else DataStructs.FingerprintSimilarity(
            fingerprints[i], fingerprints[j])
        for j in range(N)
    ]
    for i in range(N)
]

# Greedy ordering: repeatedly move to the most similar entry, then set the
# visited entry's column to -1 in every row.
result = []
current = N - 1  # start with a last one, for no reason
for _ in range(N):
    result.append(canonical_SMILES[current])
    row = matrice[current]
    next_index = row.index(max(row))
    for other in matrice:
        other[current] = -1
    current = next_index
print("\n".join(result), end="")
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8, metric='Tanimoto', outMatrix=False):
    '''Clustering Structure based on Fingerprints in RDKit

    filename: Smile format file saving molecules. If set to None, use given "mols"
    mols: Input molecules. No use if set up "filename"
    cutoff: Cutoff using for Butina Clustering
    fingerprint: Fingerprint to use:
        0 or else: RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    metric: Similarity metric for the matrix output (case-insensitive):
        Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
        Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
        An unknown name falls back to Tanimoto with a printed warning.
        Only used when "outMatrix" is set; clustering uses Tanimoto.
    outMatrix: Change output to a similarity matrix

    Return:
        Default output "clusters, clusterOut":
            clusters: Clusters containing molecule number.
            clusterOut: Molecular Cluster Number in List.
        With "outMatrix": the similarity matrix as a list of row lists
            (also printed, one row per line).
    '''
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        suppl = Chem.SmilesMolSupplier(filename)
        mols = [mol for mol in suppl]
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    if fingerprint == 1:
        ## MACCS Fingerprint
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    elif fingerprint == 2:
        ## Atom Pair Fingerprint (AP)
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    elif fingerprint == 3:
        ## Topological Torsion Fingerprint (TT)
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) for mol in mols]
    elif fingerprint == 4:
        ## Morgan Fingerprint similar to ECFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    elif fingerprint == 5:
        ## Morgan Fingerprint similar to FCFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) for mol in mols]
    else:
        ## RDKit Topological Fingerprint (fingerprint == 0 or anything else)
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys are all lower-case because the lookup uses metric.lower();
        # a mixed-case key could never be matched.
        metricsAvailable = {
            'tanimoto': DataStructs.TanimotoSimilarity,
            'dice': DataStructs.DiceSimilarity,
            'cosine': DataStructs.CosineSimilarity,
            'sokal': DataStructs.SokalSimilarity,
            'russel': DataStructs.RusselSimilarity,
            'rogotgoldberg': DataStructs.RogotGoldbergSimilarity,
            'allbit': DataStructs.AllBitSimilarity,
            'kulczynski': DataStructs.KulczynskiSimilarity,
            'mcconnaughey': DataStructs.McConnaugheySimilarity,
            'asymmetric': DataStructs.AsymmetricSimilarity,
            'braunblanquet': DataStructs.BraunBlanquetSimilarity,
        }
        if metric.lower() not in metricsAvailable:
            print("The given metric is unknown!")
            metric = 'Tanimoto'
        simMetrics = metricsAvailable[metric.lower()]

        ### Calculate Fingerprint similarity Matrix
        # Build each row separately: [[0.0] * n] * n would make every row
        # the same list object.
        simdm = [[0.0] * molnums for _ in range(molnums)]
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                # Both directions are computed because some of the
                # available metrics are not symmetric.
                simdm[i][j] = DataStructs.FingerprintSimilarity(fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(fps[j], fps[i], metric=simMetrics)

        # Single-argument print() calls run on both Python 2 and Python 3.
        for row in simdm:
            print(' '.join('%3.2f' % value for value in row))
        return simdm

    # NOTE(review): the "metric" argument is not forwarded here; clustering
    # always uses 'Tanimoto', as in the original code.
    clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
    # Cluster numbers start at 1; molecules not in any cluster keep 0.
    clusterOut = [0] * len(mols)
    for clusterID, cluster in enumerate(clusters, 1):
        for idx in cluster:
            clusterOut[idx] = clusterID
    return clusters, clusterOut
def fp_distance(i, j):
    """Dice distance (1 - Dice similarity) between fingerprints i and j of ``fps``."""
    similarity = DataStructs.DiceSimilarity(fps[i], fps[j])
    return 1 - similarity


# Pick n_pick of the n_fps fingerprints, evaluating distances on demand.
indexes = picker.LazyPick(fp_distance, n_fps, n_pick, seed=seed)
# Parse both SMILES lists, drop anything RDKit could not parse, and compute
# 2048-bit Morgan fingerprints of radius 3.
true_mols = [Chem.MolFromSmiles(s) for s in true_mols]
true_mols = [mol for mol in true_mols if mol is not None]
true_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048) for mol in true_mols
]
pred_mols = [Chem.MolFromSmiles(s) for s in pred_mols]
pred_mols = [mol for mol in pred_mols if mol is not None]
pred_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 3, 2048) for mol in pred_mols
]

# Count predictions whose closest reference molecule reaches 0.4 Tanimoto.
fraction_similar = 0
for query_fp in pred_fps:
    sims = DataStructs.BulkTanimotoSimilarity(query_fp, true_fps)
    if max(sims) >= 0.4:
        fraction_similar += 1
print('novelty:', 1 - fraction_similar / len(pred_mols))

# Sum the similarity of every unordered pair of predictions (each fingerprint
# against all earlier ones), then average over the number of pairs.
similarity = 0
for i, query_fp in enumerate(pred_fps):
    sims = DataStructs.BulkTanimotoSimilarity(query_fp, pred_fps[:i])
    similarity += sum(sims)
n = len(pred_fps)
n_pairs = n * (n - 1) / 2
diversity = 1 - similarity / n_pairs
print('diversity:', diversity)
def setUp(self):
    """Open and initialise an FPBReader on the 'zim.head100.fpb' test file."""
    test_data_dir = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs',
                                 'testData')
    self.dirname = test_data_dir
    self.filename = os.path.join(test_data_dir, 'zim.head100.fpb')
    self.fpbr = DataStructs.FPBReader(self.filename)
    self.fpbr.Init()
def testBitVectorMaxMin2(self):
    """LazyBitVectorPick returns the expected picks, with and without the cache."""
    bit_strings = [
        "11110010101000000000", "00000000000010010000", "11001010000000000001",
        "00100110101000001000", "01010110000100011001", "11000110101001000011",
        "00000000001100001111", "00011110110000001101", "00000011011110100010",
        "11000010110001000000", "00000100010000010000", "10000001000010110010",
        "00010010000000010100", "00011100100110101000", "10001001100110100000",
        "10000110100110010000", "00101110000101000000", "11011101100011100000",
        "10000110000100101000", "00101000100000010001", "01000001000010000000",
        "00101101010100000110", "10001000100110110001", "00011000010100000001",
        "00101000001000100011", "00010000100010011001", "01100001000100010001",
        "10000101000001101101", "00001000011001011000", "11110000100100100000",
        "10100110000000011010", "00110100010110010010", "00000000000001010010",
        "00100000000010100001", "11110011000010001000", "10110001010100001000",
        "00001100100110011011", "00010010100100001110", "10100101100010100010",
        "01100100010100000001", "10101110011100000000", "01011000000001000001",
        "00000011100110100010", "01100001010001001001", "00001000000001001100",
        "10011001110000000100", "10110000001001100100", "00011000000001001011",
        "11001011010001100010", "10010000000001001011", "00010000100111100000",
        "00001000001110001000", "11010000010001100110", "01101001100000111000",
        "01001000001110111000", "10000000000100010010", "11001000010010000000",
        "01010010000100110001", "00010001010100100001", "01110010000000010000",
        "10001010000011000001", "00000110000000100100", "00010000010001000000",
        "11101100011010000011", "00000010100001010001", "00010000110010000101",
        "00010001001000111001", "01000010001100100110", "00110110000000100001",
        "00100010010110110010", "01000000110011001111", "00011000001000110010",
        "01111010101000110100", "00001010000010110110", "00110011000011011010",
        "00111010111010000110", "00010011101010000011", "00000001011000010000",
        "00011011101110110000", "00010001101000000001", "00010000001010011010",
        "00000010100100100010", "00000010001011000100", "11010000000001011100",
        "00001000110101000001", "00000010000000110010", "10000000010011000001",
        "11110110100100010000", "10001111000110001001", "00100110000110000100",
        "00000100100000100100", "00110000101100010100", "00001010100000100000",
        "01011000000011000111", "00010000100001010001", "10000010100000010000",
        "00001000000000110010", "00001000101011010001", "00011110000100100000",
        "11001001010001010100",
    ]
    n_to_pick = 5
    bit_vects = [DataStructs.CreateFromBitString(bits) for bits in bit_strings]
    picker = rdSimDivPickers.MaxMinPicker()

    cached = picker.LazyBitVectorPick(bit_vects, len(bit_vects), n_to_pick)
    self.assertEqual(len(cached), n_to_pick)
    self.assertEqual(list(cached), [37, 1, 43, 38, 16])

    # Picking without the cache must give the same result.
    uncached = picker.LazyBitVectorPick(bit_vects, len(bit_vects), n_to_pick,
                                        useCache=False)
    self.assertEqual(len(uncached), n_to_pick)
    self.assertEqual(list(cached), list(uncached))
def test6MultiFPBReaderTani(self):
    """GetTanimotoNeighbors on a four-file MultiFPBReader, with and without threads."""
    basen = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs', 'testData')
    mfpbr = DataStructs.MultiFPBReader()
    reader_files = (
        "zinc_random200.1.patt.fpb",
        "zinc_random200.2.patt.fpb",
        "zinc_random200.3.patt.fpb",
        "zinc_random200.4.patt.fpb",
    )
    # AddReader is expected to return the reader count after each addition.
    for n_readers, fname in enumerate(reader_files, 1):
        reader = DataStructs.FPBReader(os.path.join(basen, fname))
        self.assertEqual(mfpbr.AddReader(reader), n_readers)
    mfpbr.Init()
    self.assertEqual(mfpbr.GetNumBits(), 1024)
    self.assertEqual(len(mfpbr), 4)

    fps = ("0000000000404000100000001000040000300040222000002004000240000020000000"
           "8200010200000090000024040860070044003214820000220401054008018000226000"
           "4800800140000042000080008008020482400000200410800000300430200800400000"
           "0000080a0000800400010c800200648818100010880040")
    ebv = DataStructs.CreateFromFPSText(fps)
    query = DataStructs.BitVectToBinaryText(ebv)

    # Expected neighbors in order: similarity (checked to 4 places) followed
    # by the two integer entries of each result.
    expected = [
        (0.66412, 0, 3),
        (0.65289, 1, 2),
        (0.64341, 2, 1),
        (0.61940, 1, 0),
        (0.61905, 0, 0),
        (0.61344, 0, 1),
    ]

    def check(nbrs):
        self.assertEqual(len(nbrs), 6)
        for nbr, (sim, second, third) in zip(nbrs, expected):
            self.assertAlmostEqual(nbr[0], sim, 4)
            self.assertEqual(nbr[1], second)
            self.assertEqual(nbr[2], third)

    check(mfpbr.GetTanimotoNeighbors(query, threshold=0.6))
    # test multi-threaded (won't do anything if the RDKit isn't compiled with threads support)
    check(mfpbr.GetTanimotoNeighbors(query, threshold=0.6, numThreads=4))
def split(self, dataset, seed=None, frac_train=.8, frac_valid=.1, frac_test=.1, log_every_n=1000):
    """Split compounds into train/validation/test index lists by fingerprint.

    A full pairwise distance matrix (1 - fingerprint similarity) is built
    from ``dataset.ids``, which are parsed as SMILES. Training and validation
    sets are each seeded with the not-yet-used molecule that has the smallest
    summed distance to all molecules, then grown by repeatedly taking
    ``np.argmin`` of a distance list maintained by ``self.update_distances``.
    Everything left over becomes the test set.

    Args:
        dataset: object with ``__len__`` and an ``ids`` sequence of SMILES.
        seed: not used by this method.
        frac_train, frac_valid, frac_test: fractions that must sum to 1;
            the train and valid fractions set the respective cutoffs.
        log_every_n: not used by this method.

    Returns:
        ``(train_inds, valid_inds, test_inds)`` lists of indices.
    """
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    data_len = len(dataset)
    train_inds, valid_inds, test_inds = [], [], []

    mols, fingerprints = [], []
    for smiles in dataset.ids:
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        mols.append(mol)
        fingerprints.append(FingerprintMols.FingerprintMol(mol))

    distances = np.ones(shape=(data_len, data_len))
    for row in range(data_len):
        for col in range(data_len):
            similarity = DataStructs.FingerprintSimilarity(
                fingerprints[row], fingerprints[col])
            distances[row][col] = 1 - similarity

    train_cutoff = int(frac_train * len(dataset))
    valid_cutoff = int(frac_valid * len(dataset))

    # Pick the mol closest to everything as the first element of training
    picked = np.argmin(np.sum(distances, axis=1))
    train_inds.append(picked)
    cur_distances = [float('inf')] * data_len
    self.update_distances(picked, cur_distances, distances, train_inds)
    for _ in range(1, train_cutoff):
        picked = np.argmin(cur_distances)
        train_inds.append(picked)
        self.update_distances(picked, cur_distances, distances, train_inds)

    # Pick the closest mol from what is left
    index, best_dist = 0, float('inf')
    for candidate in range(data_len):
        if candidate not in train_inds:
            dist = np.sum(distances[candidate])
            if dist < best_dist:
                index, best_dist = candidate, dist
    valid_inds.append(index)

    leave_out_indexes = train_inds + valid_inds
    cur_distances = [float('inf')] * data_len
    self.update_distances(index, cur_distances, distances, leave_out_indexes)
    for _ in range(1, valid_cutoff):
        picked = np.argmin(cur_distances)
        valid_inds.append(picked)
        leave_out_indexes.append(picked)
        self.update_distances(picked, cur_distances, distances,
                              leave_out_indexes)

    # Test is everything else
    for candidate in range(data_len):
        if candidate not in leave_out_indexes:
            test_inds.append(candidate)
    return train_inds, valid_inds, test_inds
def compute_similarity(self, fp_ref):
    """Store ``fp_ref`` and its similarity to ``self.fp``, and return the similarity.

    The score comes from ``DataStructs.FingerprintSimilarity`` with its
    default metric.
    """
    self.fp_ref = fp_ref
    self.similarity = DataStructs.FingerprintSimilarity(self.fp_ref, self.fp)
    score = self.similarity
    return score
def distance(i, j):
    """Dice distance (1 - Dice similarity) between fingerprints i and j of ``fps``."""
    dice = DataStructs.DiceSimilarity(fps[i], fps[j])
    return 1 - dice
def test5FromBitString(self):
    """CreateFromBitString yields a vector with the string's length and on-bits."""
    s1 = '1010'
    bv = DataStructs.CreateFromBitString(s1)
    # assertEqual replaces the failUnless alias (not available in newer
    # unittest versions) and reports both values on failure.
    self.assertEqual(len(bv), 4)
    self.assertEqual(list(bv.GetOnBits()), [0, 2])
def cluster_from_mol_list(mol_list, cutoff=0.8, fp="ecfp6", activity_prop=None, summary_only=True, generate_cores=False, align_to_core=False):
    """Clusters the input Mol_List.

    Parameters:
        mol_list (tools.Mol_List): the input molecule list.
        cutoff (float): cutoff passed to the Butina clustering.
        fp (str): key of the fingerprint function in FPDICT.
        activity_prop: passed on to ``add_cores`` when cores are generated.
        summary_only (bool): only print the cluster size table, return None.
        generate_cores (bool): add cluster cores via ``add_cores``.
        align_to_core: passed on to ``add_cores`` when cores are generated.

    Returns:
        None if the fingerprint name is unknown or ``summary_only`` is set;
        otherwise a new Mol_List of the clustered molecules, largest cluster
        first, with the first molecule of each cluster marked as
        representative."""
    if fp not in FPDICT:
        available = ", ".join(sorted(FPDICT.keys()))
        print(
            "Fingerprint {} not found. Available fingerprints are: {}".format(
                fp, available))
        return
    fp_func = FPDICT[fp]

    # generate the fingerprints
    fp_list = [fp_func(mol) for mol in mol_list]
    num_of_fps = len(fp_list)

    # second generate the distance matrix (each fingerprint against all
    # earlier ones, as a flat list):
    dists = []
    for i in range(1, num_of_fps):
        sims = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        dists.extend(1 - sim for sim in sims)

    # now cluster the data:
    cluster_idx_list = Butina.ClusterData(dists, num_of_fps, cutoff,
                                          isDistData=True)

    # number of clusters per cluster size
    counter = Counter(len(cluster) for cluster in cluster_idx_list)
    print(" fingerprint:", fp)
    print(" clustersize num_of_clusters")
    print(" =========== ===============")
    for length in sorted(counter.keys(), reverse=True):
        print(" {:4d} {:3d}".format(length, counter[length]))
    print()

    if summary_only:
        return None

    cluster_list = tools.Mol_List()
    # go over each list of indices to collect the cluster's molecules
    by_size = sorted(cluster_idx_list, key=len, reverse=True)
    for cl_id, idx_list in enumerate(by_size, 1):
        cluster = get_mol_list_from_index_list(mol_list, idx_list, cl_id)
        # The first compound in a cluster is the representative
        cluster[0].SetProp("is_repr", "yes")
        cluster_list.extend(cluster)

    if generate_cores:
        cluster_list = add_cores(cluster_list, activity_prop, align_to_core)
    return cluster_list
def _tanimoto_worker(k, fps):
    """Get per-fingerprint Tanimoto distance vector.

    Returns the distances from ``fps[k]`` to every later fingerprint as a
    numpy array, paired with a constant 0.
    """
    # pylint: disable=no-member
    later_fps = fps[(k + 1):]
    similarities = DataStructs.BulkTanimotoSimilarity(fps[k], later_fps)
    distances = np.array([1. - sim for sim in similarities])
    return distances, 0
def taniFunc(i, j, bvs=vs):
    """Distance between vectors i and j of ``bvs``: 1 - FingerprintSimilarity
    (default metric)."""
    similarity = DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
    return 1 - similarity
def testHashedTopologicalTorsions(self):
    """Two SMILES spellings of pyridine give hashed torsion fingerprints with Dice similarity 1."""
    fps = [
        rdMD.GetHashedTopologicalTorsionFingerprint(Chem.MolFromSmiles(smi))
        for smi in ("c1ncccc1", "n1ccccc1")
    ]
    self.assertEqual(DataStructs.DiceSimilarity(fps[0], fps[1]), 1.0)
def func(i, j, bvs=vs):
    """Tanimoto distance between vectors i and j of ``bvs``
    (TanimotoSimilarity called with returnDistance=True)."""
    return DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)