def test__init__(self):
    from rdkit.Chem.Fingerprints import FingerprintMols
    ms = [Chem.MolFromSmiles('CCOC'), Chem.MolFromSmiles('CCO'), Chem.MolFromSmiles('COC')]
    fps = [FingerprintMols.FingerprintMol(x) for x in ms]
    self.assertAlmostEqual(FingerprintSimilarity(fps[0], fps[1]), 0.6, places=2)

    details = FingerprinterDetails()
    fpArgs = details.__dict__
    fps = []
    for i, x in enumerate(ms, 1):
        fpArgs['fpSize'] = 16 * i
        fps.append(FingerprintMols.FingerprintMol(x, **fpArgs))
    self.assertAlmostEqual(FingerprintSimilarity(fps[0], fps[1]), 0.555, places=2)
    self.assertAlmostEqual(FingerprintSimilarity(fps[1], fps[0]), 0.555, places=2)

    fpArgs['fpSize'] = 1024
    fpArgs['tgtDensity'] = 0.8
    fp = FingerprintMols.FingerprintMol(ms[0], **fpArgs)
    self.assertEqual(len(fp), 64)
    fp = DataStructs.FoldToTargetDensity(fp, density=0.1, minLength=2)
    self.assertEqual(len(fp), 4)
def calc_similarity(similarity_method, fp1, fp2):
    if similarity_method == "dice":
        return FingerprintSimilarity(fp1, fp2, metric=DataStructs.DiceSimilarity)
    else:
        return FingerprintSimilarity(fp1, fp2)
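# A minimal usage sketch for calc_similarity above, not part of the original snippet.
# The SMILES and the MACCS fingerprint choice are illustrative only; calc_similarity itself
# relies on FingerprintSimilarity and DataStructs being imported, as elsewhere in these snippets.
from rdkit import Chem, DataStructs
from rdkit.Chem import MACCSkeys

fp_a = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles('CCO'))
fp_b = MACCSkeys.GenMACCSKeys(Chem.MolFromSmiles('CCN'))
print(calc_similarity("dice", fp_a, fp_b))     # Dice similarity
print(calc_similarity("default", fp_a, fp_b))  # any other value falls through to the default (Tanimoto)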
def get_best_structural_score(self, target_fp, metric):
    scores = {}
    for rxn_id in self.fingerprints:
        stru_comp, diff_comp = self.fingerprints[rxn_id]
        score = FingerprintSimilarity(fp1=target_fp, fp2=stru_comp, metric=metric)
        scores[rxn_id] = score
    return scores
def test_fingerprint_bit_string_readwrite(self):
    """
    Test that the similarity of two molecules isn't changed when we write
    and then read the fingerprint (as a BitString) from the database.
    """
    total = 0
    for samples in self.pairs:
        mol_x = Chem.MolFromSmiles(samples[0], sanitize=False)
        mol_y = Chem.MolFromSmiles(samples[1], sanitize=False)
        sanitize_without_hypervalencies(mol_x)
        sanitize_without_hypervalencies(mol_y)
        fp_x = Chem.RDKFingerprint(mol_x)
        fp_y = Chem.RDKFingerprint(mol_y)
        similarity_before = FingerprintSimilarity(fp_x, fp_y)

        bitString_x = BitVectToText(fp_x)
        bitString_y = BitVectToText(fp_y)
        self.c.execute("INSERT INTO data VALUES (?)", (bitString_x, ))
        self.c.execute("INSERT INTO data VALUES (?)", (bitString_y, ))
        read_x = self.c.execute(
            f"SELECT * FROM data WHERE fingerprints='{bitString_x}'").fetchone()[0]
        read_y = self.c.execute(
            f"SELECT * FROM data WHERE fingerprints='{bitString_y}'").fetchone()[0]

        new_fp_x = CreateFromBitString(read_x)
        new_fp_y = CreateFromBitString(read_y)
        similarity_after = FingerprintSimilarity(new_fp_x, new_fp_y)
        if abs(similarity_after - similarity_before) > 0.01:
            print(f"similarity before = {similarity_before}, "
                  f"similarity after = {similarity_after}")
            total += 1
    self.assertTrue(total == 0, f"Some similarities were not equal, total={total}")
def similarity_between_two_smiles(smile_1: str, smile_2: str) -> float:
    try:
        mod_1 = MolFromSmiles(smile_1)
        mod_2 = MolFromSmiles(smile_2)
        ebv_1 = GenMACCSKeys(mod_1)
        ebv_2 = GenMACCSKeys(mod_2)
        similarity = FingerprintSimilarity(ebv_1, ebv_2)
    except BaseException:
        return -1
    return round(similarity, 3)
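# Illustrative calls to similarity_between_two_smiles above (not from the original source);
# assumes `from rdkit.Chem import MolFromSmiles` and `from rdkit.Chem.MACCSkeys import GenMACCSKeys`
# plus the FingerprintSimilarity import, as the function body implies.
print(similarity_between_two_smiles('CCO', 'CCOC'))          # MACCS Tanimoto, rounded to 3 decimals
print(similarity_between_two_smiles('not a smiles', 'CCO'))  # -1 on any parsing/fingerprint failure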
def get_tanimoto(list1, list2):
    tcs = []
    for fp1 in list1:
        for fp2 in list2:
            if fp1 is None or fp2 is None:
                tcs.append(None)
            else:
                tc = FingerprintSimilarity(fp1, fp2)
                tcs.append(tc)
    return tcs
def match_substrates(fragments, library):
    matches = []
    for f in fragments:
        for template in library:
            if (FingerprintSimilarity(FingerprintMol(template[0]),
                                      FingerprintMol(f)) > 0.9):
                matches.append(template[1])
    return matches
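# A hypothetical call to match_substrates above, not from the original source. Based on how the
# function indexes its arguments, `library` is assumed to be a list of (RDKit Mol, label) tuples
# and `fragments` a list of RDKit Mols; the molecules and labels here are illustrative only.
from rdkit import Chem
from rdkit.Chem.Fingerprints.FingerprintMols import FingerprintMol
from rdkit.DataStructs import FingerprintSimilarity

library = [(Chem.MolFromSmiles('c1ccccc1O'), 'phenol'),
           (Chem.MolFromSmiles('CC(=O)O'), 'acetic acid')]
fragments = [Chem.MolFromSmiles('Oc1ccccc1')]
print(match_substrates(fragments, library))  # expected to contain 'phenol'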
def get_best_structural_score(self, smarts, metric):
    scores = {}
    stru_score_high = 0
    target_fp, _ = ReactionFingerprintMatcher.make_fingerprints(smarts)
    for rxn_id in self.fingerprints:
        stru_comp, _ = self.fingerprints[rxn_id]
        score = FingerprintSimilarity(fp1=target_fp, fp2=stru_comp, metric=metric)
        scores[rxn_id] = score
        if score > stru_score_high:
            stru_score_high = score
    return scores, stru_score_high
def test_keep_similar_samples(self):
    samp = self.sampler.sample(100, filter_similar=False, verbose=False)
    scores = list()
    i, j = 0, 0
    while i < len(samp) - 1:
        j = i + 1
        mol1 = Chem.MolFromSmiles(samp[i])
        fp1 = GetMorganFingerprintAsBitVect(mol1, 4, nBits=1024)
        while j < len(samp):
            mol2 = Chem.MolFromSmiles(samp[j])
            fp2 = GetMorganFingerprintAsBitVect(mol2, 4, nBits=1024)
            score = FingerprintSimilarity(fp1, fp2)
            scores.append(score)
            j += 1
        i += 1
    self.assertFalse(all([s < 0.85 for s in scores]))
def internal_diversity(fps, sample_size=1e4, summarise=True):
    """
    Calculate the internal diversity, defined as the mean intra-set Tanimoto
    coefficient, between a set of fingerprints. For large sets, calculating the
    entire matrix is prohibitive, so a random set of molecule pairs is sampled.
    """
    tcs = []
    counter = 0
    while counter < sample_size:
        idx1 = random.randint(0, len(fps) - 1)
        idx2 = random.randint(0, len(fps) - 1)
        fp1 = fps[idx1]
        fp2 = fps[idx2]
        tcs.append(FingerprintSimilarity(fp1, fp2))
        counter += 1
    if summarise:
        return np.mean(tcs)
    else:
        return tcs
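# A minimal usage sketch for internal_diversity above (not from the original source).
# Assumes `random`, `numpy as np`, RDKit, and FingerprintSimilarity are imported as in the
# surrounding snippets; the SMILES and sample_size are illustrative only.
import random
import numpy as np
from rdkit import Chem
from rdkit.DataStructs import FingerprintSimilarity

smiles = ['CCO', 'CCN', 'CCC', 'c1ccccc1', 'CC(=O)O']
fps = [Chem.RDKFingerprint(Chem.MolFromSmiles(s)) for s in smiles]
print(internal_diversity(fps, sample_size=100))                          # mean sampled Tanimoto
print(len(internal_diversity(fps, sample_size=100, summarise=False)))    # 100 raw values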
def test_correct_filter_similar_samples(self):
    samp = self.sampler.sample(60, filter_similar=True, threshold=0.3, verbose=False)
    scores = list()
    i, j = 0, 0
    while i < len(samp) - 1:
        j = i + 1
        mol1 = Chem.MolFromSmiles(samp[i])
        fp1 = GetMorganFingerprintAsBitVect(mol1, 4, nBits=2048)
        while j < len(samp):
            mol2 = Chem.MolFromSmiles(samp[j])
            fp2 = GetMorganFingerprintAsBitVect(mol2, 4, nBits=2048)
            score = FingerprintSimilarity(fp1, fp2)
            scores.append(score)
            j += 1
        i += 1
    self.assertTrue(all([s < 0.3 for s in scores]))
def external_nn(fps1, fps2, sample_size=1e3, summarise=True):
    """
    Calculate the nearest-neighbor Tanimoto coefficient, searching one set of
    fingerprints against a second set.
    """
    counter = 0
    nns = []
    while counter < sample_size:
        idx1 = random.randint(0, len(fps1) - 1)
        fp1 = fps1[idx1]
        tcs = []
        for idx2 in range(len(fps2)):
            fp2 = fps2[idx2]
            tcs.append(FingerprintSimilarity(fp1, fp2))
        nn = np.max(tcs)
        nns.append(nn)
        counter += 1
    if summarise:
        return np.mean(nns)
    else:
        return nns
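# Illustrative call to external_nn above (not from the original source). Assumes `random`,
# `numpy as np`, RDKit, and FingerprintSimilarity are imported as in the surrounding snippets;
# the SMILES and sample_size are arbitrary.
from rdkit import Chem

gen_fps = [Chem.RDKFingerprint(Chem.MolFromSmiles(s)) for s in ['CCO', 'CCCO', 'c1ccccc1']]
ref_fps = [Chem.RDKFingerprint(Chem.MolFromSmiles(s)) for s in ['CCN', 'CCOC']]
print(external_nn(gen_fps, ref_fps, sample_size=10))  # mean nearest-neighbour Tc over 10 sampled queries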
def compare_lig_db(fp, lig_datasets, method='tanimoto', same=None, same_db=''):
    '''
    Compares pairwise similarity between molecules from two given sets.
    '''
    matched_ligands = {}
    if same:
        # compare a dataset against itself; wrap in a list so the pair unpacks correctly below
        combs = [(same_db, same_db)]
    else:
        combs = combinations(lig_datasets.keys(), 2)
    for key_i, key_j in combs:
        print('\n' + '=' * 20)
        print(key_i, '\t', key_j)
        print('=' * 20)
        d_i = lig_datasets[key_i]
        d_j = lig_datasets[key_j]
        # Create the list
        matched = []
        for k in d_i.index:
            for p in d_j.index:
                try:
                    fp_sim = FingerprintSimilarity(
                        d_i.loc[k, fp], d_j.loc[p, fp],
                        metric=DataStructs.TanimotoSimilarity)
                    if fp_sim >= 0.90:
                        # Add to the list
                        matched.append(
                            {'match_mols': (d_i.loc[k, 'mol_rdk'], d_j.loc[p, 'mol_rdk']),
                             'match_names': (k, p),
                             'tanimoto': fp_sim})
                    if fp_sim >= 0.98:
                        print(k, '\t', p)
                except AttributeError as e:
                    print(e, k, '\t', p)
                    break
        # add to the dict
        matched_ligands[f'{key_i}-{key_j}'] = matched
    return matched_ligands
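# A hypothetical input layout for compare_lig_db above, inferred from how the function indexes
# its dataframes and not taken from the original source: each dataset is a DataFrame indexed by
# ligand name, with an RDKit Mol column ('mol_rdk') and a fingerprint column (here 'fp_maccs').
# The dataset names, column names, and molecules are illustrative only; compare_lig_db itself
# assumes `combinations` (itertools), DataStructs, and FingerprintSimilarity are imported.
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MACCSkeys

def _make_df(smiles_by_name):
    mols = {name: Chem.MolFromSmiles(smi) for name, smi in smiles_by_name.items()}
    return pd.DataFrame({
        'mol_rdk': pd.Series(mols),
        'fp_maccs': pd.Series({n: MACCSkeys.GenMACCSKeys(m) for n, m in mols.items()}),
    })

lig_datasets = {'dbA': _make_df({'ethanol': 'CCO', 'phenol': 'Oc1ccccc1'}),
                'dbB': _make_df({'phenol_2': 'c1ccccc1O'})}
matches = compare_lig_db('fp_maccs', lig_datasets)  # {'dbA-dbB': [... phenol/phenol_2 match ...]}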
def fp_similarity(fp1, fp2, metric='tanimoto'):
    """
    Calculate the similarity between two fingerprints.

    :param fp1: {numpy array / RDKit fingerprint} Fingerprint 1
    :param fp2: {numpy array / RDKit fingerprint} Fingerprint 2
    :param metric: {str} which similarity metric to use, default: tanimoto;
        available for numpy fingerprints: tanimoto, cosine, euclidean
    :return: similarity value
    """
    if isinstance(fp1, cDataStructs.ExplicitBitVect):
        return FingerprintSimilarity(fp1, fp2, metric=TanimotoSimilarity)
    elif isinstance(fp1, np.ndarray):
        if metric.lower() == 'tanimoto':
            return tanimoto(fp1, fp2)
        elif metric.lower() == 'cosine':
            return cosine_dist(fp1, fp2)
        elif metric.lower() == 'euclidean':
            return euclidean_dist(fp1, fp2)
        else:
            raise NotImplementedError(
                'Only the following distance metrics are available: tanimoto, cosine, euclidean')
    else:
        raise TypeError(
            "Fingerprints must be of type numpy.ndarray or rdkit.DataStructs.cDataStructs.ExplicitBitVect")
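# A minimal usage sketch for fp_similarity above, exercising only the ExplicitBitVect branch
# (not from the original source). The numpy branch depends on tanimoto/cosine_dist/euclidean_dist
# helpers defined elsewhere in that codebase, so it is not shown here. Assumes the function's own
# imports (cDataStructs, FingerprintSimilarity, TanimotoSimilarity, numpy as np) are in scope.
from rdkit import Chem

fp_a = Chem.RDKFingerprint(Chem.MolFromSmiles('CCO'))
fp_b = Chem.RDKFingerprint(Chem.MolFromSmiles('CCOC'))
print(fp_similarity(fp_a, fp_b))  # Tanimoto on bit vectors; the `metric` argument is ignored for this type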
def internal_nn(fps, sample_size=1e3, summarise=True):
    """
    Calculate the nearest-neighbor Tanimoto coefficient within a set of fingerprints.
    """
    counter = 0
    nns = []
    while counter < sample_size:
        idx1 = random.randint(0, len(fps) - 1)
        fp1 = fps[idx1]
        tcs = []
        for idx2 in range(len(fps)):
            if idx1 != idx2:
                fp2 = fps[idx2]
                tcs.append(FingerprintSimilarity(fp1, fp2))
        nn = np.max(tcs)
        nns.append(nn)
        counter += 1
    if summarise:
        return np.mean(nns)
    else:
        return nns
plt.legend()
plt.show()

topn = result[:n, :]
topn_idx = topn[:, 0].astype(str)
chem_map = pd.read_csv("data/chem_all.csv").to_numpy()
topn_smile = [chem_map[int(idx.split('_')[1]), 0] for idx in topn_idx]

sm = np.zeros((n, n))
for i in range(n):
    for j in range(i, n):
        m1, m2 = Chem.MolFromSmiles(topn_smile[i]), Chem.MolFromSmiles(topn_smile[j])
        sm[i, j] = FingerprintSimilarity(Chem.RDKFingerprint(m1), Chem.RDKFingerprint(m2))
# symmetrise; the diagonal self-similarities are 1, so subtract the identity once
sm = sm + sm.T - np.eye(n)

from sklearn.cluster import AffinityPropagation
# Note: by default AffinityPropagation computes its own (negative squared euclidean) affinity
# from the rows of sm; pass affinity='precomputed' to cluster on the Tanimoto matrix directly.
af = AffinityPropagation().fit(sm)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print("{} clusters: ".format(n_clusters_))
print(" Center index: {}".format(cluster_centers_indices.tolist()))
print(" Labels: {}".format(labels.tolist()))

send = {'1EVZ': ['CHEM_833', 'CHEM_84524', 'CHEM_6372', 'CHEM_28096', 'CHEM_16023'],
for line in f:
    zinc = line.split()[-2]
    if zinc in exclude:
        continue
    smile = line.split()[0]
    chem = Chem.MolFromSmiles(smile)
    chems.append(chem)
    mol_weights.append(ExactMolWt(chem))
    sssrs.append(Chem.GetSSSR(chem))
    balabanjs.append(BalabanJ(chem))
    bertzcts.append(BertzCT(chem))

print_stat('Exact mol weight', mol_weights)
print_stat('SSSR', sssrs)
print_stat('BalabanJ', balabanjs)
print_stat('BertzCT', bertzcts)

fps = [Chem.RDKFingerprint(chem) for chem in chems]
tanimotos = []
for i, (fp1, fp2) in enumerate(combinations(fps, 2)):
    tanimotos.append(FingerprintSimilarity(fp1, fp2))
print_stat('Tanimoto (RDK Fingerprint)', tanimotos)

fps = [GetMorganFingerprintAsBitVect(chem, 2) for chem in chems]
tanimotos = []
for i, (fp1, fp2) in enumerate(combinations(fps, 2)):
    tanimotos.append(FingerprintSimilarity(fp1, fp2))
print_stat('Tanimoto (Morgan Fingerprint)', tanimotos)
chem = Chem.MolFromSmiles(smile)
# train2fp[name] = Chem.RDKFingerprint(chem)
train2fp[name] = GetMorganFingerprintAsBitVect(chem, 2)
train2common[name] = fields[0]

with open('data/prediction_results.txt') as f:
    for line in f:
        fields = line.rstrip().split('\t')
        name = fields[3]
        train2fp[name] = chem2fp[name]
        train2common[name] = fields[4]

seen_predict = set()
with open(sys.argv[1]) as f:
    for line in f:
        fields = line.rstrip().split('\t')
        name = fields[3]
        if name in seen_predict:
            continue
        else:
            seen_predict.add(name)
        common = fields[4]
        fp = chem2fp[name]
        max_tan, max_chem = 0, None
        for chem in train2fp:
            tan = FingerprintSimilarity(fp, train2fp[chem])
            if tan > max_tan:
                max_tan = tan
                max_chem = chem
        ofields = [name, common, max_chem, train2common[max_chem], max_tan]
        print('\t'.join([str(ofield) for ofield in ofields]))
from rdkit.DataStructs import FingerprintSimilarity

# Get contents of Mainland cabinet (any dataframe could do)
cabinet = get_mainland()
# Get rid of any molecules without a valid SMILES string
cabinet = cabinet.dropna(subset=['SMILES'])
# rdkit can't make a mol from this one, so drop that, too
cabinet = cabinet.drop(24766)
cids = list(cabinet.index)
cabinet.head()

# Compute all fingerprints
fps = cabinet['SMILES'].apply(Chem.MolFromSmiles).apply(Chem.RDKFingerprint)
# And then Tanimoto similarities
tanimoto = pd.DataFrame(index=cids, columns=cids)
tanimoto[:] = [[FingerprintSimilarity(fp1, fp2) for fp1 in fps] for fp2 in fps]
tanimoto.head()

# This could be any function whose first argument is integer indices (not CIDs!)
# into a dataframe of info about odorants (one odorant per row)
def mean_dist(indices, sim):
    """Return the mean Tanimoto similarity over all pairs"""
    pairs = combinations(indices, 2)
    return np.mean([sim.iloc[x[0], x[1]] for x in pairs])

# Prettier printing
np.set_printoptions(precision=2, suppress=True)

# Some weight you can make up; each tuple is one weight under consideration.
# Tuple item 1 is a cabinet dataframe column or a custom name
molecules = []
for file in filenames:
    molecules.append(Chem.MolFromMol2File(conv_dir + file))

#%%
from rdkit.Chem.Fingerprints import FingerprintMols
fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in molecules]

#%%
from rdkit.DataStructs import FingerprintSimilarity
import numpy as np

simmat = np.zeros(shape=(len(molecules), len(molecules)))
for i in range(len(molecules)):
    for j in range(i + 1, len(molecules)):
        simmat[i, j] = FingerprintSimilarity(fingerprints[i], fingerprints[j])

#%%
import matplotlib.pyplot as plt
plt.imshow(simmat)

#%%
hi_sim_ids0 = np.where(simmat == 1)
for mol1, mol2 in zip(hi_sim_ids0[0], hi_sim_ids0[1]):
    print(filenames[mol1])
    print(filenames[mol2])
    print('=======')

#%%
from rdkit.Chem import MACCSkeys
fingerprints2 = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]
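#%%
# A possible next cell, not in the original: reuse the same loop with the MACCS fingerprints
# computed above, this time with the Dice metric, to compare against the topological-fingerprint
# matrix. The metric choice is illustrative only.
from rdkit import DataStructs

simmat2 = np.zeros(shape=(len(molecules), len(molecules)))
for i in range(len(molecules)):
    for j in range(i + 1, len(molecules)):
        simmat2[i, j] = FingerprintSimilarity(fingerprints2[i], fingerprints2[j],
                                              metric=DataStructs.DiceSimilarity)
plt.imshow(simmat2)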