def make_matrix(file1, file2):
    """Build a symmetric Tanimoto-similarity matrix for a compound set plus
    one query SMILES.

    Args:
        file1: DataFrame-like with ``canonical_smiles`` and ``name`` columns.
        file2: query SMILES string; appended as an extra entry "My_query".

    Returns:
        pandas.DataFrame: square similarity matrix, rows/columns labelled by
        compound names (plus "My_query").
    """
    res_df = pd.DataFrame(file1)
    smiles_list = list(res_df.canonical_smiles)
    smiles_list.append(file2)
    mols = [Chem.MolFromSmiles(x) for x in smiles_list]
    ids = list(res_df.name)
    ids.append("My_query")
    fps = [FingerprintMols.FingerprintMol(m) for m in mols]
    # One bulk comparison per fingerprint against the full list yields the
    # whole (symmetric) similarity matrix with 1.0 on the diagonal.
    # The original also computed a distance matrix (``dists``/``df_dist``)
    # that was never used or returned — removed as dead code.
    simil = [DataStructs.BulkTanimotoSimilarity(fp, fps) for fp in fps]
    df_simil = pd.DataFrame(np.array(simil), index=ids, columns=ids)
    return df_simil
def do_sim(i1, i2, intra=False):
    """Score each (index, fingerprint) entry of ``i1`` against the pool ``i2``.

    With ``intra=True`` the entry's own position is cut out of the pool so a
    fingerprint is never compared to itself.  When ``options.topn`` is 1 the
    single best similarity is kept; otherwise the mean of the top-n.
    """
    scores = []
    for entry in i1:
        idx, fp = entry[0], entry[1]
        pool = (i2[:idx] + i2[idx + 1:]) if intra else i2
        sims = DataStructs.BulkTanimotoSimilarity(fp, pool)
        if options.topn == 1:
            scores.append(max(sims))
        else:
            scores.append(np.average(sorted(sims, reverse=True)[:options.topn]))
    return scores
def dmat_sim(smiles_target, ntopick=10):
    """Select the most dissimilar compounds from a given set.

    Adapted from:
    http://rdkit.blogspot.com/2014/08/optimizing-diversity-picking-in-rdkit.html

    Args:
        smiles_target: DataFrame of compound-target activity pairs; compounds
            are SMILES strings in a column named "smiles".
        ntopick: how many dissimilar compounds to pick.

    Returns:
        DataFrame: the rows of ``smiles_target`` chosen by MaxMin diversity
        picking on Tanimoto distance.
    """
    smiles_target.reset_index(drop=True, inplace=True)
    mols = [MolFromSmiles(smi) for smi in smiles_target['smiles']]
    fps = [rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 2) for m in mols]
    # Lower-triangle pairwise Tanimoto distances, flattened.
    pair_dists = []
    for idx in range(1, len(fps)):
        pair_dists.extend(
            DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx],
                                               returnDistance=True))
    picker = SimDivFilters.MaxMinPicker()
    picked = picker.Pick(np.array(pair_dists), len(fps), ntopick)
    return smiles_target.iloc[list(picked)]
def cluster(smile_keys, fp_type, cutoff=0.15):
    """Butina-cluster a list of SMILES on Tanimoto distance.

    Args:
        smile_keys: list of SMILES strings.
        fp_type: fingerprint type, forwarded to ``fingerprint_smile``.
        cutoff: clustering distance threshold. NOTE: this is a Tanimoto
            *distance* (1 - similarity), not a Euclidean distance — the
            original comment was wrong about that.

    Returns:
        tuple: (clusters, matrix_df) where clusters comes from
        ``form_cluster_with_algorithm_results`` and matrix_df is the
        pairwise-distance export table.
    """
    nfps = len(smile_keys)
    # Fingerprint every SMILES once up front.
    data = [fingerprint_smile(s, fp_type) for s in smile_keys]
    dists = []
    combinations = []
    # Lower-triangle pairwise distances, in the flat order Butina expects;
    # combinations records which pair each distance belongs to.
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(data[i], data[:i])
        dists.extend([1 - x for x in sims])
        combinations.extend([(smile_keys[j], smile_keys[i]) for j in range(i)])
    # Prepare export data for each compared pair.
    matrix_df = create_similarity_export_matrix(combinations, dists)
    # Perform the clustering.
    result = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
    clusters = form_cluster_with_algorithm_results(smile_keys, result)
    return clusters, matrix_df
def select_best(self, scale=0.3, g=None):
    '''
    Selects the best smiles from a generation (self.retain of them).
    Adjusted score boosts small and diverse molecules.
    Inputs:
    * scale: exponential weight for the adjusted coefficients.
    * g: int, optional. Generation to select best molecs from; defaults to
      the current generation counter.
    '''
    if g is None:
        g = self.gen_counter
    # Boolean mask over index_table rows belonging to generation g.
    indexs = self.index_table["gen_ev"] == g
    candidates = self.index_table[indexs]
    # prepare features
    mols = [Chem.MolFromSmiles(x) for x in candidates["smiles"].values]
    fingerprints = [Chem.RDKFingerprint(mol) for mol in mols]
    # calculate adjusted coefficients
    # Weight term favours lighter molecules (MolWt well below 900).
    weight_coeff = [(900 / Descriptors.MolWt(mol))**scale for mol in mols]
    similarity_coeff = []
    for i in range(len(mols)):
        # Highest Tanimoto similarity to any *other* candidate; the inverse
        # rewards molecules that are dissimilar from the rest of the batch.
        max_sim = np.max(
            DataStructs.BulkTanimotoSimilarity(
                fingerprints[i],
                [fingerprints[x] for x in range(len(mols)) if x != i]))
        similarity_coeff.append((1 / max_sim)**scale)
    adjusted_coeff = candidates["best"].values *\
        np.array(weight_coeff) * np.array(similarity_coeff)
    # add overall score back into the master table
    self.index_table.loc[indexs, "w_adj"] = weight_coeff
    self.index_table.loc[indexs, "s_adj"] = similarity_coeff
    self.index_table.loc[indexs, "adj"] = adjusted_coeff
    # select best values + add to best historic
    # NOTE(review): sort_values("adj") sorts ascending, so [:self.retain]
    # keeps the *lowest* adjusted scores — confirm that lower "adj" means
    # better here; otherwise ascending=False looks intended.
    gen_best = self.index_table[indexs].sort_values("adj")["smiles"].values
    self.historic_best += list(gen_best[:self.retain])
def cluster_fingerprints(fps, cutoff=0.2):
    """Butina clustering of fingerprint bit vectors.

    From the RDKit cookbook: http://rdkit.org/docs_temp/Cookbook.html.

    Args:
        fps (list of rdkit.ExplicitBitVect): fingerprint bit vectors.
        cutoff (float): distance cutoff used to seed clusters.

    Returns:
        tuple of tuple: indices of fingerprints assigned to each cluster.
    """
    nfps = len(fps)
    # Flat lower-triangle Tanimoto distance list — the input ClusterData
    # expects when isDistData=True.
    dists = []
    for i in range(1, nfps):
        for sim in DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i]):
            dists.append(1 - sim)
    return Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
def tanimoto_candidates(target, steroidlist):
    '''Rank a list of compounds by Tanimoto similarity to the first one.

    NOTE(review): ``target`` is never used — the reference fingerprint is the
    *first* entry of ``steroidlist``. Confirm whether comparing against
    ``target`` was intended.

    Args:
        target: unused (kept for interface compatibility).
        steroidlist: list of SMILES strings.

    Returns:
        list of (similarity, Mol) pairs with similarity > 0.40, sorted in
        descending order of similarity.
    '''
    steroidmols = [Chem.MolFromSmiles(i) for i in steroidlist]
    for m in steroidmols:
        AllChem.Compute2DCoords(m)
    steroidlist_fps = [
        AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in steroidmols
    ]
    # Compare every compound against the first one. The list includes the
    # reference itself, which scores 1.0 and sorts to the front.
    sims = DataStructs.BulkTanimotoSimilarity(steroidlist_fps[0],
                                              steroidlist_fps)
    nbrs = sorted(zip(sims, steroidmols), reverse=True)
    # The original also collected the top-20 molecules into
    # ``negative_structures``/``negative_smiles`` (mislabelled "bottom 10%")
    # but never used or returned them — removed as dead code.
    nbrs_filtered = [pair for pair in nbrs if pair[0] > .40]
    return nbrs_filtered
def cluster_ligands(ligands, cutoff=0.2):
    """Butina-cluster CCDC ligands and return one representative per cluster.

    Args:
        ligands: iterable of CCDC ligand objects.
        cutoff: Tanimoto-distance cutoff for the Butina algorithm.

    Returns:
        list: the first (centroid) ligand of each cluster, converted back to
        CCDC form. Ligands that fail either conversion are silently skipped
        (best-effort behaviour preserved from the original).
    """
    rdkit_ligands = []
    for lig in ligands:
        try:
            rdkit_ligands.append(ccdc_to_rdkit(lig))
        except Exception:
            # Narrowed from a bare ``except:`` — still best-effort, but no
            # longer swallows KeyboardInterrupt/SystemExit.
            pass
    # from RDKit Cookbook
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(lig, 2, 1024)
        for lig in rdkit_ligands
    ]
    # First generate the flat lower-triangle distance matrix:
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend([1 - x for x in sims])
    # Now cluster the data:
    clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
    all_ligands = []
    for cluster in clusters:
        try:
            # cluster[0] is the cluster centroid.
            all_ligands.append(rdkit_to_ccdc(rdkit_ligands[cluster[0]]))
        except Exception:
            # Best-effort back-conversion; skip failures.
            pass
    return all_ligands
def check_mol(option, maps=None, out_put=None, target_id=None, extra=None):
    """Function to check whether an input is either a valid smiles or a valid
    protein code.
    Takes a string and a Target id; ``maps``, ``out_put`` and ``extra`` are
    unused here (presumably part of a shared view signature — verify against
    callers).
    Returns an answer to be used by jquery."""
    # First try to interpret the input as (part of) a protein code.
    my_mols = Molecule.objects.filter(prot_id__code__icontains=option)
    target = Target.objects.get(pk=target_id)
    if len(my_mols) == 0:
        # Not a protein code — try to parse it as a SMILES string.
        tmpmol = Chem.MolFromSmiles(str(option))
        if tmpmol is None:
            return "None molecule"
        # Now do a similarity search on this against all the molecules
        # of the target (excluding the target's own structures and dummies).
        cmps = [
            Chem.MolFromSmiles(str(x)) for x in Molecule.objects.filter(
                prot_id__target_id=target_id).exclude(
                    prot_id__code__startswith=target.title).exclude(
                        cmpd_id__smiles="DUMMY").values_list("cmpd_id__smiles",
                                                             flat=True)
        ]
        fps = [
            AllChem.GetMorganFingerprintAsBitVect(x, 2, nBits=1024)
            for x in cmps
        ]
        sims = DataStructs.BulkTanimotoSimilarity(
            AllChem.GetMorganFingerprintAsBitVect(tmpmol, 2, nBits=1024), fps)
        # Index of the most similar registered compound.
        ind = max(enumerate(sims), key=lambda x: x[1])[0]
        mycmp = cmps[ind]
        my_mols = Molecule.objects.filter(
            cmpd_id__smiles=Chem.MolToSmiles(mycmp, isomericSmiles=True))
    # Now return the appropriate PDBcode
    return my_mols[0].prot_id.code
def cluster_chemicals(
    *,
    rebuild: bool = False,
    chemicals_dict,
):
    """Cluster chemicals based on their similarities.

    Args:
        rebuild: recompute even when a cached result file exists.
        chemicals_dict: mapping of PubchemID -> RDKit fingerprint.

    Returns:
        pandas.DataFrame with columns 'PubchemID' and 1-based 'Cluster'.
        The result is also written to DEFAULT_CLUSTERED_CHEMICALS (TSV).
    """
    if not rebuild and os.path.exists(DEFAULT_CLUSTERED_CHEMICALS):
        return pd.read_csv(DEFAULT_CLUSTERED_CHEMICALS, sep="\t",
                           index_col=False, dtype={'PubchemID': str})
    drugs, fps = zip(*chemicals_dict.items())
    nfps = len(chemicals_dict)
    # Flat lower-triangle Tanimoto distance list, as Butina expects.
    dists = []
    for i in tqdm(range(1, nfps), desc='Calculating distance for clustering'):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend([1 - x for x in sims])
    cs = Butina.ClusterData(dists, nfps, 0.3, isDistData=True)
    # Butina.ClusterData returns 0-based member indices. Bug fix: the old
    # ``drugs[drug - 1]`` was off by one and wrapped index 0 to the last drug.
    rows = [(drugs[member], cluster_id)
            for cluster_id, cluster in enumerate(cs, start=1)
            for member in cluster]
    df = pd.DataFrame(rows, columns=['PubchemID', 'Cluster'])
    # Preserve the original 1-based row index (not written to disk anyway).
    df.index = range(1, len(df) + 1)
    df.to_csv(DEFAULT_CLUSTERED_CHEMICALS, sep='\t', index=False)
    return df
def compare_tanimoto_fingerprints_pairwise(smiles, fingerprints):
    """
    Arguments:
        smiles (List): List of smiles you would like to compare
        fingerprints (List): List of fingerprint RDKit objects for each
            smiles (should directly correlate)

    Returns:
        similarity_dataframe (Pandas Dataframe Object): a dataframe with
        columns 'query', 'target', 'similarity' containing every unordered
        pair once, sorted by descending similarity.
    """
    query, target, similarity = [], [], []
    for index, fingerprint in enumerate(fingerprints):
        # Only compare against later fingerprints (upper triangle) so each
        # pair appears once.
        similarity_group = DataStructs.BulkTanimotoSimilarity(
            fingerprint, fingerprints[index + 1:])
        for offset, sim in enumerate(similarity_group):
            # Bug fix: the original referenced an undefined name
            # ``combinations`` (NameError) — the labels come from ``smiles``.
            query.append(smiles[index])
            target.append(smiles[index + 1 + offset])
            similarity.append(sim)
    # build the dataframe and sort it
    similarity_data = {'query': query, 'target': target,
                       'similarity': similarity}
    similarity_dataframe = pd.DataFrame(data=similarity_data).sort_values(
        'similarity', ascending=False)
    return similarity_dataframe
def get_qnu(sgs_tuples, ref_path):
    """Fraction of generated tuples that are predicted active, unique, and
    novel (max Tanimoto similarity to any reference active < 0.4).

    Args:
        sgs_tuples: list of (ra, mol, x, y, qed, sa) tuples.
        ref_path: CSV of reference actives; first column is SMILES, first
            line is a header.

    Returns:
        float: len(novel_tuples) / len(sgs_tuples).
    """
    # Keep the filtered tuples alongside their molecules so indices match.
    pred_tuples = []
    pred_actives = []
    for tup in sgs_tuples:
        ra, mol, x, y, qed, sa = tup
        if topk_func((x, y, qed, sa)):
            pred_actives.append(mol)
            pred_tuples.append(tup)
    with open(ref_path) as f:
        next(f)  # skip header
        true_actives = set(
            [get_canonical_smiles(line.split(',')[0]) for line in f])
    print('number of active reference', len(true_actives))
    all_set = set()
    novel_tuples = []
    true_fps = to_fingerprints(true_actives)
    pred_fps = to_fingerprints(pred_actives)
    for i in range(len(pred_actives)):
        sims = DataStructs.BulkTanimotoSimilarity(pred_fps[i], true_fps)
        canon_smiles = get_canonical_smiles(pred_actives[i])
        if canon_smiles not in all_set and max(sims) < 0.4:
            all_set.add(canon_smiles)
            # Bug fix: ``i`` runs over the *filtered* actives, so the
            # matching tuple is pred_tuples[i]; the original indexed the
            # unfiltered sgs_tuples[i] and collected the wrong tuples.
            novel_tuples.append(pred_tuples[i])
    print('QNU {} -> {}'.format(len(sgs_tuples), len(novel_tuples)))
    return len(novel_tuples) / len(sgs_tuples)
def distance_matrix(self):
    """Populate ``self.dist`` with the flat lower-triangle Tanimoto distance
    list of ``self.fplist`` (the layout Butina.ClusterData expects)."""
    self.dist = []
    total = len(self.fplist)
    for idx in range(1, total):
        similarities = DataStructs.BulkTanimotoSimilarity(
            self.fplist[idx], self.fplist[:idx])
        self.dist.extend(1 - s for s in similarities)
def compute_similarity(fp_pred, fp_train, mols_pred, mols_train):
    """For each predicted fingerprint, find its most similar training
    fingerprint by Tanimoto similarity.

    Returns:
        list of (predicted mol, nearest training mol, similarity) triples.
    """
    matches = []
    for i, query_fp in enumerate(tqdm(fp_pred)):
        sims = DataStructs.BulkTanimotoSimilarity(query_fp, fp_train)
        best = int(np.argmax(sims))
        matches.append((mols_pred[i], mols_train[best], sims[best]))
    return matches
def dimension(fnames, fp='ECFP', alg='PCA', maximum=int(1e5), ref='GPCR'):
    """Project molecules from several result files into 2-D for plotting.

    Args:
        fnames: list of TSV paths; each needs 'Smiles', 'VALID' and 'DESIRE'
            columns.
        fp: feature type — 'ECFP', 'similarity' (Tanimoto to the reference
            set) or anything else for physchem descriptors.
        alg: 'PCA' or 't-SNE' for the 2-D projection.
        maximum: per-file row cap (random sample above it), or None.
        ref: substring marking the reference file (kept unfiltered).

    Returns:
        (df, ratio) for PCA — ratio is the explained-variance of the two
        components — or just df otherwise.
    """
    frames = []
    for i, fname in enumerate(fnames):
        sub = pd.read_table(fname).dropna(subset=['Smiles'])
        sub = sub[sub.VALID == True]
        if maximum is not None and len(sub) > maximum:
            sub = sub.sample(maximum)
        # Non-reference files are restricted to desirable molecules.
        if ref not in fname:
            sub = sub[sub.DESIRE == True]
        sub = sub.drop_duplicates(subset='Smiles')
        sub['LABEL'] = i
        frames.append(sub)
    # Bug fix: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat is the supported equivalent (indices preserved as before).
    df = pd.concat(frames) if frames else pd.DataFrame()
    if fp == 'similarity':
        # Features = Tanimoto similarity of every molecule to the desirable
        # reference molecules (label 0).
        ref = df[(df.LABEL == 0) & (df.DESIRE == True)]
        refs = Predictor.calc_ecfp(ref.Smiles)
        fps = Predictor.calc_ecfp(df.Smiles)
        from rdkit.Chem import DataStructs
        fps = np.array(
            [DataStructs.BulkTanimotoSimilarity(fp, refs) for fp in fps])
    else:
        fp_alg = Predictor.calc_ecfp if fp == 'ECFP' else Predictor.calc_physchem
        fps = fp_alg(df.Smiles)
        fps = Scaler().fit_transform(fps)
    pca = PCA(n_components=2) if alg == 'PCA' else TSNE(n_components=2)
    xy = pca.fit_transform(fps)
    df['X'], df['Y'] = xy[:, 0], xy[:, 1]
    if alg == 'PCA':
        ratio = pca.explained_variance_ratio_[:2]
        return df, ratio
    else:
        return df
def _compute_diversity(mol, fps):
    """Mean Tanimoto distance between ``mol`` (Morgan fingerprint, radius 4,
    2048 bits) and each fingerprint in ``fps``."""
    query_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol, 4, nBits=2048)
    distances = DataStructs.BulkTanimotoSimilarity(
        query_fp, fps, returnDistance=True)
    return np.mean(distances)
def findCluster(self, smiles):
    """Assign a SMILES to an existing fingerprint cluster, or flag it as a
    new cluster seed.

    Returns:
        (cluster_key, fingerprint, is_new): the matched cluster SMILES (or
        the input itself), its fingerprint, and whether a new cluster is
        needed. ("", "", False) when the SMILES cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return "", "", False
    # Folded bit-vector when a size is configured; unfolded otherwise.
    if self.bits > 0:
        fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, self.radius, nBits=self.bits, useFeatures=self.useFeatures)
    else:
        fp = AllChem.GetMorganFingerprint(mol, self.radius,
                                          useFeatures=self.useFeatures)
    # Exact key match: already a known cluster.
    if smiles in self.getFingerprints():
        return smiles, fp, False
    candidate_fps = list(self.getFingerprints().values())
    sims = DataStructs.BulkTanimotoSimilarity(fp, candidate_fps)
    if not sims:
        return smiles, fp, True
    nearest = np.argmax(sims)
    if sims[nearest] >= self.minsimilarity:
        return list(self.getFingerprints().keys())[nearest], fp, False
    return smiles, fp, True
def findCluster(self, smiles):
    """Assign a SMILES to an existing Murcko-scaffold cluster, or flag it as
    a new cluster seed.

    Returns:
        (cluster_smiles, fingerprint, is_new): the matched cluster scaffold
        SMILES (or this molecule's own scaffold), its atom-pair fingerprint,
        and whether a new cluster is needed. ("", "", False) when the
        molecule or its scaffold cannot be derived.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return "", "", False
    try:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    except Exception:
        # Narrowed from a bare ``except:`` — still best-effort, but no
        # longer swallows KeyboardInterrupt/SystemExit.
        return "", "", False
    if not scaffold:
        return "", "", False
    cluster = Chem.MolToSmiles(scaffold, isomericSmiles=False)
    fp = Pairs.GetAtomPairFingerprint(scaffold)  # Change to Tanimoto?
    # Exact scaffold match: already a known cluster.
    if cluster in self.getFingerprints():
        return cluster, fp, False
    fps = list(self.getFingerprints().values())
    sims = DataStructs.BulkTanimotoSimilarity(fp, fps)
    if len(sims) == 0:
        return cluster, fp, True
    closest = np.argmax(sims)
    if sims[closest] >= self.minsimilarity:
        return list(self.getFingerprints().keys())[closest], fp, False
    else:
        return cluster, fp, True
def doSimSearch(model_name):
    """Similarity-search the module-level ``querymatrix`` fingerprints
    against the known actives stored for a model.

    Args:
        model_name: path (or name) of the model file; its basename stem
            selects ``actives/<stem>.smi.zip`` next to this module.

    Returns:
        list of [best_similarity, model_id, *active_record, query_smiles]
        per query fingerprint, or None when the actives archive is missing.
        Uses module-level globals ``querymatrix`` and ``smiles``.
    """
    # os.path.basename handles both '/' and '\\' — replaces the manual
    # os.name separator switch. Stem is everything before the first dot,
    # matching the original split('.')[0] behaviour.
    mod = os.path.basename(model_name).split('.')[0]
    actives_zip = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'actives', mod + '.smi.zip')
    try:
        with zipfile.ZipFile(actives_zip, 'r') as zfile:
            # NOTE(review): under Python 3, zfile.open(...).read() returns
            # bytes, so split('\t') would need a decode first — this code
            # looks Python-2 era; confirm and add .decode() if needed.
            comps = [
                i.split('\t')
                for i in zfile.open(mod + '.smi', 'r').read().splitlines()
            ]
    except IOError:
        return
    comps2 = []
    afp = []
    for comp in comps:
        try:
            afp.append(calcFingerprints(comp[1]))
            comps2.append(comp)
        except Exception:
            # Best-effort: skip actives whose SMILES cannot be fingerprinted
            # (narrowed from a bare ``except:``).
            pass
    ret = []
    for i, fp in enumerate(querymatrix):
        sims = DataStructs.BulkTanimotoSimilarity(fp, afp)
        idx = sims.index(max(sims))
        ret.append([sims[idx], mod] + comps2[idx] + [smiles[i]])
    return ret
def test6BulkTversky(self):
    """Bulk Tversky similarity: agrees with element-wise Tversky, reduces to
    Dice at alpha=beta=0.5 and to Tanimoto at alpha=beta=1.0."""
    size = 10
    n_to_set = 5
    n_vects = 6
    import random
    vects = []
    for _ in range(n_vects):
        v = ds.IntSparseIntVect(size)
        for _ in range(n_to_set):
            v[random.randint(0, size - 1)] = random.randint(1, 10)
        vects.append(v)
    # alpha = beta = 0.5 is the Dice coefficient.
    baseDs = [ds.TverskySimilarity(vects[0], v, .5, .5) for v in vects[1:]]
    bulkDs = ds.BulkTverskySimilarity(vects[0], vects[1:], 0.5, 0.5)
    diceDs = [ds.DiceSimilarity(vects[0], v) for v in vects[1:]]
    for base, bulk, dice in zip(baseDs, bulkDs, diceDs):
        self.assertTrue(feq(base, bulk))
        self.assertTrue(feq(base, dice))
    # alpha = beta = 1.0 is the Tanimoto coefficient.
    bulkDs = ds.BulkTverskySimilarity(vects[0], vects[1:], 1.0, 1.0)
    taniDs = [ds.TanimotoSimilarity(vects[0], v) for v in vects[1:]]
    for bulk, tani in zip(bulkDs, taniDs):
        self.assertTrue(feq(bulk, tani))
    taniDs = ds.BulkTanimotoSimilarity(vects[0], vects[1:])
    for bulk, tani in zip(bulkDs, taniDs):
        self.assertTrue(feq(bulk, tani))
def calculate_pairwise_similarities(smiles_list1: List[str],
                                    smiles_list2: List[str]) -> np.array:
    """
    Computes the pairwise ECFP4 tanimoto similarity of the two smiles
    containers.

    Returns:
        Pairwise similarity matrix as np.array (rows index smiles_list1,
        columns index smiles_list2).
    """
    # Warn on quadratic blow-up for very large inputs.
    if len(smiles_list1) > 10000 or len(smiles_list2) > 10000:
        logger.warning(f'Calculating similarity between large sets of '
                       f'SMILES strings ({len(smiles_list1)} x {len(smiles_list2)})')
    fps1 = get_fingerprints(get_mols(smiles_list1))
    fps2 = get_fingerprints(get_mols(smiles_list2))
    rows = [DataStructs.BulkTanimotoSimilarity(fp, fps2) for fp in fps1]
    return np.array(rows)
def se_cs(fps, distThresh):
    """Sphere-exclusion clustering: pick centroids with RDKit's LeaderPicker,
    then assign every remaining fingerprint to its most similar centroid.

    Args:
        fps: list of RDKit bit-vector fingerprints.
        distThresh: distance threshold for LazyBitVectorPick.

    Returns:
        list of clusters; each cluster is a list of fps indices whose first
        element is the centroid.
    """
    lp = SimDivFilters.rdSimDivPickers.LeaderPicker()
    picks = lp.LazyBitVectorPick(fps, len(fps), distThresh)
    cs = defaultdict(list)
    # Seed each cluster with its centroid.
    for i, idx in enumerate(picks):
        cs[i].append(idx)
    # Similarity of every centroid (rows) to every fingerprint (columns).
    sims = np.zeros((len(picks), len(fps)))
    for i in range(len(picks)):
        pick = picks[i]
        sims[i, :] = DataStructs.BulkTanimotoSimilarity(fps[pick], fps)
        # Zero the centroid's self-similarity. Bug fix: centroid i lives at
        # column ``pick`` (its fps index), not column ``i`` — the old
        # ``sims[i, i] = 0`` wiped an unrelated fingerprint's score.
        sims[i, pick] = 0
    # Most similar centroid (row index) for each fingerprint.
    best = np.argmax(sims, axis=0)
    for i, idx in enumerate(best):
        # Centroids are already placed; assign only non-centroids.
        if i not in picks:
            cs[idx].append(i)
    return [cs[k] for k in cs]
def variety(mol, setfps):
    """Mean Tanimoto distance between ``mol`` (Morgan fingerprint, radius 4,
    2048 bits) and a set of reference fingerprints. Higher = more diverse.

    Args:
        mol: RDKit Mol.
        setfps: list of reference fingerprints.

    Returns:
        float: mean distance.
    """
    # NOTE(review): this relies on ``Chem`` exposing
    # GetMorganFingerprintAsBitVect (normally on AllChem/rdMolDescriptors) —
    # confirm the module import.
    fp = Chem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    dist = DataStructs.BulkTanimotoSimilarity(fp, setfps, returnDistance=True)
    # Removed unused constants ``low_rand_dst`` and ``mean_div_dst``.
    mean_dist = np.mean(np.array(dist))
    return mean_dist
def _get_tanimoto_distance_matrix(fingerprints):
    """
    Calculate a triangular Tanimoto distance matrix for fingerprints.

    Parameters
    ----------
    fingerprints : list of rdkit.DataStructs.cDataStructs.ExplicitBitVect
        List of fingerprints.

    Returns
    -------
    list of floats
        Flat lower-triangle distance matrix (1 - Tanimoto similarity).
    """
    fingerprints = list(fingerprints)
    distance_matrix = []
    for idx in range(1, len(fingerprints)):
        # Similarities of fingerprint idx to all earlier fingerprints,
        # converted to distances.
        sims = DataStructs.BulkTanimotoSimilarity(
            fingerprints[idx], fingerprints[:idx])
        distance_matrix.extend(1 - s for s in sims)
    return distance_matrix
def gen_cluster_subset_algButina(fps, cutoff):
    """Butina-cluster fingerprints on Tanimoto distance.

    Returns a tuple of tuples with the sequential compound indices in each
    cluster (first index of each tuple is the centroid).
    """
    dists = []
    for i, fp in enumerate(fps):
        # Lower-triangle similarities; the i == 0 call gets an empty list,
        # matching the original behaviour.
        sims = DataStructs.BulkTanimotoSimilarity(fp, fps[:i])
        dists.extend(1 - s for s in sims)
    return Butina.ClusterData(dists, len(fps), cutoff, isDistData=True)
def bulk_tanimoto_distance(smile, fps):
    """Tanimoto distances from one SMILES (Morgan fingerprint, radius 4,
    2048 bits) to every fingerprint in ``fps``."""
    mol = Chem.MolFromSmiles(smile)
    query_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=2048)
    return DataStructs.BulkTanimotoSimilarity(query_fp, fps,
                                              returnDistance=True)
def get_scores(self, mols, smiles2score=None):
    '''
    @params:
        mols: molecules to estimate score
        smiles2score: optional mapping forwarded to the external
            ``get_scores`` helper for the generic objectives
    @return:
        dicts (list): list of score dictionaries, one per molecule
    '''
    # Morgan fingerprints are only needed for novelty/diversity objectives.
    if 'nov' in self.objectives or 'div' in self.objectives:
        fps_mols = [
            AllChem.GetMorganFingerprintAsBitVect(x, 3, 2048) for x in mols
        ]
    dicts = [{} for _ in mols]
    for obj in self.objectives:
        if obj == 'adv':
            # Adversarial score is computed after this loop (needs batching
            # through the discriminator).
            continue
        if obj == 'nov':
            # Novelty: 1 - max similarity to the reference fingerprints.
            for i, fp in enumerate(fps_mols):
                sims = DataStructs.BulkTanimotoSimilarity(fp, self.fps_ref)
                dicts[i][obj] = 1. - max(sims)
            continue
        if obj == 'div':
            # Diversity: 1 - mean similarity to the current batch (the mean
            # includes the molecule's own self-similarity of 1.0).
            for i, fp in enumerate(fps_mols):
                sims = DataStructs.BulkTanimotoSimilarity(fp, fps_mols)
                dicts[i][obj] = 1. - 1. * sum(sims) / len(fps_mols)
            continue
        # All other objectives are delegated to the module-level helper.
        scores = get_scores(obj, mols, smiles2score)
        for i, mol in enumerate(mols):
            dicts[i][obj] = scores[i]
    if 'adv' in self.objectives:
        # Adversarial score: discriminator probability of class 1.
        graphs = [mol_to_dgl(mol) for mol in mols]
        dataset = GraphDataset(graphs)
        loader = DataLoader(dataset, batch_size=self.batch_size,
                            collate_fn=GraphDataset.collate_fn)
        preds = []
        for batch in loader:
            with torch.no_grad():
                pred = self.discriminator(batch)  # (batch_size, 2)
                pred = F.softmax(pred, dim=1)  # (batch_size, 2)
                preds.append(pred[:, 1])  # (batch_size,)
        preds = torch.cat(preds, dim=0).tolist()  # (num_mols,)
        for i, pred in enumerate(preds):
            dicts[i]['adv'] = pred
    return dicts
def ClusterFps(fps, cutoff=0.2):
    """Butina clustering on Tanimoto distance; returns a tuple of tuples of
    fingerprint indices (first index of each cluster is the centroid)."""
    total = len(fps)
    dists = []
    # Flat lower-triangle distance list, as ClusterData expects.
    for idx in range(1, total):
        sims = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx])
        dists.extend(1 - s for s in sims)
    return Butina.ClusterData(dists, total, cutoff, isDistData=True)
def tanimoto_1d(fps):
    """Flat lower-triangle Tanimoto distance list for a fingerprint list."""
    distances = []
    for idx in range(1, len(fps)):
        row = DataStructs.BulkTanimotoSimilarity(fps[idx], fps[:idx],
                                                 returnDistance=True)
        distances.extend(row)
    return distances
def calc_sims(fps_1, fps_2):
    """Row-wise bulk Tanimoto similarities: one list per fingerprint in
    ``fps_1``, each comparing it against every fingerprint in ``fps_2``."""
    return [DataStructs.BulkTanimotoSimilarity(fp, fps_2) for fp in fps_1]