def find_unique_confs(best_conformers, mol_files, threshold=0.5):
    """Return one entry of ``best_conformers`` per Butina cluster of conformers.

    The first record of every .sdf file in ``mol_files`` is read without
    sanitization and with hydrogens removed. The conformers of all files after
    the first are attached to the molecule read from the first file, the
    pairwise conformer RMS matrix is computed, and the conformers are
    clustered with RDKit's Butina algorithm.

    Args:
        best_conformers: Sequence indexed in parallel with ``mol_files``
            (assumed from the indexing below -- confirm against the caller).
        mol_files: Paths of the .sdf files, one conformer each.
        threshold: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        list: ``best_conformers[i]`` for the first index ``i`` of each cluster.
    """
    def _read_first(path):
        # Only the first record of each file is used.
        return next(rdmolfiles.ForwardSDMolSupplier(path, sanitize=False, removeHs=True))

    combined = _read_first(mol_files[0])
    for path in mol_files[1:]:
        extra = _read_first(path)
        combined.AddConformer(extra.GetConformer(), assignId=True)

    # RMS-based difference matrix (works with threshold=0.5, sanitize=False).
    # A torsion-fingerprint matrix (TorsionFingerprints.GetTFDMatrix) was the
    # noted alternative; it needs sanitize=True and a threshold around 0.01.
    rms_matrix = AllChem.GetConformerRMSMatrix(combined, prealigned=False)

    clusters = Butina.ClusterData(
        rms_matrix,
        combined.GetNumConformers(),
        threshold,
        isDistData=True,
        reordering=True,
    )

    # The first index of each cluster is taken as its centroid.
    return [best_conformers[members[0]] for members in clusters]
def cluster_fingerprints(fps, cutoff=0.2):
    """Butina-cluster compounds described by fingerprint bit vectors.

    Adapted from the RDKit cookbook
    (http://rdkit.org/docs_temp/Cookbook.html).

    Args:
        fps (list of rdkit.ExplicitBitVect): Fingerprint bit vectors.
        cutoff (float): Distance cutoff used to seed clusters in the Butina
            algorithm.

    Returns:
        tuple of tuple: Indices of the fingerprints assigned to each cluster.
    """
    count = len(fps)
    # Flattened lower triangle of the (1 - Tanimoto similarity) matrix:
    # row i holds the distances from fps[i] to fps[0..i-1].
    lower_triangle = [
        1 - sim
        for row in range(1, count)
        for sim in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
    ]
    return Butina.ClusterData(lower_triangle, count, cutoff, isDistData=True)
def cluster_conformers(mol, mode="RMSD", threshold=0.2):
    """Cluster the conformers of ``mol`` on pairwise heavy-atom RMSD.

    Every conformer pair (i, j) with j < i is aligned on the heavy atoms with
    ``AlignMol`` and the returned RMSD is collected into a flattened
    lower-triangular distance matrix, which is then clustered with Butina.

    Args:
        mol: RDKit molecule carrying the conformers to cluster.
        mode: Accepted for interface compatibility; not used by this
            implementation (only RMSD is computed).
        threshold: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData`` (tuples of conformer
        positions).

    Note:
        ``AlignMol`` is called with ``mol`` as both probe and reference, so
        the conformers of ``mol`` are presumably transformed in place by the
        alignment -- pass a copy if the original coordinates matter.
    """
    # The atom map is identical for every pair, so build it once rather than
    # once per (i, j) as before.
    heavy_atom_map = [
        (atom.GetIdx(), atom.GetIdx())
        for atom in mol.GetAtoms()
        if atom.GetAtomicNum() != 1
    ]

    n_confs = mol.GetNumConformers()
    dmat = [
        Chem.rdMolAlign.AlignMol(mol, mol, i, j, atomMap=heavy_atom_map)
        for i in range(n_confs)
        for j in range(i)
    ]

    return Butina.ClusterData(dmat, n_confs, threshold, isDistData=True, reordering=True)
def cluster_ligands(ligands, cutoff=0.2):
    """Return one representative ligand per Butina cluster.

    Ligands are converted with ``ccdc_to_rdkit``, fingerprinted with Morgan
    bit vectors (radius 2, 1024 bits) and clustered on Tanimoto distance.
    The first member of each cluster is converted back with ``rdkit_to_ccdc``.

    Conversion is best-effort: a ligand whose conversion raises is skipped.

    Args:
        ligands: Iterable of ligands accepted by ``ccdc_to_rdkit``.
        cutoff: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        list: One converted-back ligand per cluster (minus any that failed to
        convert back).
    """
    rdkit_ligands = []
    for lig in ligands:
        try:
            rdkit_ligands.append(ccdc_to_rdkit(lig))
        except Exception:
            # Deliberately best-effort, but no longer a bare ``except`` so
            # KeyboardInterrupt / SystemExit still propagate.
            continue

    # From the RDKit Cookbook.
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(lig, 2, 1024)
        for lig in rdkit_ligands
    ]

    # Flattened lower-triangular distance matrix (1 - Tanimoto similarity).
    nfps = len(fps)
    dists = []
    for i in range(1, nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)

    clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)

    all_ligands = []
    for cluster in clusters:
        try:
            all_ligands.append(rdkit_to_ccdc(rdkit_ligands[cluster[0]]))
        except Exception:
            # Same best-effort policy as above.
            continue
    return all_ligands
def gen_cluster_subset_algButina(fps, cutoff):
    """Butina-cluster fingerprints on Tanimoto distance.

    Args:
        fps: Sequence of fingerprints accepted by
            ``DataStructs.BulkTanimotoSimilarity``.
        cutoff: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        Tuple of tuples with the sequential numbers of the compounds in each
        cluster.
    """
    lower_triangle = []
    for position, fingerprint in enumerate(fps):
        # Similarities to all earlier fingerprints; the first row is empty.
        similarities = DataStructs.BulkTanimotoSimilarity(fingerprint, fps[:position])
        lower_triangle.extend([1 - value for value in similarities])
    return Butina.ClusterData(lower_triangle, len(fps), cutoff, isDistData=True)
def ClusterAlignments(mol, alignments, builder, neighborTol=0.1,
                      distMetric=SubshapeDistanceMetric.PROTRUDE, tempConfId=1001):
    """Cluster alignments by shape distance and return one per cluster.

    For every pair of alignments the molecule is transformed into two
    temporary conformers (ids ``tempConfId`` and ``tempConfId + 1``), a
    subshape is generated for each, and their distance under ``distMetric``
    is recorded. The temporary conformers are removed again after use. The
    distances are clustered with Butina using ``neighborTol`` as threshold.

    Returns:
        list: The first alignment of every cluster.
    """
    from rdkit.ML.Cluster import Butina

    outerConfId = tempConfId
    innerConfId = tempConfId + 1
    nAlignments = len(alignments)

    pairDists = []
    for row in range(nAlignments):
        TransformMol(mol, alignments[row].transform, newConfId=outerConfId)
        rowShape = builder.GenerateSubshapeShape(mol, outerConfId, addSkeleton=False)
        for col in range(row):
            TransformMol(mol, alignments[col].transform, newConfId=innerConfId)
            colShape = builder.GenerateSubshapeShape(mol, innerConfId, addSkeleton=False)
            pairDists.append(GetShapeShapeDistance(rowShape, colShape, distMetric))
            mol.RemoveConformer(innerConfId)
        mol.RemoveConformer(outerConfId)

    clusters = Butina.ClusterData(pairDists, nAlignments, neighborTol, isDistData=True)
    return [alignments[members[0]] for members in clusters]
def cluster_chemicals(
    *,
    rebuild: bool = False,
    chemicals_dict,
):
    """Cluster chemicals based on their similarities.

    If a cached result exists at ``DEFAULT_CLUSTERED_CHEMICALS`` and
    ``rebuild`` is false, the cached table is returned. Otherwise the
    fingerprints in ``chemicals_dict`` are clustered with Butina on Tanimoto
    distance (threshold 0.3) and the result is written to that path.

    Args:
        rebuild: Recompute even if the cached file exists.
        chemicals_dict: Mapping of chemical identifier to fingerprint.

    Returns:
        pandas.DataFrame: Columns ``PubchemID`` and ``Cluster`` (cluster
        numbers start at 1).
    """
    if not rebuild and os.path.exists(DEFAULT_CLUSTERED_CHEMICALS):
        return pd.read_csv(
            DEFAULT_CLUSTERED_CHEMICALS,
            sep="\t",
            index_col=False,
            dtype={'PubchemID': str},
        )

    drugs, fps = zip(*chemicals_dict.items())
    nfps = len(chemicals_dict)

    # Flattened lower-triangular distance matrix (1 - Tanimoto similarity).
    dists = []
    for i in tqdm(range(1, nfps), desc='Calculating distance for clustering'):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend(1 - x for x in sims)

    cs = Butina.ClusterData(dists, nfps, 0.3, isDistData=True)

    # Butina returns 0-based positions into ``fps``, which is index-aligned
    # with ``drugs``. The previous ``drugs[member - 1]`` shifted every label
    # by one and mapped position 0 to the last chemical.
    rows = [
        (drugs[member], cluster_id)
        for cluster_id, cluster in enumerate(cs, start=1)
        for member in cluster
    ]
    # Collect the rows first; growing a DataFrame one ``.loc`` row at a time
    # copies the frame on every insert.
    df = pd.DataFrame(rows, columns=['PubchemID', 'Cluster'])
    # Row labels started at 1 in the previous implementation; keep that.
    df.index = range(1, len(df) + 1)

    df.to_csv(DEFAULT_CLUSTERED_CHEMICALS, sep='\t', index=False)
    return df
def leven_butina_cs(smiles, distThresh=3, reordering=False):
    """Butina-cluster ``smiles`` using the ``levenshtein`` distance function.

    Args:
        smiles: Sequence of items compared pairwise with ``levenshtein``.
        distThresh: Distance threshold for two items to count as neighbours.
        reordering: Forwarded to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData``.
    """
    n_points = len(smiles)
    return Butina.ClusterData(
        data=smiles,
        nPts=n_points,
        distThresh=distThresh,
        distFunc=levenshtein,
        reordering=reordering,
    )
def cluster(
    mol: Chem.rdchem.Mol,
    rms_cutoff: float = 1,
    already_aligned: bool = False,
    centroids: bool = True,
):
    """Cluster the conformers of a molecule according to an RMS threshold in Angstrom.

    The input molecule is deep-copied first; all work is done on the copy.

    Args:
        mol: a molecule
        rms_cutoff: The RMS cutoff in Angstrom.
        already_aligned: Whether or not the conformers are aligned. If False,
            they will be aligned during the RMS computation.
        centroids: If True, return one molecule with centroid conformers
            only. If False return a list of molecules per cluster with all
            the conformers of the cluster. Defaults to True.
    """
    working_mol = copy.deepcopy(mol)

    rms_matrix = AllChem.GetConformerRMSMatrix(working_mol, prealigned=already_aligned)

    clusters = Butina.ClusterData(
        rms_matrix,
        nPts=working_mol.GetNumConformers(),
        distThresh=rms_cutoff,
        isDistData=True,
        reordering=False,
    )

    return return_centroids(working_mol, clusters, centroids=centroids)
def PerformButinaClustering(Mols, MolsFingerprints):
    """Cluster molecules with the Butina method and return them grouped.

    The similarity cutoff and reordering flag are read from ``OptionsInfo``;
    the similarity cutoff is converted to a distance cutoff (1 - similarity).

    Returns:
        list of list: For every cluster, the entries of ``Mols`` at the
        clustered indices.
    """
    MiscUtil.PrintInfo(
        "\nClustering molecules using Butina methodology and %s similarity metric..."
        % OptionsInfo["SimilarityMetric"])

    NumFingerprints = len(MolsFingerprints)
    MaxDistance = 1 - OptionsInfo["ButinaSimilarityCutoff"]
    UseReordering = OptionsInfo["ButinaReordering"]

    Distances = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
    IndexClusters = Butina.ClusterData(Distances, NumFingerprints, MaxDistance,
                                       reordering=UseReordering, isDistData=True)

    # Translate index clusters into clusters of molecules.
    return [[Mols[Index] for Index in IndexCluster] for IndexCluster in IndexClusters]
def butina_clustering_m(rdkit_mol, difference_matrix='tfd', threshold=0.001):
    """Reduce a molecule's conformers to the Butina cluster centroids.

    Args:
        rdkit_mol: RDKit molecule carrying the conformers to cluster.
        difference_matrix: ``'tfd'`` (torsion fingerprint deviation) or
            ``'rms'`` (conformer RMS matrix); case-insensitive.
        threshold: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        A deep copy of ``rdkit_mol`` holding only the centroid conformer of
        each cluster (conformer ids are reassigned).

    Raises:
        ValueError: If ``difference_matrix`` is neither ``'tfd'`` nor
            ``'rms'``. Previously this surfaced as an UnboundLocalError on
            ``diffmat``.
    """
    metric = difference_matrix.lower()
    if metric == 'tfd':
        diffmat = TorsionFingerprints.GetTFDMatrix(rdkit_mol)
    elif metric == 'rms':
        diffmat = AllChem.GetConformerRMSMatrix(rdkit_mol, prealigned=False)
    else:
        raise ValueError(
            "difference_matrix must be 'tfd' or 'rms', got %r" % (difference_matrix,)
        )

    # Cluster members are positions in the difference matrix, which follows
    # the order of GetConformers(). Translate them to real conformer ids so
    # molecules with non-contiguous ids are handled as well.
    conf_ids = [conf.GetId() for conf in rdkit_mol.GetConformers()]
    clt = Butina.ClusterData(diffmat, len(conf_ids), threshold,
                             isDistData=True, reordering=True)

    new_rdkit_mol = copy.deepcopy(rdkit_mol)
    new_rdkit_mol.RemoveAllConformers()
    for members in clt:
        # The first member of each cluster is its centroid.
        centroid_conf = rdkit_mol.GetConformer(conf_ids[members[0]])
        new_rdkit_mol.AddConformer(centroid_conf, assignId=True)

    return new_rdkit_mol
def cluster_butina(self, cutoff=0.7):
    '''
    Assign every entry to a Butina cluster.

    The pairwise distances from ``self.distance()`` are flattened into the
    lower-triangular list expected by ``Butina.ClusterData`` and clustered
    with ``cutoff`` as the threshold.

    Returns a list with one cluster number per entry of ``self.names()``
    (``None`` for an entry that appears in no cluster).
    '''
    matrix = self.distance().values
    size = len(self.names())

    # Row-major lower triangle, diagonal excluded.
    flat = [matrix[row, col] for row in range(size) for col in range(row)]

    groups = Butina.ClusterData(flat, size, cutoff, isDistData=True)

    membership = [None] * size
    for label, group in enumerate(groups):
        for member in group:
            membership[member] = label
    return membership
def ClusterFps(fps,cutoff=0.2, metric='Tanimoto'):
    '''Cluster structures with Butina based on the given fingerprints.

    fps: Fingerprints to cluster.
    cutoff: Distance cutoff for Butina clustering (distance = 1 - similarity).
    metric: Similarity metric, case-insensitive. Available: Tanimoto, Dice,
            Cosine, Sokal, Russel, RogotGoldberg, AllBit, Kulczynski,
            McConnaughey, Asymmetric and BraunBlanquet. An unknown name
            prints a warning and falls back to Tanimoto.

    Returns the clusters from Butina.ClusterData (tuples of indices into fps).
    '''
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    # Keys must be lowercase because the lookup uses metric.lower(); the
    # former "rogotGoldberg" key could never be matched.
    metricsAvailableBulk = {
        'tanimoto': DataStructs.BulkTanimotoSimilarity,
        'dice': DataStructs.BulkDiceSimilarity,
        'cosine': DataStructs.BulkCosineSimilarity,
        'sokal': DataStructs.BulkSokalSimilarity,
        'russel': DataStructs.BulkRusselSimilarity,
        'rogotgoldberg': DataStructs.BulkRogotGoldbergSimilarity,
        'allbit': DataStructs.BulkAllBitSimilarity,
        'kulczynski': DataStructs.BulkKulczynskiSimilarity,
        'mcconnaughey': DataStructs.BulkMcConnaugheySimilarity,
        'asymmetric': DataStructs.BulkAsymmetricSimilarity,
        'braunblanquet': DataStructs.BulkBraunBlanquetSimilarity,
    }

    metric_key = metric.lower()
    if metric_key not in metricsAvailableBulk:
        # Single-argument parenthesised form prints the same text on
        # Python 2 and Python 3.
        print("The given metric is unknown!")
        metric_key = 'tanimoto'
    simMetricsBulk = metricsAvailableBulk[metric_key]

    # First generate the flattened lower-triangular distance matrix.
    dists = []
    nfps = len(fps)
    for i in range(1, nfps):
        sims = simMetricsBulk(fps[i], fps[:i])
        dists.extend([1 - x for x in sims])

    # Now cluster the data.
    return Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
def cluster(smile_keys, fp_type, cutoff=0.15):
    """Fingerprint the given SMILES, cluster them and export pairwise distances.

    ``cutoff`` is compared against the Tanimoto distance
    (1 - Tanimoto similarity) computed below.

    Returns:
        tuple: ``(clusters, matrix_df)`` as produced by
        ``form_cluster_with_algorithm_results`` and
        ``create_similarity_export_matrix``.
    """
    total = len(smile_keys)

    # One fingerprint per SMILES, in input order.
    fingerprints = [fingerprint_smile(key, fp_type) for key in smile_keys]

    # Distances of every SMILES to all earlier ones, plus the matching
    # (earlier, current) key pairs in the same order.
    distances = []
    pairs = []
    for current in range(1, total):
        similarities = DataStructs.BulkTanimotoSimilarity(
            fingerprints[current], fingerprints[:current]
        )
        distances.extend([1 - value for value in similarities])
        pairs.extend(
            [(smile_keys[earlier], smile_keys[current]) for earlier in range(current)]
        )

    # Export table is built before clustering, as in the original flow.
    matrix_df = create_similarity_export_matrix(pairs, distances)

    butina_result = Butina.ClusterData(distances, total, cutoff, isDistData=True)
    clusters = form_cluster_with_algorithm_results(smile_keys, butina_result)
    return clusters, matrix_df
def do_clustering(simm, queue_fps, threshold):
    """Cluster the molecules of a library and return them as a JSON response.

    The queue is drained until it raises ``Closed``. Each molecule is then
    compared with all earlier ones through ``simm.find_sim``, the
    similarities are turned into distances (1 - similarity) and clustered
    with Butina. Every molecule dict gets its cluster number written to
    ``["values"]["cluster"]``.
    """
    # Drain the incoming queue.
    molecules = []
    while True:
        try:
            item = queue_fps.get()
        except Closed:
            break
        molecules.append(item)

    total = len(molecules)
    distances = []
    for current in range(1, total):
        # Fresh closed queue holding all earlier molecules for this screen.
        earlier_queue = CloseableQueue.CloseableQueue()
        for earlier in molecules[:current]:
            earlier_queue.put(earlier)
        earlier_queue.close()

        hits = simm.find_sim(molecules[current], earlier_queue, -1.0)
        similarities = [hit["values"]["similarity"] for hit in hits]
        distances.extend([1 - value for value in similarities])

    clusters = Butina.ClusterData(distances, total, threshold, isDistData=True)

    # Tag each molecule with its cluster number, in cluster order.
    clustered = []
    for cluster_no, members in enumerate(clusters):
        for position in members:
            entry = molecules[position]
            entry["values"]["cluster"] = cluster_no
            clustered.append(entry)

    return HttpResponse(json.dumps(remove_keys(clustered)))
def ClusterFps(fps, cutoff=0.2):
    """Butina-cluster fingerprints on Tanimoto distance.

    Args:
        fps: Sequence of fingerprints.
        cutoff: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData``.
    """
    total = len(fps)
    # Flattened lower triangle of 1 - Tanimoto similarity.
    distances = [
        1 - similarity
        for row in range(1, total)
        for similarity in DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
    ]
    return Butina.ClusterData(distances, total, cutoff, isDistData=True)
def ButinaClusteringOriginal(dists, nfps, cutoff=0.7):
    """Run Butina clustering with reordering and report the elapsed time.

    Args:
        dists: Flattened lower-triangular distance matrix.
        nfps: Number of points the distances refer to.
        cutoff: Distance threshold for the clustering. Defaults to 0.7, the
            value that used to be hard-coded.

    Returns:
        The clusters returned by ``Butina.ClusterData``.
    """
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("-------------------------------------------------")
    print("starting Butina clustering")

    start_time = time.time()
    cs = Butina.ClusterData(dists, nfps, cutoff, isDistData=True, reordering=True)
    # Two spaces after the colon reproduce the former
    # ``print "time taken: ", value`` output.
    print("time taken:  %s" % (time.time() - start_time))
    return cs
def test1(self):
    # Six points, threshold 1.1: expect three clusters.
    dists = [
        1,
        2, 1,
        4, 3, 2,
        6, 5, 4, 2,
        7, 6, 5, 3, 1,
    ]
    nPts = 6
    cs = Butina.ClusterData(dists, nPts, 1.1, isDistData=True)
    # assertEqual replaces failUnless, a deprecated alias that no longer
    # exists in recent Python versions, and reports both values on failure.
    self.assertEqual(len(cs), 3)
    self.assertEqual(cs[0], (1, 0, 2))
    self.assertEqual(cs[1], (5, 4))
    self.assertEqual(cs[2], (3, ))
def ClusterFps(fps, cutoff=0.2):
    """Butina-cluster fingerprints on Tanimoto distance.

    (ytz): taken directly from Greg Landrum's clustering example.

    Returns the clusters produced by ``Butina.ClusterData``.
    """
    count = len(fps)
    tanimoto_dists = []
    row = 1
    while row < count:
        # Distances from fingerprint ``row`` to every earlier fingerprint.
        row_sims = DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row])
        tanimoto_dists.extend([1 - s for s in row_sims])
        row += 1
    return Butina.ClusterData(tanimoto_dists, count, cutoff, isDistData=True)
def ClusterFps(fps, cutoff=0.2):
    """Butina-cluster fingerprints using ``Tanimoto_distance_matrix``.

    Args:
        fps: Sequence of fingerprints.
        cutoff: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData``.
    """
    return Butina.ClusterData(
        Tanimoto_distance_matrix(fps),
        len(fps),
        cutoff,
        isDistData=True,
    )
def cluster_from_mol_list(mol_list, cutoff=0.8, fp="ecfp6", activity_prop=None,
                          summary_only=True, generate_cores=False, align_to_core=False):
    """Clusters the input Mol_List.

    Parameters:
        mol_list (tools.Mol_List): the input molecule list.
        cutoff (float): similarity cutoff for putting molecules into the same
            cluster. It is converted to the distance threshold
            ``1.0 - cutoff`` for the Butina algorithm.
        fp (str): key into ``FPDICT`` selecting the fingerprint function.
        activity_prop: forwarded to ``add_cores`` when ``generate_cores`` is set.
        summary_only (bool): only print the cluster size summary and return None.
        generate_cores (bool): pass the clustered list through ``add_cores``.
        align_to_core (bool): forwarded to ``add_cores``.

    Returns:
        A new Mol_List containing the input molecules with their respective
        cluster number, as well as additionally the cluster cores, containing
        some statistics. None if ``summary_only`` is set or the fingerprint
        name is unknown."""
    try:
        fp_func = FPDICT[fp]
    except KeyError:
        print("Fingerprint {} not found. Available fingerprints are: {}".format(fp, ", ".join(sorted(FPDICT.keys()))))
        return

    counter = Counter()

    # generate the fingerprints
    fp_list = [fp_func(mol) for mol in mol_list]

    # second generate the distance matrix (1 - Tanimoto similarity):
    dists = []
    num_of_fps = len(fp_list)
    for i in range(1, num_of_fps):
        sims = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        dists.extend([1 - x for x in sims])

    # now cluster the data. Butina expects a *distance* threshold while
    # ``cutoff`` is documented as a *similarity*, so convert it. Passing the
    # similarity through unchanged made the default 0.8 group molecules that
    # are only 20 % similar.
    cluster_idx_list = Butina.ClusterData(dists, num_of_fps, 1.0 - cutoff, isDistData=True)

    for cluster in cluster_idx_list:
        counter[len(cluster)] += 1

    print(" fingerprint:", fp)
    print(" clustersize num_of_clusters")
    print(" =========== ===============")
    for length in sorted(counter.keys(), reverse=True):
        print(" {:4d} {:3d}".format(length, counter[length]))
    print()

    if summary_only:
        return None

    cluster_list = tools.Mol_List()

    # go over each list of indices to collect the cluster's molecules,
    # largest cluster first; cluster ids start at 1
    for cl_id, idx_list in enumerate(sorted(cluster_idx_list, key=len, reverse=True), 1):
        cluster = get_mol_list_from_index_list(mol_list, idx_list, cl_id)
        cluster[0].SetProp("is_repr", "yes")  # The first compound in a cluster is the representative
        cluster_list.extend(cluster)

    if generate_cores:
        cluster_list = add_cores(cluster_list, activity_prop, align_to_core)

    return cluster_list
def test4(self):
    " edge case: everything in one cluster "
    dists = [
        1,
        2, 1,
        3, 2, 1,
    ]
    nPts = 4
    cs = Butina.ClusterData(dists, nPts, 2, isDistData=True)
    # assertEqual reports both values when the check fails, unlike
    # assertTrue(a == b).
    self.assertEqual(len(cs), 1)
    self.assertEqual(cs[0], (3, 0, 1, 2))
def test8_reordering_changes(self):
    # reordering: changes
    dists = [
        2,
        3.5, 1.5,
        5, 3, 1.5,
        7, 5, 3.5, 2,
        8, 6, 4.5, 3, 1,
        9, 7, 5.5, 4, 2, 1,
    ]
    nPts = 7

    # without reordering: three clusters, point 0 is left as a singleton.
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    cs = Butina.ClusterData(dists, nPts, 2.1, isDistData=True)
    self.assertEqual(len(cs), 3)
    self.assertEqual(cs[0], (4, 3, 5, 6))
    self.assertEqual(cs[1], (2, 1))
    self.assertEqual(cs[2], (0, ))

    # with reordering: point 0 joins the second cluster
    cs = Butina.ClusterData(dists, nPts, 2.1, isDistData=True, reordering=True)
    self.assertEqual(len(cs), 2)
    self.assertEqual(cs[0], (4, 3, 5, 6))
    self.assertEqual(cs[1], (1, 0, 2))
def ClusterFps(fps,cutoff=0.2):
    """Butina-cluster fingerprints on Tanimoto distance.

    Returns the clusters produced by ``Butina.ClusterData``.
    """
    n_points = len(fps)

    # Distance of every fingerprint to all earlier ones, flattened row by row.
    flat_dists = []
    for pos, current in enumerate(fps):
        if pos == 0:
            # The first fingerprint has no predecessors to compare with.
            continue
        scores = DataStructs.BulkTanimotoSimilarity(current, fps[:pos])
        flat_dists.extend([1 - score for score in scores])

    return Butina.ClusterData(flat_dists, n_points, cutoff, isDistData=True)
def cluster_conformers(mol, mode="RMSD", threshold=2.0):
    """Butina-cluster the conformers of ``mol``.

    Args:
        mol: RDKit molecule carrying the conformers.
        mode: ``"TFD"`` selects the torsion-fingerprint matrix; any other
            value selects the conformer RMS matrix.
        threshold: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData``.
    """
    distance_matrix = (
        TorsionFingerprints.GetTFDMatrix(mol)
        if mode == "TFD"
        else AllChem.GetConformerRMSMatrix(mol, prealigned=False)
    )
    return Butina.ClusterData(
        distance_matrix,
        mol.GetNumConformers(),
        threshold,
        isDistData=True,
        reordering=True,
    )
def ClusterFps_Butina(self, dists, nfps, cutoff):
    """Run Butina clustering and record the result on the instance.

    ``self.cdict`` is reset and filled with ``id -> [cluster_number, flag]``,
    where the flag marks whether the id is the first member of its cluster.
    ``self.clustdict`` receives ``cluster_number -> members``; it is not
    reset here, so it must already exist. Cluster numbers start at 1.
    """
    self.cdict = {}
    clusters = Butina.ClusterData(dists, nfps, cutoff, isDistData=True)
    for number, members in enumerate(clusters, start=1):
        self.clustdict[number] = members
        for member in members:
            # NOTE(review): "flase" looks like a typo for "false", but it is a
            # runtime value other code may compare against -- left unchanged.
            flag = "true" if member == members[0] else "flase"
            self.cdict[member] = [number, flag]
def cluster_fingerprints(fingerprints, cutoff=0.2):
    """Butina-cluster fingerprints on Tanimoto distance.

    Args:
        fingerprints: Sequence of fingerprints.
        cutoff: Distance threshold handed to ``Butina.ClusterData``.

    Returns:
        The clusters returned by ``Butina.ClusterData``.
    """
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    total = len(fingerprints)
    # Flattened lower triangle of 1 - Tanimoto similarity.
    lower_triangle = [
        1 - value
        for row in range(1, total)
        for value in DataStructs.BulkTanimotoSimilarity(
            fingerprints[row], fingerprints[:row]
        )
    ]
    return Butina.ClusterData(lower_triangle, total, cutoff, isDistData=True)
def test3(self):
    " edge case: everything a singleton "
    dists = [
        1,
        2, 1,
    ]
    nPts = 3
    cs = Butina.ClusterData(dists, nPts, 0.9, isDistData=True)
    # assertEqual reports both values when the check fails, unlike
    # assertTrue(a == b).
    self.assertEqual(len(cs), 3)
    self.assertEqual(cs[0], (2, ))
    self.assertEqual(cs[1], (1, ))
    self.assertEqual(cs[2], (0, ))
def test6(self):
    " edge case: zero distances: "
    dists = [
        1,
        2, 0,
        2, 0, 0,
        4, 2, 2, 2,
    ]
    nPts = 5
    cs = Butina.ClusterData(dists, nPts, 0.9, isDistData=True)
    # assertEqual reports both values when the check fails, unlike
    # assertTrue(a == b).
    self.assertEqual(len(cs), 3)
    self.assertEqual(cs[0], (3, 1, 2))
    self.assertEqual(cs[1], (4, ))
    self.assertEqual(cs[2], (0, ))
def test4(self):
    " edge case: one in the middle leaves the edges lonely "
    dists = [
        1.5,
        2.5, 1,
        3.5, 2, 1,
        5, 3.5, 2.5, 1.5,
    ]
    nPts = 5
    cs = Butina.ClusterData(dists, nPts, 1.1, isDistData=True)
    # assertEqual reports both values when the check fails, unlike
    # assertTrue(a == b).
    self.assertEqual(len(cs), 3)
    self.assertEqual(cs[0], (2, 1, 3))
    self.assertEqual(cs[1], (4, ))
    self.assertEqual(cs[2], (0, ))