def compute_tsne(self):
    """Project the molecules of ``self.Database2`` to 2-D with t-SNE.

    Reads the ``SMILES``, ``LIBRARY`` and ``NAME`` columns of
    ``self.Database2``, builds a symmetric pairwise similarity matrix from
    atom-pair bit-vector fingerprints (via ``GetTanimotoSimMat``), converts
    it to ``1 - similarity`` and feeds that matrix to ``TSNE``.

    The result is stored on ``self.tsne_result`` as a DataFrame with columns
    ``PC1``, ``PC2``, ``LIBRARY``, ``SMILES`` and ``NAME``, indexed by
    ``TIPO`` (a copy of ``LIBRARY``).  Nothing is returned.
    """
    Database = self.Database2

    molecules = [Chem.MolFromSmiles(s) for s in list(Database["SMILES"])]
    fingerprints = [Pairs.GetAtomPairFingerprintAsBitVect(m) for m in molecules]
    lower_triangle = GetTanimotoSimMat(fingerprints)

    # Start from all ones (the diagonal stays 1) and write each pairwise
    # value at (row, col) and at its mirror (col, row).
    count = len(fingerprints)
    similarity = np.ones([count, count])
    rows, cols = np.tril_indices(n=count, m=count, k=-1)
    similarity[rows, cols] = lower_triangle
    similarity[cols, rows] = lower_triangle

    distances = 1 - similarity

    # NOTE(review): no ``metric`` is passed, so TSNE receives the rows of the
    # distance matrix as ordinary feature vectors -- confirm this is intended.
    reducer = TSNE(
        n_components=2,
        init='pca',
        random_state=1992,
        angle=0.3,
        perplexity=self.perplexity,
    )
    embedding = reducer.fit_transform(distances)

    frame = pd.DataFrame(data=embedding, columns=["PC1", "PC2"])
    annotations = (
        ("LIBRARY", Database.LIBRARY),
        ("TIPO", Database.LIBRARY),
        ("SMILES", Database.SMILES),
        ("NAME", Database.NAME),
    )
    for column, values in annotations:
        # list() drops the source index so rows are matched by position.
        frame[column] = list(values)

    self.tsne_result = frame.set_index('TIPO')
def compute_pca(self):
    """Project the molecules of ``self.Database2`` to 2-D with PCA.

    Reads the ``SMILES``, ``LIBRARY`` and ``NAME`` columns of
    ``self.Database2``, builds a symmetric pairwise similarity matrix from
    atom-pair bit-vector fingerprints (via ``GetTanimotoSimMat``) and fits a
    two-component, whitened PCA on it.

    Side effects:
        ``self.pca_result`` -- the projection indexed by ``TIPO`` (a copy of
        ``LIBRARY``); ``self.a`` / ``self.b`` -- explained variance of the
        first / second component, as a percentage rounded to 2 decimals.

    Returns:
        The projection DataFrame *before* re-indexing, with columns ``PC1``,
        ``PC2``, ``LIBRARY``, ``TIPO``, ``SMILES`` and ``NAME``.
    """
    Database = self.Database2
    smiles = list(Database.SMILES)
    mols = [Chem.MolFromSmiles(x) for x in smiles]
    fps = [Pairs.GetAtomPairFingerprintAsBitVect(x) for x in mols]
    tanimoto_sim_mat_lower_triangle = GetTanimotoSimMat(fps)

    # Start from all ones (diagonal stays 1), fill the strictly lower
    # triangle, then mirror it into the upper triangle.
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol, n_mol])
    i_lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    i_upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[i_lower] = tanimoto_sim_mat_lower_triangle
    similarity_matrix[i_upper] = similarity_matrix.T[i_upper]

    sklearn_pca = sklearn.decomposition.PCA(
        n_components=2, svd_solver="full", whiten=True)
    sklearn_pca.fit(similarity_matrix)
    pca_result = pd.DataFrame(
        sklearn_pca.transform(similarity_matrix), columns=['PC1', 'PC2'])

    # Assign by position, not by index: pca_result has a fresh 0..n-1 index,
    # so assigning the Series directly would misalign (or yield NaN) whenever
    # Database2 carries a different index.  compute_tsne does the same.
    pca_result["LIBRARY"] = list(Database.LIBRARY)
    pca_result["TIPO"] = list(Database.LIBRARY)
    pca_result["SMILES"] = list(Database.SMILES)
    pca_result["NAME"] = list(Database.NAME)
    self.pca_result = pca_result.set_index('TIPO')

    variance = list(sklearn_pca.explained_variance_ratio_)
    self.a = round(variance[0] * 100, 2)
    self.b = round(variance[1] * 100, 2)
    return pca_result
def atom_pairs():
    """
    Atom pair fingerprints, atom descriptor

    Demo routine: builds three small molecules and prints, in order, the
    non-zero elements of the last molecule's atom-pair fingerprint, the
    explanation of pair score 558115, and the Dice similarity of the first
    two molecules (first with count fingerprints, then with bit vectors).
    """
    # Generate molecules
    example_smiles = ('C1CCC1OCC', 'CC(C)OCC', 'CCOCC')
    molecules = [Chem.MolFromSmiles(smiles) for smiles in example_smiles]

    count_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in molecules]

    # Bits and their counts for the last fingerprint, as a dictionary
    nonzero_elements = count_fps[-1].GetNonzeroElements()
    print(nonzero_elements)

    # Explanation of the bit score.
    print(Pairs.ExplainPairScore(558115))

    # Dice similarity; the usual metric for atom-pair fingerprints
    print(DataStructs.DiceSimilarity(count_fps[0], count_fps[1]))

    # Atom descriptor without counts
    bit_fps = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]
    print(DataStructs.DiceSimilarity(bit_fps[0], bit_fps[1]))
def fingerprint_smile(smile, fp_type):
    """Fingerprint the molecule obtained from ``get_murcko_smile(smile)``.

    :param smile: SMILES string, passed through ``get_murcko_smile`` before
        parsing (presumably a Murcko scaffold -- verify against that helper)
    :param fp_type: ``"atom-pair"`` or ``"maccs"``; any other value selects
        a 1024-bit Morgan bit vector
    :return: the fingerprint produced by the selected RDKit generator
    """
    scaffold_mol = Chem.MolFromSmiles(get_murcko_smile(smile))

    if fp_type == "atom-pair":
        return Pairs.GetAtomPairFingerprintAsBitVect(scaffold_mol)
    if fp_type == "maccs":
        return MACCSkeys.GenMACCSKeys(scaffold_mol)

    # NOTE(review): ``radius`` is not a parameter or local of this function;
    # it is resolved from an enclosing/module scope -- confirm it is defined.
    return AllChem.GetMorganFingerprintAsBitVect(scaffold_mol, radius, nBits=1024)
def fingerprint(mol, fp_type="DL"):
    """Return the fingerprint of ``mol`` selected by ``fp_type``.

    :param mol: molecule handed to the selected RDKit generator
    :param fp_type: one of ``"DL"``, ``"circular"``, ``"MACCS"``,
        ``"torsions"`` or ``"pharm"``
    :return: the fingerprint, or ``None`` when ``fp_type`` is not recognised
    """
    # Lambdas keep every RDKit lookup lazy: only the chosen generator runs.
    generators = {
        "DL": lambda: FingerprintMols.FingerprintMol(mol),
        "circular": lambda: AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024),
        "MACCS": lambda: MACCSkeys.GenMACCSKeys(mol),
        # NOTE(review): "torsions" yields an atom-pair bit vector, not a
        # topological-torsion fingerprint -- confirm this is intended.
        "torsions": lambda: Pairs.GetAtomPairFingerprintAsBitVect(mol),
        "pharm": lambda: Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory),
    }
    generate = generators.get(fp_type)
    if generate is None:
        return None
    return generate()
def atom_fp(Library):
    """Empirical CDF of pairwise atom-pair similarities on a 20 % sample.

    Seeds the global ``random`` generator with 43, draws
    ``round(len(Library) * .2)`` SMILES from ``Library`` without
    replacement, fingerprints them and compares every pair with
    ``DataStructs.FingerprintSimilarity``.

    :param Library: sequence of SMILES strings
    :return: ``(sim, y)`` -- the ascending list of pairwise similarities and
        the matching cumulative fractions ``1/len(sim) .. 1``
    """
    random.seed(43)
    sample_size = round(len(Library) * .2)
    sampled_smiles = random.sample(Library, sample_size)

    molecules = [Chem.MolFromSmiles(smiles) for smiles in sampled_smiles]
    fingerprints = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]

    sim = sorted(
        DataStructs.FingerprintSimilarity(second, first)
        for first, second in it.combinations(fingerprints, 2)
    )
    y = np.arange(1, len(sim) + 1) / len(sim)
    return sim, y
def calculate_atom_pair_fp(molecular_df, col):
    """
    Calculates atom pair fingerprint

    Adds an ``atom_pair_fp`` column to ``molecular_df`` in place.  Each entry
    is the atom-pair bit-vector fingerprint of the SMILES in ``col``, or the
    string ``'N/A'`` when the SMILES cannot be parsed or fingerprinted.

    :param molecular_df: pandas data frame containing molecules
    :param col: column with molecules present (SMILES strings)
    :return: the same ``molecular_df`` object, with the new column
    :raises KeyError: if ``col`` is not a column of ``molecular_df``
    """
    fps = []
    for smiles in molecular_df[col]:
        try:
            mol = Chem.MolFromSmiles(smiles)
            # MolFromSmiles signals an unparsable SMILES by returning None.
            fp = None if mol is None else Pairs.GetAtomPairFingerprintAsBitVect(mol)
        except Exception:
            # Best-effort per row (e.g. a non-string cell); unlike a bare
            # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
            fp = None
        fps.append('N/A' if fp is None else fp)
    molecular_df['atom_pair_fp'] = fps
    return molecular_df
def Fingerprints(mols, fingerprint):
    """Compute the fingerprint named ``fingerprint`` for every molecule.

    The name is checked against the module-level collections ``indigofps``,
    ``rdkitfps``, ``rdkitnonbitfps`` and ``rdkitestatefps``, in that order.

    :param mols: iterable of molecules (Indigo molecules for Indigo
        fingerprints, otherwise passed to the RDKit generators)
    :param fingerprint: fingerprint name
    :return: list with one fingerprint per molecule, or ``None`` when the
        name is not handled by any family
    """

    def build_with(generators):
        # One fingerprint per molecule, or None if this family has no
        # generator for the requested name (the caller then falls through).
        generate = generators.get(fingerprint)
        if generate is None:
            return None
        return [generate(mol) for mol in mols]

    # Indigo fingerprints
    if fingerprint in indigofps:
        return [mol.fingerprint(fingerprint) for mol in mols]

    # RDKit fingerprints (lambdas keep module lookups lazy)
    if fingerprint in rdkitfps:
        result = build_with({
            "atompair": lambda mol: Pairs.GetAtomPairFingerprintAsBitVect(mol),
            "avalon": lambda mol: pyAvalonTools.GetAvalonFP(mol),
            "daylight": lambda mol: Chem.RDKFingerprint(mol, fpSize=2048),
            "maccs": lambda mol: MACCSkeys.GenMACCSKeys(mol),
            "morgan": lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024),
            "pharm2d": lambda mol: Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory),
            "topological": lambda mol: FingerprintMols.FingerprintMol(mol),
        })
        if result is not None:
            return result

    # RDKit non-bit (integer or float) fingerprints
    if fingerprint in rdkitnonbitfps:
        result = build_with({
            "sheridan": lambda mol: Sheridan.GetBPFingerprint(mol),
            "topotorsion": lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol),
        })
        if result is not None:
            return result

    # E-state fingerprints
    if fingerprint in rdkitestatefps:
        result = build_with({
            "estate1": lambda mol: Fingerprinter.FingerprintMol(mol)[0],
            "estate2": lambda mol: Fingerprinter.FingerprintMol(mol)[1],
        })
        if result is not None:
            return result

    # unknown fingerprint
    return None
def atom_pairs_fp(SMILES, Library):
    """Pairwise atom-pair similarity distribution and summary statistics.

    Every SMILES is fingerprinted with
    ``Pairs.GetAtomPairFingerprintAsBitVect`` and all unordered pairs are
    compared with ``DataStructs.FingerprintSimilarity``.

    :param SMILES: iterable of SMILES strings
    :param Library: label stored (as ``str``) in the ``Library`` column
    :return: dict with ``"sim"`` (ascending list of similarities), ``"y"``
        (cumulative fractions ``1/len(sim) .. 1``) and ``"df"`` (one-row
        DataFrame with MIN, 1Q, MEDIAN, MEAN, 3Q, MAX, STD and Library; all
        statistics rounded to 2 decimals)
    :raises ValueError: when there are no pairs to summarise (fewer than two
        molecules); ``statistics.stdev`` additionally needs at least two
        similarity values
    """
    ms = [Chem.MolFromSmiles(smiles) for smiles in SMILES]
    fp = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in ms]
    sim = sorted(
        DataStructs.FingerprintSimilarity(second, first)
        for first, second in it.combinations(fp, 2)
    )
    # y axis of the empirical cumulative distribution
    y = np.arange(1, len(sim) + 1) / len(sim)

    # Statistical values.  Everything is rounded to 2 decimals: similarities
    # lie in [0, 1], so rounding the quartile/median to 0 decimals (as the
    # previous code did) collapsed them to 0 or 1.
    stat = {
        "MIN": [round(min(sim), 2)],
        "1Q": [round(np.percentile(sim, 25), 2)],
        "MEDIAN": [round(st.median(sim), 2)],
        "MEAN": [round(st.mean(sim), 2)],
        "3Q": [round(np.percentile(sim, 75), 2)],
        "MAX": [round(max(sim), 2)],
        "STD": [round(st.stdev(sim), 2)],
        "Library": [str(Library)],
    }
    df = pd.DataFrame.from_dict(stat)
    return {"sim": sim, "y": y, "df": df}
def ATOMPAIRSfpDataFrame(chempandas, namecol, smicol):
    """
    AtomPairs-based fingerprints 2048 bits.

    Parses the SMILES in positional column ``smicol`` of ``chempandas``,
    skips rows whose SMILES cannot be parsed, and returns one fingerprint
    row per remaining molecule.

    :param chempandas: DataFrame with at most ``MAXLINES`` rows
    :param namecol: positional index of the column holding molecule names
    :param smicol: positional index of the column holding SMILES strings
    :return: DataFrame indexed by molecule name with one column per
        fingerprint bit, plus ``SMILES`` (as written by ``Chem.MolToSmiles``)
        and ``CHEMBL`` (copy of the index)
    """
    assert chempandas.shape[0] <= MAXLINES

    molsmi = []
    # enumerate keeps the row position tied to each SMILES, so a molecule is
    # always named from its own row even when earlier rows were skipped.
    for row, smiles in enumerate(chempandas.iloc[:, smicol]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        mol.SetProp("_Name", chempandas.iloc[row, namecol])
        molsmi.append(mol)

    # ATOMPAIRS Fingerprints.
    fps = [Pairs.GetAtomPairFingerprintAsBitVect(x) for x in molsmi]
    fpsmat = np.matrix(fps)
    df = DataFrame(fpsmat, index=[x.GetProp("_Name") for x in molsmi])
    df['SMILES'] = [Chem.MolToSmiles(x) for x in molsmi]
    df['CHEMBL'] = df.index
    return df
def _getFPSStream(f, mols, type='morgan', radius=2, n_bits=2048): f.write("#FPS1\n#num_bits=%s\n#software=RDKit/%s\n" % (n_bits, rdBase.rdkitVersion)) for i, mol in enumerate(mols): if mol: idx = i if mol.HasProp('chembl_id'): idx = mol.GetProp('chembl_id') elif Chem.INCHI_AVAILABLE: try: Chem.SanitizeMol(mol) idx = Chem.InchiToInchiKey(Chem.MolToInchi(mol)) except: pass if type == 'morgan': fp = rdMolDescriptors.GetMorganFingerprintAsBitVect( mol, radius, nBits=n_bits) elif type == 'pair': fp = Pairs.GetAtomPairFingerprintAsBitVect(mol) elif type == 'maccs': fp = MACCSkeys.GenMACCSKeys(mol) f.write("%s\t%s\n" % (DataStructs.BitVectToFPSText(fp), idx))
def rdk_fingerprint(smi, fp_type="rdkit", size=1024, output="bit"):
    """Fingerprint a SMILES string with the RDKit generator ``fp_type``.

    :param smi: SMILES string
    :param fp_type: ``"rdkit"``, ``"maccs"``, ``"TopologicalTorsion"``,
        ``"Avalon"``, ``"AtomPair"`` or ``"Morgan"``
    :param size: number of bits; only forwarded for ``"AtomPair"`` and
        ``"Morgan"`` (the other generators are called with the molecule only)
    :param output: ``"bit"`` returns the list of on-bit indices (via
        ``GetOnBits``); anything else returns ``np.array(fp)``
    :raises IOError: if ``fp_type`` is not one of the names above
    """
    # Generators that are called with the molecule as their only argument.
    single_argument_generators = {
        "rdkit": Chem.rdmolops.RDKFingerprint,
        "maccs": MACCSkeys.GenMACCSKeys,
        "TopologicalTorsion": Torsions.GetTopologicalTorsionFingerprint,
        "Avalon": pyAvalonTools.GetAvalonFP
    }
    molecule = Chem.MolFromSmiles(smi)

    generator = single_argument_generators.get(fp_type)
    if generator is not None:
        fp = generator(molecule)
    elif fp_type == "AtomPair":
        fp = Pairs.GetAtomPairFingerprintAsBitVect(molecule, nBits=size)
    elif fp_type == "Morgan":
        fp = GetMorganFingerprintAsBitVect(molecule, 2, nBits=size)
    else:
        raise IOError('invalid fingerprint type')

    if output == "bit":
        return list(fp.GetOnBits())
    return np.array(fp)
def Calc_AtomPairs_Bit(self):
    """Return atom-pair bit-vector fingerprints for every entry of ``self.sd``."""
    bit_fingerprints = []
    for molecule in self.sd:
        bit_fingerprints.append(Pairs.GetAtomPairFingerprintAsBitVect(molecule))
    return bit_fingerprints
def calculate_similarity_vector(smile_pair):
    """
    Calculate fingerprints between two smile terms using different
    fingerprinters, and use different similarity metrics to calculate the
    difference between those fingerprints.

    :param smile_pair: two-item sequence ``(smile1, smile2)``
    :return: flat list built by concatenating, per fingerprint type and in a
        fixed order, the values returned by ``get_similarity_all`` or
        ``get_similarity_subset``
    """
    first_smiles, second_smiles = smile_pair
    first_mol = Chem.MolFromSmiles(first_smiles)
    second_mol = Chem.MolFromSmiles(second_smiles)

    # See http://www.rdkit.org/docs/GettingStartedInPython.html for the
    # fingerprint families used below.
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import rdMolDescriptors

    def morgan(mol):
        # Morgan (circular) fingerprint, radius 2
        return rdMolDescriptors.GetMorganFingerprint(mol, 2)

    def feature_morgan(mol):
        # Morgan fingerprint, radius 2, using feature invariants
        return rdMolDescriptors.GetMorganFingerprint(mol, 2, useFeatures=True)

    # (fingerprint generator, similarity routine); order defines the layout
    # of the returned vector.  Layered and Pattern fingerprints are not used.
    pipeline = (
        # RDK topological fingerprint
        (Chem.RDKFingerprint, get_similarity_all),
        # Topological fingerprints via FingerprintMols
        (FingerprintMols.FingerprintMol, get_similarity_all),
        # MACCS keys
        (MACCSkeys.GenMACCSKeys, get_similarity_all),
        # Atom pairs, bit-vector form
        (Pairs.GetAtomPairFingerprintAsBitVect, get_similarity_all),
        # Topological torsions
        (Torsions.GetTopologicalTorsionFingerprint, get_similarity_subset),
        # Morgan fingerprints, without and with features
        (morgan, get_similarity_subset),
        (feature_morgan, get_similarity_subset),
    )

    molecule_similarity = []
    for make_fingerprint, compare in pipeline:
        first_fp = make_fingerprint(first_mol)
        second_fp = make_fingerprint(second_mol)
        molecule_similarity.extend(compare(first_fp, second_fp))
    return molecule_similarity
def atom_pair_fp(self):
    """Return atom-pair bit-vector fingerprints for ``self.Data.SMILES``.

    All SMILES are parsed first; the resulting molecules are then
    fingerprinted in the same order.
    """
    molecules = []
    for smiles in self.Data.SMILES:
        molecules.append(Chem.MolFromSmiles(smiles))

    fingerprints = []
    for molecule in molecules:
        fingerprints.append(Pairs.GetAtomPairFingerprintAsBitVect(molecule))
    return fingerprints
def atom_pairs(m):
    """Return the atom-pair bit-vector fingerprint of molecule ``m``."""
    bit_vector = Pairs.GetAtomPairFingerprintAsBitVect(m)
    return bit_vector
def FptAtomPairs(rdkmol, fptype='bit'):
    """Return the atom-pair fingerprint of ``rdkmol``.

    :param rdkmol: molecule passed to the RDKit ``Pairs`` generators
    :param fptype: ``'bit'`` (case-insensitive) selects the bit-vector form;
        any other string selects ``Pairs.GetAtomPairFingerprint``
    """
    wants_bit_vector = fptype.lower() == 'bit'
    if not wants_bit_vector:
        return Pairs.GetAtomPairFingerprint(rdkmol)
    return Pairs.GetAtomPairFingerprintAsBitVect(rdkmol)