def fp_tt_std_mp(mol, mol_can, i, nBits, chiral):
    """Fingerprint a molecule and its canonical counterpart.

    Both fingerprints are hashed topological-torsion fingerprints built with
    ``nBits`` and ``includeChirality=chiral``.  When ``mol`` and ``mol_can``
    are the very same object the fingerprint is computed once and reused.

    Returns:
        The tuple ``(i, fp_mol, fp_can)``; ``i`` is passed through untouched.
    """

    def hashed_tt(target):
        return Torsions.GetHashedTopologicalTorsionFingerprint(
            target, nBits=nBits, includeChirality=chiral)

    fp_mol = hashed_tt(mol)
    fp_can = fp_mol if mol is mol_can else hashed_tt(mol_can)
    return (i, fp_mol, fp_can)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score every molecule in ``fragments_file`` against the model's actives.

    Each input line is a JSON object with ``"smiles"`` and ``"name"`` keys.
    For every line the maximum Tanimoto similarity between the molecule's
    topological-torsion fingerprint and the fingerprints of the active
    molecules listed in ``model_configuration["data"]["active"]`` is written
    to ``output_file`` as one JSON object (``name``, ``score``) per line.

    ``descriptors_file`` is not used by this method.
    """
    inputoutput_utils.create_parent_directory(output_file)
    model_data = model_configuration["data"]
    # Forwarded as the second positional argument of the RDKit fingerprinter.
    torsion_size = int(
        model_configuration["configuration"]["fragments"][0]["size"])

    def fingerprint_of(quoted_smiles):
        parsed = Chem.MolFromSmiles(quoted_smiles.strip("\""))
        return Torsions.GetTopologicalTorsionFingerprintAsIntVect(
            parsed, torsion_size)

    reference_fps = [fingerprint_of(entry) for entry in model_data["active"]]

    with open(output_file, "w", encoding="utf-8") as output_stream, \
            open(fragments_file, "r", encoding="utf-8") as input_stream:
        for position, raw_record in enumerate(input_stream):
            record = json.loads(raw_record)
            query_fp = fingerprint_of(record["smiles"])
            best = max([
                DataStructs.TanimotoSimilarity(query_fp, reference)
                for reference in reference_fps
            ])
            score = {"name": record["name"], "score": best}
            # Newline goes *between* records so the file has no trailing one.
            if position:
                output_stream.write("\n")
            json.dump(score, output_stream)
def fp_tt_std(mols, nBits, chiral):
    """Attach hashed topological-torsion fingerprints to every entry of ``mols``.

    ``mols`` maps keys to dicts holding ``"mol"`` and ``"mol_can"``.  Each
    entry is updated in place with ``"fp"`` (fingerprint of ``"mol"``) and
    ``"fp_can"`` (fingerprint of ``"mol_can"``); when both molecules are the
    same object the fingerprint is shared rather than recomputed.
    """
    for key in mols:
        entry = mols[key]
        primary_fp = Torsions.GetHashedTopologicalTorsionFingerprint(
            entry["mol"], nBits=nBits, includeChirality=chiral)
        entry["fp"] = primary_fp
        if entry["mol"] is entry["mol_can"]:
            entry["fp_can"] = primary_fp
            continue
        entry["fp_can"] = Torsions.GetHashedTopologicalTorsionFingerprint(
            entry["mol_can"], nBits=nBits, includeChirality=chiral)
def testGetTopologicalTorsionFingerprintAsIds(self):
    """The three torsion fingerprint entry points agree for 'C1CCCCN1'."""
    ring = Chem.MolFromSmiles('C1CCCCN1')

    sparse_fp = Torsions.GetTopologicalTorsionFingerprint(ring)
    self.assertEqual(sparse_fp.GetNonzeroElements(),
                     {4437590049: 2, 8732557345: 2, 4445978657: 2})

    # The id form lists every torsion code once per occurrence.
    id_list = Torsions.GetTopologicalTorsionFingerprintAsIds(ring)
    self.assertEqual(
        sorted(id_list),
        [4437590049, 4437590049, 4445978657, 4445978657, 8732557345,
         8732557345])

    int_vect_fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(ring)
    self.assertEqual(int_vect_fp.GetNonzeroElements(),
                     {4437590049: 2, 8732557345: 2, 4445978657: 2})
def testTorsionsRegression(self):
    """Compare freshly computed torsion fingerprints with the pickled references.

    For every molecule in ``self.mols`` the fingerprint must equal the stored
    one at the same index and differ from the one stored at the previous
    index.  On a mismatch the differing entries are printed before the
    assertion fires, to make the failure diagnosable.
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.tts.pkl.gz')
    # Context manager so the gzip handle is closed even if unpickling fails.
    with gzip.open(reference_path, 'rb') as inF:
        torsions = cPickle.load(inF, encoding='bytes')
    for i, m in enumerate(self.mols):
        tt = Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
        if tt != torsions[i]:
            print(Chem.MolToSmiles(m))
            computed = tt.GetNonzeroElements()
            stored = torsions[i].GetNonzeroElements()
            # dict.items() / ``in`` replace iteritems() / has_key(), which do
            # not exist on Python 3 and made this debug path crash.
            for k, v in computed.items():
                if k in stored:
                    if stored[k] != v:
                        print('>>>1', k, v, stored[k])
                else:
                    print('>>>2', k, v)
            for k, v in stored.items():
                if k in computed:
                    if computed[k] != v:
                        print('>>>3', k, v, computed[k])
                else:
                    print('>>>4', k, v)
        self.assertTrue(tt == torsions[i])
        self.assertTrue(tt != torsions[i - 1])
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) for the cleaned SMILES.

    ``typeFP`` is one of ``"Mol"``, ``"MACCS"``, ``"pairs"``, ``"Torsion"``,
    ``"Morgan"`` or ``"All"``.  The molecule parsed from ``self.smiclean`` is
    stored in ``self.mol`` and the resulting name -> fingerprint dict in
    ``self.FP``.

    Returns:
        1 (after appending a message to ``self.log``) when no ``smiclean``
        attribute has been set on the instance, otherwise 0.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem import AllChem

    if "smiclean" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        return 1

    self.mol = Chem.MolFromSmiles(self.smiclean)

    def wanted(label):
        return typeFP == label or typeFP == "All"

    dFP = {}
    if wanted("Mol"):
        dFP["Mol"] = FingerprintMols.FingerprintMol(self.mol)
    if wanted("MACCS"):
        dFP["MACCS"] = MACCSkeys.GenMACCSKeys(self.mol)
    if wanted("pairs"):
        dFP["pairs"] = Pairs.GetAtomPairFingerprint(self.mol)
    if wanted("Torsion"):
        dFP["Torsion"] = Torsions.GetTopologicalTorsionFingerprint(self.mol)
    if wanted("Morgan"):
        dFP["Morgan"] = AllChem.GetMorganFingerprint(self.mol, 2)
    self.FP = dFP
    return 0
def _torsionsFingerprintsClustering(rdkit_mols):
    """
    Build the pairwise matrix used for clustering on torsion fingerprints.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    dicematrix: np.array
        One row per molecule, each produced by ``DiceDistances`` applied to
        that molecule's fingerprint and the full fingerprint list.
    """
    from rdkit.Chem.AtomPairs import Torsions  # Topological Torsions

    fingerprints = [
        Torsions.GetHashedTopologicalTorsionFingerprint(mol)
        for mol in tqdm(rdkit_mols)
    ]

    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run_jobs = executor(total=len(fingerprints),
                        desc='TorsionsFingerprints Distance')
    rows = run_jobs(
        delayed(DiceDistances)(single_fp, fingerprints)
        for single_fp in fingerprints)
    return np.array(rows)
def testTorsionsRegression(self):
    """Check computed torsion fingerprints against the pickled reference set.

    Each molecule's fingerprint must equal the reference at the same index
    and differ from the reference at the previous index.
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.tts.pkl.gz')
    # Context manager so the gzip handle is released even if loading fails
    # (the handle was previously left open for the lifetime of the test).
    with gzip.open(reference_path, 'rb') as inF:
        torsions = cPickle.load(inF, encoding='bytes')
    for i, m in enumerate(self.mols):
        tt = Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)
        if tt != torsions[i]:  # pragma: nocover
            debugFingerprint(m, tt, torsions[i])
        self.assertEqual(tt, torsions[i])
        self.assertNotEqual(tt, torsions[i - 1])
def torsions_fp(self):
    """Build a (feature, label) array from the CSV at ``self.csv_path``.

    The CSV must provide ``Smiles`` and ``Label`` columns.  For every SMILES
    the total count of its topological-torsion fingerprint is used as the
    single feature.  Rows whose fingerprint cannot be computed are dropped,
    labels are integer-encoded with ``LabelEncoder``, and rows with duplicate
    feature values are removed.

    Returns:
        A float32 NumPy array with the feature in column 0 and the encoded
        label in column 1.
    """
    df = pd.read_csv(self.csv_path)
    smiles_list = df['Smiles'].tolist()
    fingerprints = []
    not_found = []
    for i, smiles in enumerate(tqdm(smiles_list)):
        try:
            mol = Chem.MolFromSmiles(smiles)
            fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
            # Bit vector here will be huge, which is why taking TotalVal()
            fingerprints.append(fp.GetTotalVal())
        except Exception:
            # Best-effort: a molecule that cannot be fingerprinted is skipped.
            # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit
            # working during long runs.
            fingerprints.append(np.nan)
            not_found.append(i)
    # Positions double as index labels because the frame still carries the
    # default index produced by read_csv.
    df.drop(not_found, axis=0, inplace=True)
    print('Number of FPs not found: {}'.format(len(not_found)))
    df.reset_index(drop=True, inplace=True)

    labelencoder = LabelEncoder()
    Y = labelencoder.fit_transform(df['Label'].values)
    Y = Y.reshape(Y.shape[0], 1)
    print('Output shape: {}'.format(Y.shape))

    fp_array = np.asarray(fingerprints, dtype=object)
    X = np.delete(fp_array, not_found, axis=0)
    X = np.vstack(X).astype(np.float32)
    print('Input shape: {}'.format(X.shape))

    final_array = np.concatenate((X, Y), axis=1)
    # Removing rows, from final_array, where duplicate FPs are present
    final_array_slice = final_array[:, 0:(final_array.shape[1] - 1)]
    _, unq_row_indices = np.unique(final_array_slice,
                                   return_index=True,
                                   axis=0)
    final_array_unique = final_array[unq_row_indices]
    print('Number of Duplicate FPs: {}'.format(
        final_array.shape[0] - final_array_unique.shape[0]))
    print('Final Numpy array shape: {}'.format(final_array_unique.shape))
    print('Type of final array: {}'.format(type(final_array_unique)))
    return np.asarray(final_array_unique, dtype=np.float32)
def GetMolFingerprint(mol,maxPathLength):
    """Fluorine-rooted hashed topological-torsion count fingerprint.

    If ``mol`` contains a CF3 group, the reaction
    ``[*:1]-C(F)(F)F>>[*:1]-F`` is applied first and the first product
    (sanitized, with the original molecule's properties copied over) is used
    instead.  Fingerprints restricted to the atoms of the first ``F`` match
    are then generated for ``targetSize=maxPathLength`` and for every size in
    ``range(2, maxPathLength)``, and their counts are summed bit by bit.

    Returns:
        The accumulated hashed fingerprint.
    """
    FQuery = Chem.MolFromSmarts('F')
    CF3Query = Chem.MolFromSmarts('[$(C(F)(F)F)]')
    CF3Rxn = AllChem.ReactionFromSmarts('[*:1]-C(F)(F)F>>[*:1]-F')
    if mol.HasSubstructMatch(CF3Query):
        p = CF3Rxn.RunReactants((mol,))[0][0]
        Chem.SanitizeMol(p)
        for nm in mol.GetPropNames():
            p.SetProp(nm, mol.GetProp(nm))
        mol = p
    match = mol.GetSubstructMatch(FQuery)
    # NOTE(review): 9192 is not a power of two; 8192 may have been intended.
    # Kept unchanged so existing fingerprints stay comparable.
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(
        mol, nBits=9192, targetSize=maxPathLength, fromAtoms=match)
    for i in range(2, maxPathLength):
        nfp = Torsions.GetHashedTopologicalTorsionFingerprint(
            mol, nBits=9192, targetSize=i, fromAtoms=match)
        # items() works on both Python 2 and 3; iteritems() raised
        # AttributeError on Python 3.
        for bit, v in nfp.GetNonzeroElements().items():
            fp[bit] = fp[bit] + v
    return fp
def GetTorsionFPs(mol, nBits = 2048, binary = True):
    '''
    Hashed topological-torsion fingerprint of ``mol`` as a NumPy array.

    :param mol: RDKit molecule to fingerprint
    :param nBits: length passed to the RDKit hashed fingerprint
    :param binary: when True the output array has dtype ``bool``,
        otherwise ``np.int8``
    :return: array filled by ``DataStructs.ConvertToNumpyArray``
    '''
    fp = Torsions.GetHashedTopologicalTorsionFingerprint(mol, nBits = nBits)
    # ``np.bool`` was only an alias of the builtin ``bool`` and has been
    # removed from NumPy; the builtin works on every NumPy version.
    arr = np.zeros((0,), dtype=bool if binary else np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
def getCountInfo(m, fpType):
    """Return the non-zero ``{feature id: count}`` entries of a fingerprint.

    ``fpType`` selects the fingerprint: ``'AtomPair'``/``'atom'``,
    ``'morgan'``/``'circular'`` (radius 2) or ``'Topological'``/``'topo'``
    (topological torsions, with keys converted to ``int``).  Any other value
    yields ``None``.
    """
    if fpType == 'AtomPair' or fpType.lower() == 'atom':
        return Pairs.GetAtomPairFingerprint(m).GetNonzeroElements()

    if fpType.lower() in ('morgan', 'circular'):
        return AllChem.GetMorganFingerprint(m, 2).GetNonzeroElements()

    if fpType == 'Topological' or fpType.lower() == 'topo':
        raw_counts = Torsions.GetTopologicalTorsionFingerprint(
            m).GetNonzeroElements()
        return {int(key): count for key, count in raw_counts.items()}

    return None
def Fingerprints(mols, fingerprint):
    """Compute the fingerprint called ``fingerprint`` for each molecule.

    The name is checked, in order, against the module-level collections
    ``indigofps``, ``rdkitfps``, ``rdkitnonbitfps`` and ``rdkitestatefps``.

    Returns:
        A list with one fingerprint per entry of ``mols``, or ``None`` when
        the name is not handled by any group.
    """
    # Indigo fingerprints
    if fingerprint in indigofps:
        return [molecule.fingerprint(fingerprint) for molecule in mols]

    # RDKit fingerprints
    if fingerprint in rdkitfps:
        if fingerprint == "atompair":
            return [
                Pairs.GetAtomPairFingerprintAsBitVect(molecule)
                for molecule in mols
            ]
        if fingerprint == "avalon":
            return [pyAvalonTools.GetAvalonFP(molecule) for molecule in mols]
        if fingerprint == "daylight":
            return [
                Chem.RDKFingerprint(molecule, fpSize=2048)
                for molecule in mols
            ]
        if fingerprint == "maccs":
            return [MACCSkeys.GenMACCSKeys(molecule) for molecule in mols]
        if fingerprint == "morgan":
            return [
                AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)
                for molecule in mols
            ]
        if fingerprint == "pharm2d":
            return [
                Generate.Gen2DFingerprint(molecule, Gobbi_Pharm2D.factory)
                for molecule in mols
            ]
        if fingerprint == "topological":
            return [
                FingerprintMols.FingerprintMol(molecule) for molecule in mols
            ]

    # RDKit non-bit (integer or float) fingerprints
    if fingerprint in rdkitnonbitfps:
        if fingerprint == "sheridan":
            return [Sheridan.GetBPFingerprint(molecule) for molecule in mols]
        if fingerprint == "topotorsion":
            return [
                Torsions.GetTopologicalTorsionFingerprint(molecule)
                for molecule in mols
            ]

    # E-state fingerprints: FingerprintMol returns a pair, one half per name
    if fingerprint in rdkitestatefps:
        if fingerprint == "estate1":
            return [
                Fingerprinter.FingerprintMol(molecule)[0] for molecule in mols
            ]
        if fingerprint == "estate2":
            return [
                Fingerprinter.FingerprintMol(molecule)[1] for molecule in mols
            ]

    # unknown fingerprint
    return None
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) of ``self.mol`` into ``self.d_FP``.

    ``typeFP`` is ``"Mol"``, ``"MACCS"``, ``"pairs"``, ``"Torsion"``,
    ``"Morgan"`` or ``"All"``.  If the instance has no ``mol`` attribute a
    message is appended to ``self.log`` and ``self.err`` is set to 1 instead.
    """
    if "mol" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        self.err = 1
        return

    # Builders are wrapped in lambdas so only the selected ones are evaluated.
    builders = (
        ("Mol", lambda mol: FingerprintMols.FingerprintMol(mol)),
        ("MACCS", lambda mol: MACCSkeys.GenMACCSKeys(mol)),
        ("pairs", lambda mol: Pairs.GetAtomPairFingerprint(mol)),
        ("Torsion", lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol)),
        ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
    )
    d_FP = {}
    for label, build in builders:
        if typeFP == label or typeFP == "All":
            d_FP[label] = build(self.mol)
    self.d_FP = d_FP
def CalculateTopologicalTorsionFingerprint(
        mol: Chem.Mol, rtype: str = 'countstring',
        bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate Topological Torsion fingerprints.

    :param mol: molecule to fingerprint
    :param rtype: Type of output, may either be:
                  countstring (default), returns the folded counts as a
                  ``;``-separated string of integers
                  rdkit, returns the native RDKit fingerprint object
                  dict, returns ``{'TopolTorsions_<id>': count}`` for the
                  non-zero entries
    :param bits: Number of folded bits (ignored if rtype != 'countstring')
    :return: one of the three forms above, depending on ``rtype`` (the
             return annotation lists the possible types, not a tuple)
    """
    res = Torsions.GetTopologicalTorsionFingerprint(mol)
    if rtype == 'rdkit':
        return res
    counts = res.GetNonzeroElements()
    if rtype == 'dict':
        return {f'TopolTorsions_{k}': v for k, v in counts.items()}
    # Integer accumulator: counts are whole numbers, and str.join() needs
    # strings -- joining the raw float list raised TypeError.
    folded = np.zeros(bits, dtype=int)
    for k, v in counts.items():
        folded[k % bits] += v
    return ';'.join(map(str, folded.tolist()))
def CalculateTopologicalTorsionFingerprint(mol):
    """
    #################################################################
    Calculate Topological Torsion Fingerprints

    Usage:

        result=CalculateTopologicalTorsionFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a 3-tuple:
            1. the length reported by the fingerprint (``GetLength()``),
            2. a dict of its non-zero elements (``GetNonzeroElements()``),
            3. the fingerprint object itself, usable for similarity
               calculations.
    #################################################################
    """
    torsion_fp = Torsions.GetTopologicalTorsionFingerprint(mol)
    length = torsion_fp.GetLength()
    nonzero = torsion_fp.GetNonzeroElements()
    return length, nonzero, torsion_fp
def TORSIONSfpDataFrame(chempandas, namecol, smicol):
    """
    Topological-torsion fingerprints of the molecules in ``chempandas``.

    ``smicol`` and ``namecol`` are positional column indices of the SMILES
    and name columns.  Rows whose SMILES cannot be parsed are left out.
    The returned DataFrame is indexed by molecule name and carries the
    fingerprint data plus ``SMILES`` (RDKit-generated) and ``CHEMBL`` (copy
    of the index) columns.

    The frame may hold at most ``MAXLINES`` rows.
    """
    assert chempandas.shape[0] <= MAXLINES
    molsmi = []
    # enumerate() ties the name lookup to the row's own position, so names
    # stay aligned with their SMILES even when earlier rows fail to parse.
    for row, smiles in enumerate(chempandas.iloc[:, smicol]):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            continue
        mol.SetProp("_Name", chempandas.iloc[row, namecol])
        molsmi.append(mol)

    # TORSIONS Fingerprints.
    fps = [
        Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in molsmi
    ]
    # NOTE(review): np.matrix over RDKit sparse vectors is kept as-is;
    # confirm the resulting layout is what downstream code expects.
    fpsmat = np.matrix(fps)
    df = DataFrame(fpsmat, index=[x.GetProp("_Name")
                                  for x in molsmi])  # how to name the col?
    df['SMILES'] = [Chem.MolToSmiles(x) for x in molsmi]
    df['CHEMBL'] = df.index
    return (df)
def fp_torsion(mols, key, nBits, chiral):
    """Store a hashed topological-torsion fingerprint under ``"fp"`` per entry.

    For every key of ``mols`` the molecule found at ``mols[...][key]`` is
    fingerprinted with ``nBits`` and ``includeChirality=chiral``; the entry
    dict is updated in place.
    """
    for name in mols:
        record = mols[name]
        record["fp"] = Torsions.GetHashedTopologicalTorsionFingerprint(
            record[key], nBits=nBits, includeChirality=chiral)
# Registry of fingerprint builders: each entry maps a short name to a callable
# taking a molecule ``m``.  ``nbits`` / ``nbits_long`` are looked up when the
# callable runs, not when it is registered.

# Feature-based Morgan fingerprints: folded bit vector, then count variants
# for the values 1, 2 and 3 of the second positional argument.
FPDICT['fcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits)
FPDICT['fcfc2'] = lambda m: Chem.GetMorganFingerprint(m, 1, useFeatures=True)
FPDICT['fcfc4'] = lambda m: Chem.GetMorganFingerprint(m, 2, useFeatures=True)
FPDICT['fcfc6'] = lambda m: Chem.GetMorganFingerprint(m, 3, useFeatures=True)

# "l"-prefixed variants use nbits_long instead of nbits.
FPDICT['lecfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, nBits=nbits_long)
FPDICT['lecfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, nBits=nbits_long)
FPDICT['lfcfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 2, useFeatures=True, nBits=nbits_long)
FPDICT['lfcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect(
    m, 3, useFeatures=True, nBits=nbits_long)

# MACCS keys, atom pairs and topological torsions (unhashed forms).
FPDICT['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m)
FPDICT['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m)
FPDICT['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m)

# Hashed bit-vector forms of atom pairs / topological torsions.
FPDICT['hashap'] = lambda m: Desc.GetHashedAtomPairFingerprintAsBitVect(
    m, nBits=nbits)
FPDICT[
    'hashtt'] = lambda m: Desc.GetHashedTopologicalTorsionFingerprintAsBitVect(
        m, nBits=nbits)

# RDKit path-based fingerprints with maximum path length 5, 6 and 7.
FPDICT['rdk5'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=5, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk6'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=6, fpSize=nbits, nBitsPerHash=2)
FPDICT['rdk7'] = lambda m: Chem.RDKFingerprint(
    m, maxPath=7, fpSize=nbits, nBitsPerHash=2)

# Avalon fingerprints are only registered when the USE_AVALON flag is set.
if USE_AVALON:
    FPDICT['avalon'] = lambda m: pyAv.GetAvalonFP(m, nbits)
    FPDICT['avalon_l'] = lambda m: pyAv.GetAvalonFP(m, nbits_long)
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8, metric='Tanimoto', outMatrix=False):
    '''Clustering Structure based on Fingerprints in RDKit

    filename: Smile format file saving molecules. If set to None, use given "mols"
    mols: Input molecules. No use if set up "filename"
    cutoff: Cutoff using for Butina Clustering (passed on as 1-cutoff)
    fingerprint: Fingerprint to use:
        0 or else: RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    metric: Similarity metric used for the matrix output (case-insensitive):
        Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
        Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
        An unknown name prints a warning and falls back to Tanimoto.
    outMatrix: Change output to a similarity matrix

    Return:
        Default output "clusters, clusterOut":
            clusters: Clusters containing molecule number.
            clusterOut: Molecular Cluster Number in List.
        With outMatrix=True: the similarity matrix as a list of rows
            (also printed, one row per line).

    Raises:
        ValueError: neither "filename" nor "mols" was supplied.
    '''
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        mols = list(Chem.SmilesMolSupplier(filename))
    if mols is None:
        raise ValueError('Either "filename" or "mols" must be given')
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    if fingerprint == 1:
        ## MACCS Fingerprint
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    elif fingerprint == 2:
        ## Atom Pair Fingerprint (AP)
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    elif fingerprint == 3:
        ## Topological Torsion Fingerprint (TT)
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) for mol in mols]
    elif fingerprint == 4:
        ## Morgan Fingerprint similar to ECFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    elif fingerprint == 5:
        ## Morgan Fingerprint similar to FCFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) for mol in mols]
    else:
        ## RDKit Topological Fingerprint (fingerprint==0)
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys are all lower case because the lookup uses metric.lower();
        # the former "rogotGoldberg" key could never be selected.
        metricsAvailable = {
            'tanimoto': DataStructs.TanimotoSimilarity,
            'dice': DataStructs.DiceSimilarity,
            'cosine': DataStructs.CosineSimilarity,
            'sokal': DataStructs.SokalSimilarity,
            'russel': DataStructs.RusselSimilarity,
            'rogotgoldberg': DataStructs.RogotGoldbergSimilarity,
            'allbit': DataStructs.AllBitSimilarity,
            'kulczynski': DataStructs.KulczynskiSimilarity,
            'mcconnaughey': DataStructs.McConnaugheySimilarity,
            'asymmetric': DataStructs.AsymmetricSimilarity,
            'braunblanquet': DataStructs.BraunBlanquetSimilarity,
        }
        if metric.lower() not in metricsAvailable:
            print("The given metric is unknown!")
            metric = 'Tanimoto'
        simMetrics = metricsAvailable[metric.lower()]

        ### Calculate Fingerprint similarity Matrix
        # One fresh list per row: [[0.0]*n]*n would alias every row to the
        # same list, and rows must be indexed [i][j] (not [i, j]).
        simdm = [[0.0] * molnums for _ in range(molnums)]
        # NOTE(review): FingerprintSimilarity is kept from the original; it
        # may only accept bit-vector fingerprints -- confirm for options 2-5.
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                simdm[i][j] = DataStructs.FingerprintSimilarity(fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(fps[j], fps[i], metric=simMetrics)
        for row in simdm:
            print(' '.join('%3.2f' % value for value in row))
        return simdm

    clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
    clusterOut = [0] * len(mols)
    # Cluster numbers start at 1; 0 marks a molecule that is in no cluster.
    for clusterID, cluster in enumerate(clusters, 1):
        for idx in cluster:
            clusterOut[idx] = clusterID
    return clusters, clusterOut
def fp_torsion_taut(query, nBits, chiral):
    """Fingerprint every tautomer of every query entry.

    For each key of ``query`` the molecules in ``query[key]["tauts"]`` are
    converted to hashed topological-torsion fingerprints, which are stored in
    the same entry under ``"fp0"``, ``"fp1"``, ... following the list order.
    """
    for name in query:
        record = query[name]
        for position, tautomer in enumerate(record["tauts"]):
            record[f"fp{position}"] = (
                Torsions.GetHashedTopologicalTorsionFingerprint(
                    tautomer, nBits=nBits, includeChirality=chiral))
def fp_torsion_mp(mol, i, nBits, chiral):
    """Return ``(i, fingerprint)`` for one molecule.

    The fingerprint is the hashed topological-torsion fingerprint of ``mol``
    built with ``nBits`` and ``includeChirality=chiral``; ``i`` is returned
    unchanged so the caller can match the result to its input.
    """
    options = {"nBits": nBits, "includeChirality": chiral}
    torsion_fp = Torsions.GetHashedTopologicalTorsionFingerprint(
        mol, **options)
    return (i, torsion_fp)
def sim_two_serial():
    """Interactively compute a consensus similarity matrix between two lists.

    Prompts on stdin for two CSV paths; each file must have a ``smiles``
    column.  Eight fingerprints are computed per molecule (RDKit topological,
    MACCS, atom pairs, topological torsions, and Morgan bit vectors with
    radius 2/3, with and without features).  For every pair (i from list 1,
    j from list 2) the individual similarities are combined with fixed
    weights and the result is divided by the constant ``maxs``.

    Returns:
        pandas.DataFrame of shape (len(list 1), len(list 2)).  Rows/columns
        that belong to SMILES which could not be parsed are filled with 10.
    """
    # Load data
    path1 = input("Path for list 1: ")
    path2 = input("Path for list 2: ")
    smis1 = pd.read_csv(path1)["smiles"]
    smis2 = pd.read_csv(path2)["smiles"]
    l1 = len(smis1)
    l2 = len(smis2)
    l = l1 * l2
    # Progress is reported about every 5 % of the pairs.  For small inputs
    # round(l / 20) is 0, which used to make ``k % lp`` divide by zero.
    lp = max(1, round(l / 20))

    def parse(smiles, failure_note):
        """Parse SMILES; keep None placeholders so positions stay aligned."""
        mols, bad = [], []
        for idx, smi in enumerate(smiles):
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                print('smile with number:', idx, failure_note)
                bad.append(idx)
            mols.append(mol)
        return mols, bad

    molecules1, bad1 = parse(
        smis1, 'in list 1 could not be converted to molecule')
    molecules2, bad2 = parse(
        smis2, 'in list 2 could not be converted to molecule')

    # Final output matrix
    similarity = np.zeros(shape=(l1, l2), dtype=np.float32)

    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    makers = {
        "topol": FingerprintMols.FingerprintMol,
        "maccs": MACCSkeys.GenMACCSKeys,
        "pairs": Pairs.GetAtomPairFingerprint,
        "tts": Torsions.GetTopologicalTorsionFingerprintAsIntVect,
        "ecfp4": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 2, nBits=1024),
        "ecfp6": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 3, nBits=1024),
        "fcfp4": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 2, nBits=1024, useFeatures=True),
        "fcfp6": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 3, nBits=1024, useFeatures=True),
    }

    def fingerprints(mols):
        """Return {name: [fingerprint or None per input position]}."""
        return {
            name: [None if x is None else make(x) for x in mols]
            for name, make in makers.items()
        }

    print('Begining fingerprint calculation...wait')
    fps1 = fingerprints(molecules1)
    print('Begining fingerprint calculation...50%')
    fps2 = fingerprints(molecules2)
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')
    # Sets make the per-pair "is this index bad?" test O(1).
    bad_rows = set(bad1)
    bad_cols = set(bad2)
    bit_sim = DataStructs.FingerprintSimilarity
    dice_sim = DataStructs.DiceSimilarity
    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i in range(l1):
        for j in range(l2):
            if i not in bad_rows and j not in bad_cols:
                similarities_topol = bit_sim(fps1["topol"][i],
                                             fps2["topol"][j])
                similarities_maccs = bit_sim(fps1["maccs"][i],
                                             fps2["maccs"][j])
                similarities_pairs = dice_sim(fps1["pairs"][i],
                                              fps2["pairs"][j])
                similarities_tts = dice_sim(fps1["tts"][i], fps2["tts"][j])
                similarities_ecfp4 = bit_sim(fps1["ecfp4"][i],
                                             fps2["ecfp4"][j])
                similarities_ecfp6 = bit_sim(fps1["ecfp6"][i],
                                             fps2["ecfp6"][j])
                similarities_fcfp4 = bit_sim(fps1["fcfp4"][i],
                                             fps2["fcfp4"][j])
                similarities_fcfp6 = bit_sim(fps1["fcfp6"][i],
                                             fps2["fcfp6"][j])
                similarity[i][j] = (
                    0.5 *
                    (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
                    0.5 *
                    (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
                    0.5 *
                    (similarities_tts / 0.7 + similarities_pairs / 0.7) +
                    similarities_maccs / 0.85 +
                    similarities_topol / 0.75) / 5
            k = k + 1
            if k % lp == 0:
                print('running:', (k / l) * 100, '%')
    # for other similarity metrics use for example
    # DataStructs.FingerprintSimilarity(fps[0],fps[1], metric=DataStructs.DiceSimilarity)
    similarity = similarity / maxs
    # 10 lies outside the range of real scores and flags unparsable inputs.
    similarity[bad1, :] = 10
    similarity[:, bad2] = 10
    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
def sim_one_serial():
    """Interactively compute the all-against-all consensus similarity matrix.

    Prompts on stdin for one CSV path; the file must have a ``smiles``
    column.  Eight fingerprints are computed per molecule (RDKit topological,
    MACCS, atom pairs, topological torsions, and Morgan bit vectors with
    radius 2/3, with and without features).  Only pairs with ``i >= j`` are
    evaluated and mirrored, then the matrix is divided by the constant
    ``maxs``.

    Returns:
        Square pandas.DataFrame with one row/column per input SMILES.
        Rows/columns of SMILES that could not be parsed are filled with 10.
    """
    # Load data
    path = input("Path for list : ")
    smis = pd.read_csv(path)["smiles"]
    l = len(smis)
    # Progress step; round(l * l / 20) is 0 for very small inputs, which
    # used to make ``k % lp`` divide by zero.
    lp = max(1, round(l * l / 20))

    # Get molecules from smiles; None placeholders keep positions aligned.
    bad = []
    molecules = []
    for i, smi in enumerate(smis):
        m = Chem.MolFromSmiles(smi)
        if m is None:
            print('smile with number:', i,
                  'in list could not be converted to molecule')
            bad.append(i)
        molecules.append(m)

    # Final output matrix
    similarity = np.zeros(shape=(l, l), dtype=np.float32)

    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    makers = {
        "topol": FingerprintMols.FingerprintMol,
        "maccs": MACCSkeys.GenMACCSKeys,
        "pairs": Pairs.GetAtomPairFingerprint,
        "tts": Torsions.GetTopologicalTorsionFingerprintAsIntVect,
        "ecfp4": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 2, nBits=1024),
        "ecfp6": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 3, nBits=1024),
        "fcfp4": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 2, nBits=1024, useFeatures=True),
        "fcfp6": lambda x: AllChem.GetMorganFingerprintAsBitVect(
            x, 3, nBits=1024, useFeatures=True),
    }

    print('Begining fingerprint calculation...wait')
    fps = {
        name: [None if x is None else make(x) for x in molecules]
        for name, make in makers.items()
    }
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')
    # A set makes the per-pair "is this index bad?" test O(1).
    bad_set = set(bad)
    bit_sim = DataStructs.FingerprintSimilarity
    dice_sim = DataStructs.DiceSimilarity
    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i in range(l):
        # Lower triangle (including the diagonal); the result is mirrored.
        for j in range(i + 1):
            if i not in bad_set and j not in bad_set:
                similarities_topol = bit_sim(fps["topol"][i], fps["topol"][j])
                similarities_maccs = bit_sim(fps["maccs"][i], fps["maccs"][j])
                similarities_pairs = dice_sim(fps["pairs"][i],
                                              fps["pairs"][j])
                similarities_tts = dice_sim(fps["tts"][i], fps["tts"][j])
                similarities_ecfp4 = bit_sim(fps["ecfp4"][i], fps["ecfp4"][j])
                similarities_ecfp6 = bit_sim(fps["ecfp6"][i], fps["ecfp6"][j])
                similarities_fcfp4 = bit_sim(fps["fcfp4"][i], fps["fcfp4"][j])
                similarities_fcfp6 = bit_sim(fps["fcfp6"][i], fps["fcfp6"][j])
                similarity[i][j] = (
                    0.5 *
                    (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
                    0.5 *
                    (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
                    0.5 *
                    (similarities_tts / 0.7 + similarities_pairs / 0.7) +
                    similarities_maccs / 0.85 +
                    similarities_topol / 0.75) / 5
                similarity[j][i] = similarity[i][j]
            k = k + 1
            if k % lp == 0:
                print('running:', (k / (l * l / 2)) * 100, '%')
    # for other similarity metrics use for example
    # DataStructs.FingerprintSimilarity(fps[0],fps[1], metric=DataStructs.DiceSimilarity)
    similarity = similarity / maxs
    # 10 lies outside the range of real scores and flags unparsable inputs.
    similarity[bad, :] = 10
    similarity[:, bad] = 10
    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
def fp_torsion_taut_mp(taut, i, k, nBits, chiral):
    """Return ``(i, fingerprint, k)`` for one tautomer.

    The fingerprint is the hashed topological-torsion fingerprint of ``taut``
    built with ``nBits`` and ``includeChirality=chiral``.  ``i`` and ``k``
    are handed back unchanged so the caller can place the result.
    """
    options = {"nBits": nBits, "includeChirality": chiral}
    tautomer_fp = Torsions.GetHashedTopologicalTorsionFingerprint(
        taut, **options)
    return (i, tautomer_fp, k)
def BuildTorsionsFP(mol):
    """Return the topological-torsion count fingerprint of ``mol``.

    The fingerprint's total value is additionally stored on the returned
    object as the ``_sumCache`` attribute.
    """
    from rdkit.Chem.AtomPairs import Torsions

    torsion_fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
    total = torsion_fp.GetTotalVal()
    torsion_fp._sumCache = total
    return torsion_fp
def Calc_Torsions(self):
    """Return topological-torsion count fingerprints for every item of ``self.sd``.

    The result list follows the iteration order of ``self.sd``.
    """
    fingerprints = []
    for molecule in self.sd:
        fingerprints.append(
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(molecule))
    return fingerprints
'/drug_development/studyRdkit/st_rdcit/img/mol21.jpg' ) pairFps = [Pairs.GetAtomPairFingerprint(x) for x in ms] print(pairFps) # 由于包含在原子对指纹中的位空间很大,因此他们以稀疏的方式存储为字典形式 d = pairFps[-1].GetNonzeroElements() print(d) # {541732: 1, 558113: 2, 558115: 2, 558146: 1, 1606690: 2, 1606721: 2} print(d[541732]) # 1 # 位描述也可以像如下所示展示 de = Pairs.ExplainPairScore(558115) print(de) # (('C', 1, 0), 3, ('C', 2, 0)) # The above means: C with 1 neighbor and 0 pi electrons which is 3 bonds from a C with 2 neighbors and 0 pi electrons # 碳带有一个邻位孤电子和0个π电子,这是因为碳与两个邻位原子和氧原子形成3个化学键。 # # 2.4 拓扑扭曲topological torsions tts = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in ms] d_ds = DataStructs.DiceSimilarity(tts[0], tts[1]) print(d_ds) # 0.16666666666666666 # # 2.5 摩根指纹(圆圈指纹)AllChem.GetMorganFingerprint(mol,2) # 通过将Morgan算法应用于一组用户提供的原子不变式,可以构建这一系列的指纹。生成Morgan指纹时,还必须提供指纹的半径 m1 = Chem.MolFromSmiles('Cc1ccccc1') m2 = Chem.MolFromSmiles('Cc1ncccc1') fp1 = AllChem.GetMorganFingerprint(m1, 2) fp2 = AllChem.GetMorganFingerprint(m2, 2) d_mf = DataStructs.DiceSimilarity(fp1, fp2) print(d_mf) # 0.55 # Morgan指纹像原子对和拓扑扭转一样,默认情况系按使用计数,但有也可以将他们计算为位向量 fp1 = AllChem.GetMorganFingerprintAsBitVect(m1, 2, nBits=1024) fp2 = AllChem.GetMorganFingerprintAsBitVect(m2, 2, nBits=1024)
similarities_pairs[i][j] = 1 if i % 500 == 0: print('running:', i / len(fps_pairs) * 100, '%') # In[ ]: df = pd.DataFrame(similarities_pairs) df.to_csv('similarities_pairs.csv') # ### Topological torsion descriptors # In[ ]: from rdkit.Chem.AtomPairs import Torsions fps_tts = [ Torsions.GetTopologicalTorsionFingerprintAsIntVect(x) for x in molecules ] similarities_tts = np.zeros(shape=((len(fps_tts), len(fps_tts)))) # In[ ]: #compute similarities. Comment this section if only the fingerprints are needed for i in range(len(fps_tts)): for j in range(len(fps_tts)): if i > j: similarities_tts[i][j] = DataStructs.DiceSimilarity( fps_tts[i], fps_tts[j]) #default is the Dice similarity for these fps similarities_tts[j][i] = similarities_tts[i][j] elif i == j: similarities_tts[i][j] = 1
) fpdict["lecfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=longbits ) fpdict["lecfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=longbits ) fpdict["lfcfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=longbits ) fpdict["lfcfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=longbits ) fpdict["maccs"] = lambda m: MACCSkeys.GenMACCSKeys(m) fpdict["ap"] = lambda m: Pairs.GetAtomPairFingerprint(m) fpdict["tt"] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m) fpdict[ "hashap" ] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( m, nBits=nbits ) fpdict[ "hashap_cas_length" ] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( m, nBits=n_cas_bits ) fpdict[ "hashtt" ] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( m, nBits=nbits )