def score_model(self, model_configuration: dict, fragments_file: str, descriptors_file: str, output_file: str):
    """Score each molecule in ``fragments_file`` against the active molecules.

    For every JSON line of ``fragments_file`` (keys ``"smiles"`` and
    ``"name"`` are read) the atom-pair fingerprint of the molecule is
    compared with the atom-pair fingerprint of every SMILES listed under
    ``model_configuration["data"]["active"]``; the highest Tanimoto
    similarity becomes the score.  Results are written to ``output_file``
    as one JSON object per line (``{"name": ..., "score": ...}``) with no
    trailing newline.

    ``descriptors_file`` is accepted but not used by this method.
    """
    inputoutput_utils.create_parent_directory(output_file)

    # Fingerprints of the reference (active) molecules; surrounding double
    # quotes are stripped from each SMILES before parsing.
    active_fingerprints = [
        Pairs.GetAtomPairFingerprint(
            Chem.MolFromSmiles(active_smiles.strip("\"")))
        for active_smiles in model_configuration["data"]["active"]
    ]

    with open(output_file, "w", encoding="utf-8") as output_stream, \
            open(fragments_file, "r", encoding="utf-8") as input_stream:
        for position, raw_line in enumerate(input_stream):
            record = json.loads(raw_line)
            candidate = Chem.MolFromSmiles(record["smiles"].strip("\""))
            candidate_fingerprint = Pairs.GetAtomPairFingerprint(candidate)
            best_similarity = max(
                DataStructs.TanimotoSimilarity(candidate_fingerprint,
                                               active_fingerprint)
                for active_fingerprint in active_fingerprints)
            result = {"name": record["name"], "score": best_similarity}
            # Separate records with a newline, but do not emit a leading
            # or trailing one.
            if position > 0:
                output_stream.write("\n")
            json.dump(result, output_stream)
def caculate_similarity_atomPairs(smiles_A, smiles_B):
    """Return the Dice similarity of two SMILES using atom-pair fingerprints.

    Parameters:
        smiles_A: SMILES string of the first molecule.
        smiles_B: SMILES string of the second molecule.

    Returns:
        The Dice similarity rounded to 4 decimals, or ``-1`` when either
        SMILES cannot be parsed or the fingerprint/similarity step fails.
    """
    try:
        m1 = Chem.MolFromSmiles(smiles_A)
        m2 = Chem.MolFromSmiles(smiles_B)
        # MolFromSmiles signals an unparsable SMILES by returning None.
        if m1 is None or m2 is None:
            return -1
        p1 = Pairs.GetAtomPairFingerprint(m1)
        p2 = Pairs.GetAtomPairFingerprint(m2)
        return round(DataStructs.DiceSimilarity(p1, p2), 4)
    except Exception:
        # Keep the best-effort "-1 on failure" contract, but no longer
        # swallow KeyboardInterrupt / SystemExit as the bare except did.
        return -1
def sim_rdk_topo_fps(smiA, smisT):
    """Compare one SMILES against a list of SMILES with atom-pair fingerprints.

    smiA is a single SMILES string and smisT a list of SMILES strings; both
    are converted with getMolFromSmiles.  The result is a list holding the
    Dice similarity between smiA and each entry of smisT, in input order.
    """
    query_fp = Pairs.GetAtomPairFingerprint(getMolFromSmiles(smiA))

    # All target fingerprints are built before any similarity is computed.
    target_fps = []
    for target_smiles in smisT:
        target_fps.append(
            Pairs.GetAtomPairFingerprint(getMolFromSmiles(target_smiles)))

    return [DataStructs.DiceSimilarity(query_fp, target_fp)
            for target_fp in target_fps]
def atom_pairs(self):
    """Build an atom-pair count matrix for the SMILES in ``self.data.SMILES``.

    Each molecule is parsed, stripped of explicit hydrogens and turned into
    an atom-pair count fingerprint.  The columns are the sorted union of
    all atom-pair codes seen in any molecule.

    Returns:
        A list with one 1-D integer ``numpy`` array per molecule; entry
        ``k`` is the count of the ``k``-th atom-pair code (0 if absent).
    """
    molecules = [Chem.MolFromSmiles(smiles) for smiles in self.data.SMILES]

    # Sparse {atom-pair code: count} dictionary per molecule.
    counts_per_molecule = [
        Pairs.GetAtomPairFingerprint(Chem.RemoveHs(mol)).GetNonzeroElements()
        for mol in molecules
    ]

    # Column order: every code present anywhere, sorted.
    all_codes = sorted(set().union(*counts_per_molecule))

    # Look each count up by its code so the values cannot be misaligned
    # with the columns, whatever order the dictionaries iterate in.
    return [
        np.array([counts.get(code, 0) for code in all_codes], dtype=int)
        for counts in counts_per_molecule
    ]
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) of ``self.smiclean``.

    ``typeFP`` selects one of "Mol", "MACCS", "pairs", "Torsion", "Morgan",
    or "All" for every one of them.  The molecule parsed from
    ``self.smiclean`` is stored in ``self.mol`` and the fingerprints in the
    dictionary ``self.FP`` keyed by fingerprint name.

    Returns 1 (after appending a message to ``self.log``) when the instance
    has no ``smiclean`` attribute, otherwise 0.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem import AllChem

    if "smiclean" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        return 1

    self.mol = Chem.MolFromSmiles(self.smiclean)

    # (name, builder) pairs, in the order the fingerprints are stored.
    builders = (
        ("Mol", lambda mol: FingerprintMols.FingerprintMol(mol)),
        ("MACCS", lambda mol: MACCSkeys.GenMACCSKeys(mol)),
        ("pairs", lambda mol: Pairs.GetAtomPairFingerprint(mol)),
        ("Torsion",
         lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol)),
        ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
    )

    computed = {}
    for name, build in builders:
        if typeFP == name or typeFP == "All":
            computed[name] = build(self.mol)

    self.FP = computed
    return 0
def extract_atompair_fragments(molecule: object) -> list:
    """List the atom-pair fragments of ``molecule`` as dictionaries.

    For every atom-pair code in the molecule's atom-pair fingerprint the
    code is decoded with ``Pairs.ExplainPairScore`` and a fragment record
    is emitted once per occurrence (the fingerprint count).

    Each record has the keys:
        "smiles": the two decoded atom symbols concatenated,
        "index":  a key recomputed from both atoms' atomic number,
                  pi-bond count and neighbour count plus the distance,
        "type":   always "AP",
        "size":   decoded distance + 1.
    """
    output = []
    pair_counts = Pairs.GetAtomPairFingerprint(molecule).GetNonzeroElements()
    for pair, count in pair_counts.items():
        # Decode once per pair instead of once per field.
        explained = Pairs.ExplainPairScore(pair)
        first, second = explained[0], explained[2]

        atom1 = rdkit.Chem.AtomFromSmarts(first[0])
        atom2 = rdkit.Chem.AtomFromSmarts(second[0])
        smiles = first[0] + second[0]

        # Per-atom value: 64 * atomic number + 16 * pi bonds + neighbours.
        atom1_property_value = (64 * atom1.GetAtomicNum() + 16 * first[2] +
                                first[1])
        atom2_property_value = (64 * atom2.GetAtomicNum() + 16 * second[2] +
                                second[1])

        dist = explained[1] + 1
        # Order-independent key: smaller atom value in the low digits,
        # then the larger one, then the distance (base 1024).
        atom_pair_key = min(atom1_property_value, atom2_property_value) + \
            1024 * (max(atom1_property_value, atom2_property_value) +
                    1024 * dist)

        # One independent dictionary per occurrence of the pair.
        output.extend(
            {
                "smiles": smiles,
                "index": atom_pair_key,
                "type": "AP",
                "size": dist
            } for _ in range(count))
    return output
def atom_pairs():
    """Demonstrate atom-pair fingerprints on three small example molecules.

    Prints, in order: the non-zero elements of the last molecule's count
    fingerprint, the explanation of pair score 558115, the Dice similarity
    of the first two count fingerprints, and the Dice similarity of the
    first two bit-vector fingerprints.
    """
    example_smiles = ('C1CCC1OCC', 'CC(C)OCC', 'CCOCC')
    molecules = [Chem.MolFromSmiles(smiles) for smiles in example_smiles]

    # Count-based atom-pair fingerprints.
    count_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in molecules]

    # Bits and their counts for the last molecule, as a dictionary.
    print(count_fps[-1].GetNonzeroElements())

    # Human-readable explanation of a single pair score.
    print(Pairs.ExplainPairScore(558115))

    # Dice is the usual similarity metric for atom-pair fingerprints.
    print(DataStructs.DiceSimilarity(count_fps[0], count_fps[1]))

    # Same comparison with the bit-vector (no counts) variant.
    bit_fps = [Pairs.GetAtomPairFingerprintAsBitVect(mol) for mol in molecules]
    print(DataStructs.DiceSimilarity(bit_fps[0], bit_fps[1]))
def _atomsFingerprintsClustering(rdkit_mols):
    """
    Returns the matrix obtained by applying DiceDistances to the atom-pair
    fingerprint of every molecule against all fingerprints.

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    dicematrix: np.array
        The numpy array built from one DiceDistances result per molecule
    """
    from rdkit.Chem.AtomPairs import Pairs  # Atom pairs

    # Fingerprint every molecule, with a progress bar.
    fingerprints = [Pairs.GetAtomPairFingerprint(mol)
                    for mol in tqdm(rdkit_mols)]

    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run_jobs = executor(total=len(fingerprints),
                        desc='AtomsFingerprints Distance')
    # One delayed job per fingerprint, each compared against the full list.
    rows = run_jobs(delayed(DiceDistances)(fingerprint, fingerprints)
                    for fingerprint in fingerprints)
    return np.array(rows)
def findCluster(self, smiles):
    """Assign ``smiles`` to a scaffold cluster.

    The Murcko scaffold of the molecule is computed; its non-isomeric
    SMILES is the candidate cluster name and its atom-pair fingerprint is
    compared (bulk Tanimoto) with the fingerprints returned by
    ``self.getFingerprints()``.

    Returns:
        A tuple ``(cluster, fingerprint, is_new)``:
        * ``("", "", False)`` when the SMILES or scaffold cannot be built;
        * ``(cluster, fp, False)`` when the scaffold is already known;
        * ``(closest_known_cluster, fp, False)`` when the most similar
          known fingerprint reaches ``self.minsimilarity``;
        * ``(cluster, fp, True)`` otherwise (a new cluster).
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return "", "", False

    try:
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
    except Exception:
        # Scaffold extraction failed; report "no cluster" but let
        # KeyboardInterrupt / SystemExit propagate.
        return "", "", False
    if not scaffold:
        return "", "", False

    cluster = Chem.MolToSmiles(scaffold, isomericSmiles=False)
    fp = Pairs.GetAtomPairFingerprint(scaffold)  # Change to Tanimoto?

    # Fetch the mapping once so names and fingerprints come from the same
    # snapshot.  NOTE(review): assumes getFingerprints() has no side
    # effects that callers rely on being triggered three times.
    known = self.getFingerprints()
    if cluster in known:
        return cluster, fp, False

    known_names = list(known.keys())
    sims = DataStructs.BulkTanimotoSimilarity(fp, list(known.values()))
    if len(sims) == 0:
        return cluster, fp, True

    closest = np.argmax(sims)
    if sims[closest] >= self.minsimilarity:
        return known_names[closest], fp, False
    return cluster, fp, True
def atom_pairs_similarity(active_molecules1, test_molecules):
    """Return, per test molecule, its best Dice similarity to any active.

    Both arguments are iterables of molecules accepted by
    ``Pairs.GetAtomPairFingerprint``.  The result has one entry per test
    molecule: the largest Dice similarity against the active molecules,
    or 0 when no similarity exceeds 0 (including when there are no
    active molecules).
    """
    active_fps = [Pairs.GetAtomPairFingerprint(mol)
                  for mol in active_molecules1]
    test_fps = [Pairs.GetAtomPairFingerprint(mol) for mol in test_molecules]

    best_per_test = []
    for test_fp in test_fps:
        # The leading 0 is the floor: it wins ties and covers the
        # "no active molecules" case.
        candidates = [0] + [DataStructs.DiceSimilarity(test_fp, active_fp)
                            for active_fp in active_fps]
        best_per_test.append(max(candidates))
    return best_per_test
def testPairsRegression(self):
    """Check atom-pair fingerprints against the pickled reference set.

    Loads ``mols1000.aps.pkl.gz`` from ``self.testDataPath`` and, for every
    molecule in ``self.mols``, asserts that its atom-pair fingerprint
    equals the stored fingerprint at the same index and differs from the
    one at the previous index.
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.aps.pkl.gz')
    # Close the gzip handle deterministically instead of leaking it.
    with gzip.open(reference_path, 'rb') as inF:
        atomPairs = cPickle.load(inF, encoding='bytes')

    for i, m in enumerate(self.mols):
        ap = Pairs.GetAtomPairFingerprint(m)
        if ap != atomPairs[i]:  # pragma: nocover
            # Dump the differences before the assertion below fails.
            debugFingerprint(m, ap, atomPairs[i])
        self.assertEqual(ap, atomPairs[i])
        # For i == 0 this compares against the last reference entry.
        self.assertNotEqual(ap, atomPairs[i - 1])
def orng_sim_rdk_atompair_fps(smile_active, train_instance):
    """Dice similarity between a SMILES and the SMILES of a data instance.

    smile_active is a SMILES string; train_instance is a data instance
    whose SMILES attribute name is found with getSMILESAttr.  Both are
    turned into molecules with getMolFromSmiles and compared through their
    atom-pair fingerprints.

    Returns the similarity value, or None when the instance has no SMILES
    attribute or either molecule could not be created.
    """
    smiles_attr = getSMILESAttr(train_instance)
    if not smiles_attr:
        return None

    train_smiles = str(train_instance[smiles_attr].value)

    # Both conversions are attempted before either result is checked.
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    if not active_mol or not train_mol:
        return None

    active_fp = Pairs.GetAtomPairFingerprint(active_mol)
    train_fp = Pairs.GetAtomPairFingerprint(train_mol)
    return DataStructs.DiceSimilarity(active_fp, train_fp)
def get_similarity():
    """Compare the first compound of the group with all the others.

    Reads the enclosing-scope names ``mols``, ``compounds``, ``radius`` and
    ``bit_size``.  The first molecule is the reference; for every other
    molecule the Dice similarity to the reference is computed with four
    fingerprints: atom pairs, Morgan bit vector, Morgan counts and
    feature-based Morgan bit vector.

    Side effect: the first element is deleted from both ``compounds`` and
    ``mols``.

    Returns:
        One text line per remaining compound, all concatenated:
        ``"<reference> <compound> <ap> <morgan> <count morgan> <feature
        morgan>\\n"``.
    """
    # Fingerprints of the reference compound.
    ref_morgan2 = AllChem.GetMorganFingerprintAsBitVect(
        mols[0], radius, bit_size)
    ref_cmorgan2 = AllChem.GetMorganFingerprint(mols[0], radius)
    ref_fmorgan2 = AllChem.GetMorganFingerprintAsBitVect(
        mols[0], radius, bit_size, useFeatures=True)
    ref_ap = Pairs.GetAtomPairFingerprint(mols[0])

    # Remove the reference compound from both lists.
    # NOTE(review): the reference name is not rstrip()-ed while the other
    # compound names are - confirm it carries no trailing newline.
    reference = compounds[0]
    del compounds[0]
    del mols[0]

    lines = []
    for i, m in enumerate(mols):
        similarities = (
            DataStructs.DiceSimilarity(ref_ap, Pairs.GetAtomPairFingerprint(m)),
            DataStructs.DiceSimilarity(
                ref_morgan2,
                AllChem.GetMorganFingerprintAsBitVect(m, radius, bit_size)),
            DataStructs.DiceSimilarity(
                ref_cmorgan2, AllChem.GetMorganFingerprint(m, radius)),
            DataStructs.DiceSimilarity(
                ref_fmorgan2,
                AllChem.GetMorganFingerprintAsBitVect(
                    m, radius, bit_size, useFeatures=True)),
        )
        fields = [str(reference), str(compounds[i].rstrip())]
        fields.extend(str(value) for value in similarities)
        lines.append(' '.join(fields) + '\n')

    # Join once instead of growing a string inside the loop.
    return ''.join(lines)
def getCountInfo(m, fpType):
    """Return the non-zero element counts of a fingerprint of ``m``.

    ``fpType`` selects the fingerprint:
      * 'AtomPair' (exact) or 'atom' (any case): atom-pair fingerprint;
      * 'morgan' or 'circular' (any case): Morgan fingerprint, radius 2;
      * 'Topological' (exact) or 'topo' (any case): topological torsion
        fingerprint, with the keys converted to ``int``.

    Any other value yields ``None``.
    """
    kind = fpType.lower()

    if fpType == 'AtomPair' or kind == 'atom':
        return Pairs.GetAtomPairFingerprint(m).GetNonzeroElements()

    if kind in ('morgan', 'circular'):
        return AllChem.GetMorganFingerprint(m, 2).GetNonzeroElements()

    if fpType == 'Topological' or kind == 'topo':
        torsion_counts = Torsions.GetTopologicalTorsionFingerprint(
            m).GetNonzeroElements()
        # Re-key with plain ints.
        return {int(code): count for code, count in torsion_counts.items()}

    return None
def Atompair_fp(mol, rc_names):
    """Cluster molecules by atom-pair Dice similarity into a dendrogram dict.

    Parameters:
        mol: iterable of molecules to fingerprint.
        rc_names: labels, one per molecule, used as row/column names.

    A square similarity table is filled with the pairwise Dice
    similarities, clustered with Ward linkage, converted to a tree and
    then into a nested dictionary via ``add_node`` / ``label_tree``.

    Returns:
        The dictionary ``{"children": [...], "name": " "}`` populated by
        ``add_node`` and labelled by ``label_tree``.
    """
    fp = [Pairs.GetAtomPairFingerprint(x) for x in mol]

    tc_df = pd.DataFrame(index=rc_names, columns=rc_names).fillna(0)
    for c1, fp_c1 in enumerate(fp):
        tc_df[rc_names[c1]] = [
            DataStructs.DiceSimilarity(fp_c1, fp_c2) for fp_c2 in fp
        ]

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and also exists in older pandas releases.
    clusters = linkage(tc_df.values, "ward")
    clust_tree = to_tree(clusters, rd=False)

    d3Dendro = dict(children=[], name=" ")
    add_node(clust_tree, d3Dendro)
    label_tree(d3Dendro["children"][0], rc_names)
    return d3Dendro
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) of ``self.mol``.

    ``typeFP`` is one of "Mol", "MACCS", "pairs", "Torsion", "Morgan", or
    "All" for every one of them.  The fingerprints are stored in the
    dictionary ``self.d_FP`` keyed by fingerprint name.

    When the instance has no ``mol`` attribute, a message is appended to
    ``self.log``, ``self.err`` is set to 1 and nothing is computed.
    """
    if "mol" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        self.err = 1
        return

    want_all = typeFP == "All"
    fingerprints = {}
    if typeFP == "Mol" or want_all:
        fingerprints["Mol"] = FingerprintMols.FingerprintMol(self.mol)
    if typeFP == "MACCS" or want_all:
        fingerprints["MACCS"] = MACCSkeys.GenMACCSKeys(self.mol)
    if typeFP == "pairs" or want_all:
        fingerprints["pairs"] = Pairs.GetAtomPairFingerprint(self.mol)
    if typeFP == "Torsion" or want_all:
        fingerprints["Torsion"] = \
            Torsions.GetTopologicalTorsionFingerprint(self.mol)
    if typeFP == "Morgan" or want_all:
        fingerprints["Morgan"] = AllChem.GetMorganFingerprint(self.mol, 2)
    self.d_FP = fingerprints
def compare_structure(smiles1, smiles2, fp_type="Morgan", sim_type="Dice"):
    """
    Task:
        Compare structural similarity of two compounds based on fingerprints.

    Parameters:
        smiles1: str, smiles of the compound 1
        smiles2: str, smiles of the compound 2
        fp_type: str, type of fingerprints; one of "Morgan",
            "MorganWithFeature", "MACCS", "Topological", "AtomPairs"
        sim_type: str, method for calculating similarity; one of "Dice",
            "Tanimoto", "Cosine", "Sokal", "Russel"

    Returns:
        The similarity value, or -1 when fp_type or sim_type is not
        recognised or the fingerprint/similarity computation fails.
    """
    # Builders are lambdas so nothing is evaluated until a type is chosen.
    fingerprint_builders = {
        "Morgan": lambda mol: AllChem.GetMorganFingerprint(
            mol, 2, useFeatures=False),
        "MorganWithFeature": lambda mol: AllChem.GetMorganFingerprint(
            mol, 2, useFeatures=True),
        "MACCS": lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol),
        "Topological": lambda mol: FingerprintMols.FingerprintMol(mol),
        "AtomPairs": lambda mol: Pairs.GetAtomPairFingerprint(mol),
    }
    # sim_type -> name of the DataStructs similarity function.
    metric_names = {
        "Dice": "DiceSimilarity",
        "Tanimoto": "TanimotoSimilarity",
        "Cosine": "CosineSimilarity",
        "Sokal": "SokalSimilarity",
        "Russel": "RusselSimilarity",
    }

    build_fp = fingerprint_builders.get(fp_type)
    metric_name = metric_names.get(sim_type)
    if build_fp is None or metric_name is None:
        # Previously an unknown sim_type left the result unbound and
        # raised UnboundLocalError; use the same -1 sentinel as failures.
        return -1

    try:
        fp1 = build_fp(Chem.MolFromSmiles(smiles1))
        fp2 = build_fp(Chem.MolFromSmiles(smiles2))
        return getattr(DataStructs, metric_name)(fp1, fp2)
    except Exception:
        return -1
def testPairsRegression(self):
    """Check atom-pair fingerprints against the pickled reference set.

    Loads ``mols1000.aps.pkl.gz`` from ``self.testDataPath`` and, for every
    molecule in ``self.mols``, asserts that its atom-pair fingerprint
    equals the stored fingerprint at the same index and differs from the
    one at the previous index.
    """
    reference_path = os.path.join(self.testDataPath, 'mols1000.aps.pkl.gz')
    # Close the gzip handle deterministically instead of leaking it.
    with gzip.open(reference_path, 'rb') as inF:
        atomPairs = cPickle.load(inF, encoding='bytes')

    for i, m in enumerate(self.mols):
        ap = Pairs.GetAtomPairFingerprint(m)
        # assertEqual / assertNotEqual report both operands on failure.
        self.assertEqual(ap, atomPairs[i])
        # For i == 0 this compares against the last reference entry.
        self.assertNotEqual(ap, atomPairs[i - 1])
def CalculateAtomPairsFingerprint(mol: Chem.Mol, rtype: str = 'countstring', bits: int = 2048) -> Tuple[str, dict, Any]:
    """Calculate atom pairs fingerprints.

    NOTE(review): the return annotation is kept as-is for interface
    stability, but exactly one of the listed types is returned, not a tuple.

    :param mol: molecule to fingerprint
    :param rtype: Type of output, may either be:
                  countstring (default), returns the folded counts as a
                                         ';'-separated string of integers
                  rdkit, return the native rdkit DataStructs
                  dict, for a dict ``{'AtomPair_<code>': count}`` of the
                        non-zero elements
    :param bits: Number of folded bits (ignored if rtype != 'countstring')
    """
    res = Pairs.GetAtomPairFingerprint(mol)
    if rtype == 'rdkit':
        return res
    counts = res.GetNonzeroElements()
    if rtype == 'dict':
        return {f'AtomPair_{k}': v for k, v in counts.items()}
    # Integer accumulator: counts are whole numbers and str.join needs text.
    folded = np.zeros(bits, dtype=int)
    for k, v in counts.items():
        # Fold the sparse code space onto ``bits`` positions.
        folded[k % bits] += v
    return ';'.join(map(str, folded.tolist()))
def CalculateAtomPairsFingerprint(mol):
    """
    #################################################################
    Calculate atom pairs fingerprints

    Usage:

        result=CalculateAtomPairsFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a tuple of three items.

        The first is the length reported by the fingerprint.

        The second is a dict of the fingerprint's non-zero elements.

        The third is the fingerprint object itself, which can be used
        for calculating similarity.
    #################################################################
    """
    fingerprint = Pairs.GetAtomPairFingerprint(mol)
    length = fingerprint.GetLength()
    nonzero_elements = fingerprint.GetNonzeroElements()
    return length, nonzero_elements, fingerprint
def get_smiles_similarity(smiles1, smiles2, similarity="fingerprint"):
    """Return the structural similarity of two SMILES strings.

    Parameters:
        smiles1, smiles2: SMILES strings to compare.
        similarity: "fingerprint" for Tanimoto similarity of RDKit
            topological fingerprints, or "atom" for Dice similarity of
            atom-pair fingerprints.

    Returns:
        The similarity value, or None when either SMILES is empty or
        cannot be parsed.

    Raises:
        ValueError: if ``similarity`` is not one of the supported modes
            (previously this surfaced as an UnboundLocalError).
    """
    if len(smiles1) == 0 or len(smiles2) == 0:
        return None
    if similarity not in ("fingerprint", "atom"):
        raise ValueError("Unknown similarity mode: %r" % (similarity,))

    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem.AtomPairs import Pairs

    ms = [Chem.MolFromSmiles(smiles1), Chem.MolFromSmiles(smiles2)]
    # MolFromSmiles returns None for SMILES it cannot parse.
    if any(m is None for m in ms):
        return None

    if similarity == "fingerprint":
        fps = [FingerprintMols.FingerprintMol(x) for x in ms]
        return DataStructs.FingerprintSimilarity(
            fps[0], fps[1], metric=DataStructs.TanimotoSimilarity)

    pairFps = [Pairs.GetAtomPairFingerprint(x) for x in ms]
    return DataStructs.DiceSimilarity(pairFps[0], pairFps[1])
similarities_maccs[i][j] = 1 #for other similarity metrics use for example DataStructs.FingerprintSimilarity(fps[0],fps[1], metric=DataStructs.DiceSimilarity) if i % 500 == 0: print('running:', i / len(fps_maccs) * 100, '%') # In[ ]: df = pd.DataFrame(similarities_maccs) df.to_csv('similarities_maccs.csv') # ### Atom pairs fingerprints # In[ ]: from rdkit.Chem.AtomPairs import Pairs fps_pairs = [Pairs.GetAtomPairFingerprint(x) for x in molecules] similarities_pairs = np.zeros(shape=((len(fps_pairs), len(fps_pairs)))) # In[ ]: #compute similarities. Comment this section if only the fingerprints are needed for i in range(len(fps_pairs)): for j in range(len(fps_pairs)): if i > j: similarities_pairs[i][j] = DataStructs.DiceSimilarity( fps_pairs[i], fps_pairs[j]) #default is the Dice similarity for these fps similarities_pairs[j][i] = similarities_pairs[i][j] elif i == j: similarities_pairs[i][j] = 1 if i % 500 == 0:
m, 2, useFeatures=True, nBits=nbits) FPDICT['fcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=nbits) FPDICT['fcfc2'] = lambda m: Chem.GetMorganFingerprint(m, 1, useFeatures=True) FPDICT['fcfc4'] = lambda m: Chem.GetMorganFingerprint(m, 2, useFeatures=True) FPDICT['fcfc6'] = lambda m: Chem.GetMorganFingerprint(m, 3, useFeatures=True) FPDICT['lecfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect( m, 2, nBits=nbits_long) FPDICT['lecfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect( m, 3, nBits=nbits_long) FPDICT['lfcfp4'] = lambda m: Chem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=nbits_long) FPDICT['lfcfp6'] = lambda m: Chem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=nbits_long) FPDICT['maccs'] = lambda m: MACCSkeys.GenMACCSKeys(m) FPDICT['ap'] = lambda m: Pairs.GetAtomPairFingerprint(m) FPDICT['tt'] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m) FPDICT['hashap'] = lambda m: Desc.GetHashedAtomPairFingerprintAsBitVect( m, nBits=nbits) FPDICT[ 'hashtt'] = lambda m: Desc.GetHashedTopologicalTorsionFingerprintAsBitVect( m, nBits=nbits) FPDICT['rdk5'] = lambda m: Chem.RDKFingerprint( m, maxPath=5, fpSize=nbits, nBitsPerHash=2) FPDICT['rdk6'] = lambda m: Chem.RDKFingerprint( m, maxPath=6, fpSize=nbits, nBitsPerHash=2) FPDICT['rdk7'] = lambda m: Chem.RDKFingerprint( m, maxPath=7, fpSize=nbits, nBitsPerHash=2) if USE_AVALON: FPDICT['avalon'] = lambda m: pyAv.GetAvalonFP(m, nbits) FPDICT['avalon_l'] = lambda m: pyAv.GetAvalonFP(m, nbits_long)
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8, metric='Tanimoto', outMatrix=False):
    '''Clustering Structure based on Fingerprints in RDKit

    filename: Smile format file saving molecules. If set to None, use given "mols"
    mols: Input molecules. No use if set up "filename"
    cutoff: Similarity cutoff; 1 - cutoff is passed to ClusterFps
    fingerprint: Fingerprint to use:
        0 or else: RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    metric: Similarity metric for the matrix output (case-insensitive):
        Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
        Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
        An unknown name prints a warning and falls back to Tanimoto.
    outMatrix: Change output to a similarity matrix

    Return:
        outMatrix=True: the similarity matrix as a list of row lists
            (also printed with two decimals).
        Default output "clusters, clusterOut":
            clusters: Clusters containing molecule number.
            clusterOut: Molecular Cluster Number in List.
    '''
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        mols = list(Chem.SmilesMolSupplier(filename))
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    if fingerprint == 1:
        ## MACCS Fingerprint
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    elif fingerprint == 2:
        ## Atom Pair Fingerprint (AP)
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    elif fingerprint == 3:
        ## Topological Torsion Fingerprint (TT)
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol)
               for mol in mols]
    elif fingerprint == 4:
        ## Morgan Fingerprint similar to ECFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    elif fingerprint == 5:
        ## Morgan Fingerprint similar to FCFP4 Fingerprint
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True)
               for mol in mols]
    else:
        ## RDKit Topological Fingerprint (fingerprint == 0 or anything else)
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys are all lower-case because the lookup uses metric.lower().
        metricsAvailable = {
            "tanimoto": DataStructs.TanimotoSimilarity,
            "dice": DataStructs.DiceSimilarity,
            "cosine": DataStructs.CosineSimilarity,
            "sokal": DataStructs.SokalSimilarity,
            "russel": DataStructs.RusselSimilarity,
            "rogotgoldberg": DataStructs.RogotGoldbergSimilarity,
            "allbit": DataStructs.AllBitSimilarity,
            "kulczynski": DataStructs.KulczynskiSimilarity,
            "mcconnaughey": DataStructs.McConnaugheySimilarity,
            "asymmetric": DataStructs.AsymmetricSimilarity,
            "braunblanquet": DataStructs.BraunBlanquetSimilarity,
        }
        if metric.lower() not in metricsAvailable:
            print("The given metric is unknown!")
            metric = 'Tanimoto'
        simMetrics = metricsAvailable[metric.lower()]

        ### Calculate Fingerprint similarity Matrix
        # Build each row separately; [[0.0]*n]*n would alias one row n times.
        simdm = [[0.0] * molnums for _ in range(molnums)]
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                simdm[i][j] = DataStructs.FingerprintSimilarity(
                    fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(
                    fps[j], fps[i], metric=simMetrics)

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("")
        for row in simdm:
            print(" ".join('%3.2f' % value for value in row))
        return simdm

    # NOTE(review): the clustering call always uses 'Tanimoto' and ignores
    # the `metric` argument, as in the original - confirm this is intended.
    clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
    clusterOut = [0] * len(mols)
    # Cluster numbers start at 1; 0 means "not assigned".
    for clusterID, cluster in enumerate(clusters, 1):
        for idx in cluster:
            clusterOut[idx] = clusterID
    return clusters, clusterOut
def smiles2bob2(smiles):
    """Return the non-zero atom-pair counts of one SMILES string.

    The molecule is parsed, explicit hydrogens are added, and the
    dictionary of non-zero elements of its atom-pair fingerprint is
    returned.
    """
    molecule_with_hs = chem.AddHs(chem.MolFromSmiles(smiles))
    fingerprint = Pairs.GetAtomPairFingerprint(molecule_with_hs)
    return fingerprint.GetNonzeroElements()
def eval_similarity(fp_list, dim, evaluator):
    """Return mean and standard deviation of all pairwise similarities.

    ``fp_list[k][dim]`` is the fingerprint of item ``k`` to compare;
    ``evaluator(a, b)`` is called once for every unordered pair (i < j).
    """
    count = len(fp_list)
    pair_scores = np.array([
        evaluator(fp_list[first][dim], fp_list[second][dim])
        for first in range(count - 1)
        for second in range(first + 1, count)
    ])
    return np.mean(pair_scores), np.std(pair_scores)


if __name__ == '__main__':
    # Input file is the first command-line argument.
    input_path = sys.argv[1]

    # One fingerprint function per column of the fingerprint list.
    fp_func_list = [
        lambda mol: AllChem.GetMorganFingerprint(mol, 2),
        MACCSkeys.GenMACCSKeys,
        Pairs.GetAtomPairFingerprint,
        FingerprintMols.FingerprintMol,
    ]
    # Similarity function matching each fingerprint above, by position.
    evaluators = [
        DataStructs.DiceSimilarity,
        DataStructs.FingerprintSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.FingerprintSimilarity,
    ]

    fp_list = get_fp_list(input_path, fp_func_list)
    for column, evaluator in enumerate(evaluators):
        mean_similarity, spread = eval_similarity(fp_list, column, evaluator)
        # Report the mean as a distance (1 - similarity).
        print(1 - mean_similarity, spread)
def smiles2bob(listofsmiles):
    """Lazily yield the non-zero atom-pair counts of each SMILES string.

    For every SMILES in ``listofsmiles`` the molecule is parsed, explicit
    hydrogens are added, and the dictionary of non-zero elements of its
    atom-pair fingerprint is yielded.
    """
    for smiles_string in listofsmiles:
        molecule_with_hs = chem.AddHs(chem.MolFromSmiles(smiles_string))
        yield Pairs.GetAtomPairFingerprint(
            molecule_with_hs).GetNonzeroElements()
def sim_two_serial():
    """Build the combined similarity matrix between two SMILES lists.

    Prompts for two CSV paths; each file must have a "smiles" column.
    Every molecule is described by eight fingerprints (topological, MACCS,
    atom pairs, topological torsions, ECFP4, ECFP6, FCFP4, FCFP6) and each
    pair (list 1 x list 2) gets a weighted combination of the eight
    similarities, divided by the constant ``maxs``.

    Returns:
        A ``pandas.DataFrame`` of shape (len(list 1), len(list 2)).  Rows
        and columns belonging to SMILES that could not be parsed are
        filled with the marker value 10.
    """

    def parse_smiles(smiles_column, list_number):
        """Parse SMILES; return (molecules with None for failures, bad indices)."""
        molecules, bad_indices = [], []
        for index, smi in enumerate(smiles_column):
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                print('smile with number:', index,
                      'in list %d could not be converted to molecule' %
                      list_number)
                bad_indices.append(index)
            molecules.append(mol)
        return molecules, bad_indices

    def all_fingerprints(mol):
        """Return the eight fingerprints of one molecule as a tuple."""
        return (
            FingerprintMols.FingerprintMol(mol),
            MACCSkeys.GenMACCSKeys(mol),
            Pairs.GetAtomPairFingerprint(mol),
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol),
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024),
            AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024),
            AllChem.GetMorganFingerprintAsBitVect(
                mol, 2, nBits=1024, useFeatures=True),
            AllChem.GetMorganFingerprintAsBitVect(
                mol, 3, nBits=1024, useFeatures=True),
        )

    def combined_similarity(fps_a, fps_b):
        """Weighted combination of the eight per-fingerprint similarities."""
        similarities_topol = DataStructs.FingerprintSimilarity(
            fps_a[0], fps_b[0])
        similarities_maccs = DataStructs.FingerprintSimilarity(
            fps_a[1], fps_b[1])
        similarities_pairs = DataStructs.DiceSimilarity(fps_a[2], fps_b[2])
        similarities_tts = DataStructs.DiceSimilarity(fps_a[3], fps_b[3])
        similarities_ecfp4 = DataStructs.FingerprintSimilarity(
            fps_a[4], fps_b[4])
        similarities_ecfp6 = DataStructs.FingerprintSimilarity(
            fps_a[5], fps_b[5])
        similarities_fcfp4 = DataStructs.FingerprintSimilarity(
            fps_a[6], fps_b[6])
        similarities_fcfp6 = DataStructs.FingerprintSimilarity(
            fps_a[7], fps_b[7])
        return (
            0.5 * (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
            0.5 * (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
            0.5 * (similarities_tts / 0.7 + similarities_pairs / 0.7) +
            similarities_maccs / 0.85 + similarities_topol / 0.75) / 5

    # Load data
    path1 = input("Path for list 1: ")
    path2 = input("Path for list 2: ")
    smis1 = pd.read_csv(path1)["smiles"]
    smis2 = pd.read_csv(path2)["smiles"]
    l1 = len(smis1)
    l2 = len(smis2)
    l = l1 * l2
    # Progress roughly every 5 %.  Clamp to 1: round(l / 20) is 0 for ten
    # pairs or fewer, which made `k % lp` raise ZeroDivisionError.
    lp = max(1, round(l / 20))

    # Get molecules from smiles
    molecules1, bad1 = parse_smiles(smis1, 1)
    molecules2, bad2 = parse_smiles(smis2, 2)

    # Final output matrix
    similarity = np.zeros(shape=(l1, l2), dtype=np.float32)

    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    print('Begining fingerprint calculation...wait')
    # None marks an unparsable SMILES so list positions match CSV rows.
    fps1 = [None if mol is None else all_fingerprints(mol)
            for mol in molecules1]
    print('Begining fingerprint calculation...50%')
    fps2 = [None if mol is None else all_fingerprints(mol)
            for mol in molecules2]
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')
    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i, fps_a in enumerate(fps1):
        for j, fps_b in enumerate(fps2):
            if fps_a is not None and fps_b is not None:
                similarity[i][j] = combined_similarity(fps_a, fps_b)
            # NOTE(review): the source indentation was lost; the progress
            # counter is assumed to tick once per visited pair.
            k = k + 1
            if k % lp == 0:
                print('running:', (k / l) * 100, '%')

    similarity = similarity / maxs
    # Mark everything involving an unparsable SMILES with the value 10.
    similarity[bad1, :] = 10
    similarity[:, bad2] = 10
    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
def sim_one_serial():
    """Build the combined self-similarity matrix of one SMILES list.

    Prompts for a CSV path; the file must have a "smiles" column.  Every
    molecule is described by eight fingerprints (topological, MACCS, atom
    pairs, topological torsions, ECFP4, ECFP6, FCFP4, FCFP6) and each pair
    gets a weighted combination of the eight similarities, divided by the
    constant ``maxs``.  Only pairs with i >= j are computed; the result is
    mirrored.

    Returns:
        A square ``pandas.DataFrame`` with one row and column per CSV row.
        Rows and columns belonging to SMILES that could not be parsed are
        filled with the marker value 10.
    """

    def all_fingerprints(mol):
        """Return the eight fingerprints of one molecule as a tuple."""
        return (
            FingerprintMols.FingerprintMol(mol),
            MACCSkeys.GenMACCSKeys(mol),
            Pairs.GetAtomPairFingerprint(mol),
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol),
            AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024),
            AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024),
            AllChem.GetMorganFingerprintAsBitVect(
                mol, 2, nBits=1024, useFeatures=True),
            AllChem.GetMorganFingerprintAsBitVect(
                mol, 3, nBits=1024, useFeatures=True),
        )

    def combined_similarity(fps_a, fps_b):
        """Weighted combination of the eight per-fingerprint similarities."""
        similarities_topol = DataStructs.FingerprintSimilarity(
            fps_a[0], fps_b[0])
        similarities_maccs = DataStructs.FingerprintSimilarity(
            fps_a[1], fps_b[1])
        similarities_pairs = DataStructs.DiceSimilarity(fps_a[2], fps_b[2])
        similarities_tts = DataStructs.DiceSimilarity(fps_a[3], fps_b[3])
        similarities_ecfp4 = DataStructs.FingerprintSimilarity(
            fps_a[4], fps_b[4])
        similarities_ecfp6 = DataStructs.FingerprintSimilarity(
            fps_a[5], fps_b[5])
        similarities_fcfp4 = DataStructs.FingerprintSimilarity(
            fps_a[6], fps_b[6])
        similarities_fcfp6 = DataStructs.FingerprintSimilarity(
            fps_a[7], fps_b[7])
        return (
            0.5 * (similarities_ecfp4 / 0.65 + similarities_ecfp6 / 0.6) +
            0.5 * (similarities_fcfp4 / 0.65 + similarities_fcfp6 / 0.6) +
            0.5 * (similarities_tts / 0.7 + similarities_pairs / 0.7) +
            similarities_maccs / 0.85 + similarities_topol / 0.75) / 5

    # Load data
    path = input("Path for list : ")
    smis = pd.read_csv(path)["smiles"]
    l = len(smis)
    # Progress step.  Clamp to 1: round(l * l / 20) is 0 for three
    # molecules or fewer, which made `k % lp` raise ZeroDivisionError.
    lp = max(1, round(l * l / 20))

    # Get molecules from smiles; None marks an unparsable SMILES so that
    # list positions keep matching CSV rows.
    bad = []
    molecules = []
    for i, smi in enumerate(smis):
        m = Chem.MolFromSmiles(smi)
        if m is None:
            print('smile with number:', i,
                  'in list could not be converted to molecule')
            bad.append(i)
        molecules.append(m)

    # Final output matrix
    similarity = np.zeros(shape=(l, l), dtype=np.float32)

    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs
    from rdkit.Chem.AtomPairs import Torsions
    from rdkit.Chem import AllChem

    print('Begining fingerprint calculation...wait')
    fps = [None if mol is None else all_fingerprints(mol)
           for mol in molecules]
    print('Begining fingerprint calculation...done\n')

    print('Begining of fingerprints similarity calculation\n')
    k = 0
    maxs = 2 / (0.65 * 10) + 2 / (0.6 * 10) + 2 / (0.7 * 10) + 1 / (
        0.75 * 5) + 1 / (0.85 * 5)
    for i in range(l):
        fps_a = fps[i]
        # Lower triangle including the diagonal (i >= j); mirrored below.
        for j in range(i + 1):
            fps_b = fps[j]
            if fps_a is not None and fps_b is not None:
                similarity[i][j] = combined_similarity(fps_a, fps_b)
                similarity[j][i] = similarity[i][j]
            # NOTE(review): the source indentation was lost; the progress
            # counter is assumed to tick once per visited (i >= j) pair.
            k = k + 1
            if k % lp == 0:
                print('running:', (k / (l * l / 2)) * 100, '%')

    similarity = similarity / maxs
    # Mark everything involving an unparsable SMILES with the value 10.
    similarity[bad, :] = 10
    similarity[:, bad] = 10
    print('End of fingerprints similarity calculation')
    return pd.DataFrame(similarity)
m, 3, useFeatures=True ) fpdict["lecfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, nBits=longbits ) fpdict["lecfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, nBits=longbits ) fpdict["lfcfp4"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 2, useFeatures=True, nBits=longbits ) fpdict["lfcfp6"] = lambda m: AllChem.GetMorganFingerprintAsBitVect( m, 3, useFeatures=True, nBits=longbits ) fpdict["maccs"] = lambda m: MACCSkeys.GenMACCSKeys(m) fpdict["ap"] = lambda m: Pairs.GetAtomPairFingerprint(m) fpdict["tt"] = lambda m: Torsions.GetTopologicalTorsionFingerprintAsIntVect(m) fpdict[ "hashap" ] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( m, nBits=nbits ) fpdict[ "hashap_cas_length" ] = lambda m: rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect( m, nBits=n_cas_bits ) fpdict[ "hashtt" ] = lambda m: rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect( m, nBits=nbits