def molecular_similarity(best, parent_candidates, all=False):
    """
    Return the similarity of ``best`` to its closest relative in a parent pool.

    Parameters
    ----------
    best : object
        Chromosome object, the current mutated candidate. Only its ``Mol``
        attribute is read.
    parent_candidates : array
        Parent pool of molecules to compare with ``best``, given as SMILES.
    all : boolean, optional, default = False
        If False, only the default ``FingerprintSimilarity`` metric is used
        (Tanimoto, per the original documentation of this function).
        If True, Tanimoto, Dice, Cosine, Sokal, Kulczynski and McConnaughey
        similarities are all evaluated.

    Returns
    ----------
    similarity_score : float or list of float
    similarity_index : int or list of int
        If ``all=False``: the best score and the index (into
        ``parent_candidates``) of the closest relative.
        If ``all=True``: one best score and one index per metric, in the
        metric order listed above.

    Raises
    ------
    ValueError
        If ``parent_candidates`` is empty (there is no closest relative).
    """
    # len() rather than truthiness: the pool may be a numpy array.
    if len(parent_candidates) == 0:
        raise ValueError("parent_candidates must contain at least one SMILES")

    # Fingerprints depend neither on the metric nor on the loop position, so
    # each one is computed exactly once.
    best_fp = FingerprintMols.FingerprintMol(best.Mol)
    parent_fps = [
        FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))
        for smiles in parent_candidates
    ]

    if not all:
        scores = [
            DataStructs.FingerprintSimilarity(best_fp, parent_fp)
            for parent_fp in parent_fps
        ]
        top_score = max(scores)
        return top_score, scores.index(top_score)

    metrics = [
        DataStructs.TanimotoSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity,
        DataStructs.SokalSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity,
    ]
    scores = []
    indices = []
    for metric in metrics:
        per_parent = [
            DataStructs.FingerprintSimilarity(best_fp, parent_fp, metric=metric)
            for parent_fp in parent_fps
        ]
        top_score = max(per_parent)
        scores.append(top_score)
        # index() returns the first parent reaching the maximum, as before.
        indices.append(per_parent.index(top_score))
    return scores, indices
def similarityMeasure(fps, neg, mol2):
    """Score ``mol2`` against a positive and a negative reference fingerprint.

    A 2D fingerprint is generated for ``mol2`` using the ``sigFactory`` name
    from the enclosing scope, then compared with ``fps`` and ``neg`` using the
    Tanimoto metric.

    Returns
    -------
    tuple
        ``(similarity_to_fps, similarity_to_fps - similarity_to_neg)``.
    """
    candidate_fp = Generate.Gen2DFingerprint(mol2, sigFactory)
    tanimoto = DataStructs.TanimotoSimilarity

    positive_score = DataStructs.FingerprintSimilarity(
        fps, candidate_fp, metric=tanimoto)
    negative_score = DataStructs.FingerprintSimilarity(
        neg, candidate_fp, metric=tanimoto)

    margin = positive_score - negative_score
    return positive_score, margin
def mols_similarity(ms_smiles=('CCOC', 'CCO', 'COC')):
    """Print a fingerprint and four pairwise similarities for three molecules.

    Parameters
    ----------
    ms_smiles : sequence of str
        SMILES strings. At least three entries are needed because the
        molecules at positions 0, 1 and 2 are compared with each other.

    Prints, in order: the fingerprint object of molecule 0, then the
    similarities of the pairs (0, 1), (0, 2), (1, 2) and (0, 0).
    Nothing is returned.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    ms = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles]
    fps = [FingerprintMols.FingerprintMol(x) for x in ms]

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(fps[0])
    for first, second in ((0, 1), (0, 2), (1, 2), (0, 0)):
        print(DataStructs.FingerprintSimilarity(fps[first], fps[second]))
def get_distance_func(name):
    """Return the representation builder and distance function for ``name``.

    Parameters
    ----------
    name : str
        ``'RDK/T'`` selects ``Chem.RDKFingerprint``; ``'GOBI/T'`` selects
        ``Generate.Gen2DFingerprint`` with ``Gobbi_Pharm2D.factory``.

    Returns
    -------
    tuple
        ``(make_representation, distf)``. ``make_representation(chem)`` reads
        ``chem.mol`` and returns its fingerprint; ``distf(x, y)`` returns
        ``1.0 - DataStructs.FingerprintSimilarity(x, y)``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported measures.
    """
    if name == 'RDK/T':
        def make_representation(chem):
            return Chem.RDKFingerprint(chem.mol)
    elif name == 'GOBI/T':
        def make_representation(chem):
            return Generate.Gen2DFingerprint(chem.mol, Gobbi_Pharm2D.factory)
    else:
        # The message previously referenced an undefined ``job`` object, so
        # this branch failed with NameError instead of reporting the name.
        raise ValueError('Unknown similarity measure: %s' % name)

    def distf(x, y):
        return 1.0 - DataStructs.FingerprintSimilarity(x, y)

    return (make_representation, distf)
def check_mol_similarity():
    """Print a quick RDKit similarity sanity check for three small ethers/alcohols.

    Fingerprints ``CCOC``, ``CCO`` and ``COC``, then prints the fingerprint
    object of the first molecule followed by the similarities of the pairs
    (0, 1), (0, 2), (1, 2) and (0, 0). Nothing is returned.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    ms = [Chem.MolFromSmiles(smiles) for smiles in ('CCOC', 'CCO', 'COC')]
    fps = [FingerprintMols.FingerprintMol(x) for x in ms]

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(fps[0])
    for first, second in ((0, 1), (0, 2), (1, 2), (0, 0)):
        print(DataStructs.FingerprintSimilarity(fps[first], fps[second]))
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X, simType):
    """Summarise the similarity of ``predEx`` to its ``X`` nearest training examples.

    Parameters
    ----------
    trainSmilesList : sequence
        Items are passed straight to the fingerprint function, so despite the
        name they are presumably RDKit molecules -- TODO confirm with callers.
        Must be index-aligned with ``train``.
    train : iterable
        Training examples; ``ex[nameAttr].value`` is used as the example key.
    predEx : mapping
        Query example; ``predEx[smilesAttrName].value`` is parsed as SMILES.
    smilesAttrName, nameAttr : str
        Attribute names used for the lookups described above.
    X : int
        Number of most-similar training examples to keep.
    simType : str
        ``"Topological"`` (FingerprintMols + FingerprintSimilarity),
        ``"Morgan"`` (radius-2 Morgan + Dice) or
        ``"MACCS"`` (MACCS keys + FingerprintSimilarity).

    Returns
    -------
    tuple
        ``(medSim, stdSim, minSim, maxSim, entropy, entropyClosest)``, each
        rounded to 3 decimals. The two entropy values come from
        ``getRespVar`` applied to the top ``X`` and the top ``X // 2``
        similarities respectively.

    Raises
    ------
    ValueError
        If ``simType`` is not supported. Previously a message was printed and
        the function then crashed on unbound local variables.
    """
    if simType == "Topological":
        make_fp = FingerprintMols.FingerprintMol
        compare = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        def make_fp(mol):
            return AllChem.GetMorganFingerprint(mol, 2)
        compare = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        make_fp = MACCSkeys.GenMACCSKeys
        compare = DataStructs.FingerprintSimilarity
    else:
        raise ValueError(
            "This type of sim is not implemented: %s" % (simType,))

    fpsTrain = [make_fp(x) for x in trainSmilesList]
    fp = make_fp(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = compare(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # Keep only the X highest similarities.
    simList.sort(reverse=True)
    simList = simList[0:X]

    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)
    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division: "X / 2" yields a float on Python 3, which is not a
    # valid slice bound. On Python 2 ints the result is unchanged.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)
    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
def pipe_sim_filter(stream, query, cutoff=0.8, summary=None, comp_id="pipe_sim_filter"):
    """Yield the records of `stream` whose similarity to `query` is at least `cutoff`.

    `query` is a Smiles string. Records without a `mol` entry are skipped.
    When a record carries the field `FP_b64`, that base64-encoded, pickled
    fingerprint is used; otherwise the fingerprint is generated on the fly
    from `rec["mol"]` (much slower, per the original note).

    If `summary` is given, `summary[comp_id]` is updated with the running
    number of records passed so far each time a record is yielded.

    If the query cannot be parsed, an error line is printed and the generator
    ends without yielding anything."""
    reference_mol = Chem.MolFromSmiles(query)
    if not reference_mol:
        print("* {} ERROR: could not generate query from SMILES.".format(
            comp_id))
        return

    reference_fp = FingerprintMols.FingerprintMol(reference_mol)
    passed = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            candidate_fp = FingerprintMols.FingerprintMol(rec["mol"])
        else:
            # NOTE(review): pickle.loads executes arbitrary code when fed
            # untrusted data -- only use this with streams you trust.
            candidate_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        score = DataStructs.FingerprintSimilarity(reference_fp, candidate_fp)
        if score >= cutoff:
            passed += 1
            if summary is not None:
                summary[comp_id] = passed
            yield rec
def test90BulkDistances(self):
    """
    verify that the base similarity (tanimoto) works using an 18 fp regression
    panel of pubchem compounds (chosen to have different lengths: 5x2048,
    5x1024, 5x512, 3x256)

    The fingerprints are loaded from ``pubchem_fps.pkl`` and the expected
    values from ``pubchem_fps.dm.pkl``; expected values are stored in
    upper-triangle order (i < j, row by row).
    """
    import os
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle  # Python 3
    from rdkit import DataStructs
    from rdkit import RDConfig

    data_dir = os.path.join(RDConfig.RDCodeDir, 'DataStructs', 'test_data')
    # "with open" replaces the Python-2-only file() builtin and guarantees
    # the handles are closed.
    with open(os.path.join(data_dir, 'pubchem_fps.pkl'), 'rb') as handle:
        fps = pickle.load(handle)
    with open(os.path.join(data_dir, 'pubchem_fps.dm.pkl'), 'rb') as handle:
        dm = pickle.load(handle)

    dmIdx = 0
    for i in range(len(fps)):
        nmi, fpi = fps[i]
        for j in range(i + 1, len(fps)):
            nmj, fpj = fps[j]
            sim = DataStructs.FingerprintSimilarity(fpi, fpj)
            # assertTrue is the supported spelling of the removed failUnless.
            self.assertTrue(feq(sim, dm[dmIdx]))
            dmIdx += 1
def calc_appdom(training_set, out_model_dir):
    """Compute and persist the applicability-domain data of a training set.

    Every molecule in ``training_set`` is fingerprinted with
    ``FingerprintMols.FingerprintMol``. The radius is the mean plus one
    standard deviation of all pairwise distances
    (``1 - FingerprintSimilarity``) between those fingerprints.

    Both the fingerprint list and the radius are pickled into
    ``out_model_dir``; the file names embed ``output_ext``, which is read
    from the enclosing scope (it is not defined in this function).

    Returns
    -------
    tuple
        ``(appdom_fps, appdom_radius)``.
    """
    appdom_fps = [FingerprintMols.FingerprintMol(mol) for mol in training_set]

    count = len(appdom_fps)
    # Upper triangle only: each unordered pair is visited once.
    distances = np.array([
        1.0 - (DataStructs.FingerprintSimilarity(appdom_fps[i], appdom_fps[j]))
        for i in range(0, count - 1)
        for j in range(i + 1, count)
    ])
    appdom_radius = np.mean(distances) + np.std(distances)

    # Stored for later prediction runs.
    with open(out_model_dir + "/training-FPs_%s.dat" % output_ext, 'wb') as fps_file:
        pickle.dump(appdom_fps, fps_file)
    with open(out_model_dir + "/AD-radius_%s.dat" % output_ext, 'wb') as radius_file:
        pickle.dump(appdom_radius, radius_file)

    return appdom_fps, appdom_radius
def TanimotoDistances(fp1, fps):
    """
    Return one row of distances between a reference fingerprint and a list.

    Parameters
    ----------
    fp1: rdkit fingerprint
        The fingerprint used as reference
    fps: list
        The fingerprints to compare against ``fp1``

    Returns
    -------
    tanimotorow: list
        ``1 - FingerprintSimilarity(fp1, fp)`` for every ``fp`` in ``fps``,
        in input order (i.e. distances, not similarities).
    """
    from rdkit import DataStructs

    return [
        1 - DataStructs.FingerprintSimilarity(fp1, other)
        for other in fps
    ]
def tanimoto_similarity_average(generated_synt_valid_set : List[str]) -> float:
    '''
    Calculate the average similarity, based on the RDK fingerprint, among
    all the generated syntactically valid molecules.

    SMILES that RDKit cannot parse are skipped. Every unordered pair is
    counted once, and each molecule's comparison with itself is included
    (the inner loop starts at ``i``), as in the original implementation.

    Parameters
    ----------
    generated_synt_valid_set : list
        List of SMILES of the generated molecules.

    Returns
    -------
    float
        Average pairwise similarity, or ``0.0`` when no molecule could be
        parsed (previously this case raised ZeroDivisionError).
    '''
    mols_generated = []
    for smile in generated_synt_valid_set:
        molecule = Chem.MolFromSmiles(smile)
        if molecule:
            mols_generated.append(molecule)

    fps = [Chem.RDKFingerprint(mol) for mol in mols_generated]
    fing_mols_generated = [
        DataStructs.FingerprintSimilarity(fps[i], fps[j])
        for i in range(len(fps))
        for j in range(i, len(fps))
    ]

    if not fing_mols_generated:
        return 0.0
    return sum(fing_mols_generated) / len(fing_mols_generated)
def getSimilarity(item1, item2):
    """
    Compare two ``(smiles, fingerprint)`` tuples.

    Returns ``(smiles1, smiles2, similarity)`` where the similarity is
    ``DataStructs.FingerprintSimilarity`` of the two fingerprints.
    """
    first_smiles = item1[0]
    second_smiles = item2[0]
    score = DataStructs.FingerprintSimilarity(item1[1], item2[1])
    return (first_smiles, second_smiles, score)
def test_ligand_data(target, ligand_name, lig):
    """Check that a ligand's stored SMILES and its SDF file describe the same molecule.

    The molecule built from ``lig._data["smiles"][0]`` (with explicit
    hydrogens added) is compared with the first record of
    ``<data_path>/<target dir>/02_ligands/<ligand_name>/crd/<ligand_name>.sdf``
    by atom count, RDK fingerprint similarity, maximum common substructure
    size and finally an isomorphism check against ``lig.get_molecule()``.
    """
    smiles_mol = Chem.AddHs(Chem.MolFromSmiles(lig._data["smiles"][0]))

    sdf_path = os.path.join(
        targets.data_path,
        targets.get_target_dir(target),
        "02_ligands",
        ligand_name,
        "crd",
        f"{ligand_name}.sdf",
    )
    sdf_mol = Chem.SDMolSupplier(sdf_path, removeHs=False)[0]

    assert smiles_mol.GetNumAtoms() == sdf_mol.GetNumAtoms()

    # Conformers are dropped on both molecules before the comparisons below.
    smiles_mol.RemoveAllConformers()
    sdf_mol.RemoveAllConformers()

    fingerprint_similarity = DataStructs.FingerprintSimilarity(
        Chem.RDKFingerprint(smiles_mol), Chem.RDKFingerprint(sdf_mol))
    assert pytest.approx(1.0, 1e-9) == fingerprint_similarity

    # The maximum common substructure must span the whole molecule.
    mcs = rdFMCS.FindMCS([smiles_mol, sdf_mol])
    assert mcs.numAtoms == smiles_mol.GetNumAtoms()
    assert mcs.numBonds == smiles_mol.GetNumBonds()

    ligand_molecule = lig.get_molecule()
    sdf_molecule = Molecule.from_rdkit(sdf_mol)
    assert Molecule.are_isomorphic(sdf_molecule, ligand_molecule)
def calcTani(i, fin_temp1_ori):
    """Compute one row of MACCS-key similarities against the SMILES in ``fin1``.

    Parameters
    ----------
    i : any
        Row identifier; printed for progress and returned unchanged.
    fin_temp1_ori : str
        SMILES of the reference molecule.

    Returns
    -------
    tuple
        ``(i, col)``. ``col`` is empty when ``fin_temp1_ori`` is empty or
        cannot be parsed. Otherwise it has one entry per item of the
        ``fin1`` sequence read from the enclosing scope:

        * ``0`` (int) when the item is the same string as ``fin_temp1_ori``,
        * ``'0'`` (str) when the item is empty or its similarity could not
          be computed,
        * the ``FingerprintSimilarity`` value otherwise.

        The int/str mix is kept as-is because callers may rely on it.
    """
    print(i)
    col = []

    if fin_temp1_ori == '':
        return i, col

    molFile = Chem.MolFromSmiles(fin_temp1_ori)
    if molFile is None:
        return i, col

    finger1 = AllChem.GetMACCSKeysFingerprint(molFile)
    for fin_temp2_ori in fin1:
        if fin_temp1_ori == fin_temp2_ori:
            col.append(0)
        elif fin_temp2_ori == '':
            col.append('0')
        else:
            try:
                molFile2 = Chem.MolFromSmiles(fin_temp2_ori)
                finger2 = AllChem.GetMACCSKeysFingerprint(molFile2)
                result = DataStructs.FingerprintSimilarity(finger1, finger2)
            except Exception:
                # Best effort: an entry that fails is recorded as '0'.
                # "except Exception" (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                result = '0'
            col.append(result)
    return i, col
def sim_i_j(row_i, row_j):
    """Return the Tanimoto similarity (a float) between two dataframe rows,
    computed from the rdkit fingerprints stored under their 'Fingerprint'
    entries."""
    fingerprint_i = row_i['Fingerprint']
    fingerprint_j = row_j['Fingerprint']
    return DataStructs.FingerprintSimilarity(
        fingerprint_i,
        fingerprint_j,
        metric=DataStructs.TanimotoSimilarity,
    )
def add_tanimoto_score(path_found_decoys):
    """Add a ``tanimoto`` column to the decoy CSV and write it back in place.

    For every row, the fingerprints of ``ligand_smile`` and ``zinc_smile``
    are built with ``make_fingerprint(..., fp='fp2')`` and compared with
    ``DataStructs.FingerprintSimilarity``. Fingerprints are cached per
    ``ligand_id`` / ``zinc`` value so each one is generated only once.

    The CSV at ``path_found_decoys`` is read with its first column as index,
    overwritten with the extended table, and the table is returned.
    """
    found_decoys = pd.read_csv(path_found_decoys, index_col=0)
    total = len(found_decoys)
    log(f'Total rows: {total}')

    ligand_fp_cache = {}
    decoy_fp_cache = {}
    tanimotos = []
    for ix, row in found_decoys.iterrows():
        # Progress message every 100000 index values.
        if ix % 100000 == 0:
            log(f'Added tanimoto scores to {ix} rows')

        ligand_key = row['ligand_id']
        if ligand_key not in ligand_fp_cache:
            ligand_fp_cache[ligand_key] = make_fingerprint(
                row['ligand_smile'], fp='fp2')
        ligand_fp = ligand_fp_cache[ligand_key]

        decoy_key = row['zinc']
        if decoy_key not in decoy_fp_cache:
            decoy_fp_cache[decoy_key] = make_fingerprint(
                row['zinc_smile'], fp='fp2')
        decoy_fp = decoy_fp_cache[decoy_key]

        tanimotos.append(DataStructs.FingerprintSimilarity(ligand_fp, decoy_fp))

    found_decoys = found_decoys.assign(tanimoto=tanimotos)
    found_decoys.to_csv(path_found_decoys)
    return found_decoys
def pair_similiar_fcfp4(self, valid_smiles):
    """Summarise the pairwise similarity of a list of SMILES.

    Fingerprints are feature-based Morgan bit vectors (radius 2, 1024 bits).
    Every unordered pair of distinct positions is compared once.

    Returns
    -------
    tuple
        ``(0, 0)`` (ints) when fewer than two SMILES are given; otherwise
        ``(rate, mean)`` as strings, where ``rate`` is the fraction of pairs
        with similarity above 0.75 and ``mean`` is the average similarity.
    """
    if len(valid_smiles) < 2:
        return 0, 0

    mols = [Chem.MolFromSmiles(smiles) for smiles in valid_smiles]
    fingerprints = [
        AllChem.GetMorganFingerprintAsBitVect(
            mol, 2, nBits=1024, useFeatures=True)
        for mol in mols
    ]

    count = len(fingerprints)
    pair_scores = [
        DataStructs.FingerprintSimilarity(fingerprints[i], fingerprints[j])
        for i in range(count)
        for j in range(i + 1, count)
    ]

    score_array = np.array(pair_scores)
    n_very_similar = score_array[score_array > 0.75].shape[0]
    very_similar_rate = n_very_similar / len(pair_scores)
    mean_score = sum(pair_scores) / len(pair_scores)
    return str(very_similar_rate), str(mean_score)
def similarity_search(fps_db, smile):
    """Return the mean similarity between ``smile`` and a fingerprint database.

    ``smile`` is parsed and fingerprinted with ``RDKFingerprint``; the result
    is compared with every fingerprint in ``fps_db`` and the mean of those
    similarities is returned.
    """
    query_fp = RDKFingerprint(MolFromSmiles(smile))
    scores = np.array([
        DataStructs.FingerprintSimilarity(db_fp, query_fp)
        for db_fp in fps_db
    ])
    return scores.mean()
def similarity(mol1, mol2, metric="Tanimoto"):
    """Compare similarity between ligands.

    Parameters
    ----------
    mol1 : str
        The smile code of a molecule
    mol2 : str
        The smile code of a molecule
    metric : str, default = 'Tanimoto'
        The ligand similarity metric; resolved through
        ``metric_fingerprints``.
        Options: Tanimoto, Dice, Russel, Cosine.

    Returns
    -------
    float
        The similarity, or ``0.0`` if the similarity computation itself
        fails. Errors raised while parsing the SMILES, fingerprinting or
        resolving the metric happen outside the guarded call and are
        not caught.

    Examples
    --------
    >>> from deepunion import fingerprints
    >>> fingerprints.similarity("CC(C)OCC", "CCOCC")
    0.666666666666666

    References
    ----------
    https://www.rdkit.org/docs/GettingStartedInPython.html
    """
    ms = [Chem.MolFromSmiles(mol1), Chem.MolFromSmiles(mol2)]
    fps = [FingerprintMols.FingerprintMol(x) for x in ms]
    m = metric_fingerprints(metric)

    try:
        return DataStructs.FingerprintSimilarity(fps[0], fps[1], m)
    except Exception:
        # Best-effort fallback kept from the original; narrowed from a bare
        # "except" so KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0.0
def check_for_known(all_enz_df, mol_fingerprint, threshold):
    """Return the rows whose product fingerprint is similar enough to a molecule.

    Parameters
    ----------
    all_enz_df : pandas.DataFrame
        Input table. NOTE: the columns ``Known``, ``Mol`` and ``Fingerprint``
        are dropped from this dataframe in place if present.
    mol_fingerprint : rdkit fingerprint
        Fingerprint of the query molecule.
    threshold : float
        Minimum Tanimoto similarity for a row to be reported.

    Returns
    -------
    pandas.DataFrame or None
        Rows of the fingerprinted table (from ``fingerprint_products``) with
        ``Known >= threshold``, sorted by ``Known`` descending. If there are
        none, a message is printed and ``None`` is returned.
    """
    for col in ('Known', 'Mol', 'Fingerprint'):
        if col in all_enz_df.columns:
            all_enz_df.drop(columns=col, inplace=True)

    # fingerprint the input dataframe and return it
    input_df = fingerprint_products(all_enz_df)

    # Assign the whole column at once. The previous chained assignment
    # (df['Known'].loc[index] = value) may write to a temporary copy and is
    # ambiguous when index labels repeat.
    input_df['Known'] = [
        DataStructs.FingerprintSimilarity(
            mol_fingerprint, row_fingerprint,
            metric=DataStructs.TanimotoSimilarity)
        for row_fingerprint in input_df['Fingerprint']
    ]

    known_df = input_df[input_df['Known'] >= threshold]
    if len(known_df) > 0:
        # Non-inplace sort: known_df is a filtered slice of input_df.
        return known_df.sort_values(by='Known', ascending=False)

    # call to promiscuous search code here
    print('No known enzymes. Beginning promiscuous search.')
    return None
def calc_similarity(compound_one, compound_two):
    """Return the (cached) fingerprint similarity of two named compounds.

    Compound properties are looked up in ``cg_props`` under the lower-cased
    name; results are memoised symmetrically in ``joint_sim``. Both mappings
    are read from the enclosing scope.

    Compounds whose ``"type"`` entries differ get a similarity of ``0.0``
    without any fingerprinting. Otherwise the ``"smiles"`` entries are parsed
    and compared with ``DataStructs.FingerprintSimilarity``.
    """
    if compound_one not in joint_sim:
        joint_sim[compound_one] = dict()
    elif compound_two in joint_sim[compound_one]:
        return joint_sim[compound_one][compound_two]
    if compound_two not in joint_sim:
        joint_sim[compound_two] = dict()

    props_one = cg_props[compound_one.lower()]
    props_two = cg_props[compound_two.lower()]

    # The original compared compound_one's type with itself, so this
    # shortcut could never trigger.
    if props_one["type"] != props_two["type"]:
        similarity = 0.0
    else:
        from rdkit import DataStructs
        from rdkit.Chem.Fingerprints import FingerprintMols
        from rdkit import Chem

        mol_one = Chem.MolFromSmiles(str(props_one["smiles"]))
        mol_two = Chem.MolFromSmiles(str(props_two["smiles"]))
        fp_1 = FingerprintMols.FingerprintMol(mol_one)
        fp_2 = FingerprintMols.FingerprintMol(mol_two)
        similarity = DataStructs.FingerprintSimilarity(fp_1, fp_2)

    joint_sim[compound_one][compound_two] = similarity
    joint_sim[compound_two][compound_one] = similarity
    return similarity
def tanimoto_similarity(self):
    """Return the fingerprint similarity of ``self.smiles1`` and ``self.smiles2``.

    Both SMILES are parsed, fingerprinted with
    ``FingerprintMols.FingerprintMol`` and compared with
    ``DataStructs.FingerprintSimilarity`` using its default metric.
    """
    first_mol = Chem.MolFromSmiles(self.smiles1)
    second_mol = Chem.MolFromSmiles(self.smiles2)
    first_fp = FingerprintMols.FingerprintMol(first_mol)
    second_fp = FingerprintMols.FingerprintMol(second_mol)
    return DataStructs.FingerprintSimilarity(first_fp, second_fp)
def _initTopN(self):
    """Populate ``self.topN`` with every data-source object scored against the probe.

    A fresh ``TopNContainer`` of size ``self.numToGet`` is created; each
    object from ``self.dataSource`` is fingerprinted with
    ``self.fingerprinter``, compared with ``self.probe`` using
    ``self.metric``, and inserted together with its similarity.
    """
    self.topN = TopNContainer.TopNContainer(self.numToGet)
    for candidate in self.dataSource:
        candidate_fp = self.fingerprinter(candidate)
        score = DataStructs.FingerprintSimilarity(
            candidate_fp, self.probe, self.metric)
        self.topN.Insert(score, candidate)
def mols_similarity_base_return(ms_smiles_mid, ms_smiles_base, property_of_base=None):
    """
    Build the similarity matrix between "mid" (target) and "base" molecules.

    For the linear system A * w = b, A and (optionally) b are returned;
    w is not computed here.

    Parameters
    ----------
    ms_smiles_mid, ms_smiles_base : dict
        Mappings whose keys are used as labels in the progress output. The
        fingerprints come from ``mols_similarity_base_core`` and are assumed
        to follow the key order of these mappings -- TODO confirm.
    property_of_base : sequence, optional
        Indexed by base position to fill ``b``. Ignored if falsy.

    Returns
    -------
    A : numpy.ndarray, shape (n_mid, n_base)
        ``A[m, b]`` is the similarity of mid molecule ``m`` to base ``b``.
    b : numpy.ndarray, shape (n_base,)
        Only returned (as ``(A, b)``) when ``property_of_base`` is truthy.
    """
    from rdkit import DataStructs

    [fps_base, fps_mid] = mols_similarity_base_core(ms_smiles_mid, ms_smiles_base)

    Nb, Nm = len(fps_base), len(fps_mid)
    A = np.zeros((Nm, Nb))
    b = np.zeros(Nb)

    # dict.keys() is not subscriptable on Python 3; materialise the labels.
    base_names = list(ms_smiles_base.keys())
    mid_names = list(ms_smiles_mid.keys())

    for (bx, f_b) in enumerate(fps_base):
        for (mx, f_m) in enumerate(fps_mid):
            print("Base:{0}, Target:{1}".format(base_names[bx], mid_names[mx]))
            A[mx, bx] = DataStructs.FingerprintSimilarity(f_b, f_m)
            print(A[mx, bx])
        if property_of_base:
            b[bx] = property_of_base[bx]
            print(b[bx])

    if property_of_base:
        print("b is obtained.")
        return A, b
    return A
def __init__(self, moli, molj):
    """
    Initialization function

    Parameters
    ----------

    moli : RDKit molecule object
        the first molecule used to perform the fingerprint calculation
    molj : RDKit molecule object
        the second molecule used to perform the fingerprint calculation

    Notes
    -----
    ``options`` (the user options object) is not a parameter: it is read
    from the enclosing scope, and only its ``verbose`` attribute is used.

    """
    # Set logging level and format
    logging.basicConfig(format='%(levelname)s:\t%(message)s',
                        level=logging.INFO)

    # Unless the user asked for 'pedantic' verbosity, raise the RDKit logger
    # level to CRITICAL.
    if options.verbose != 'pedantic':
        rdkit_logger = RDLogger.logger()
        rdkit_logger.setLevel(RDLogger.CRITICAL)

    # Local pointers to the passed molecules
    self.moli, self.molj = moli, molj

    # Fingerprints of both molecules and their similarity
    self.fps_moli = FingerprintMols.FingerprintMol(self.moli)
    self.fps_molj = FingerprintMols.FingerprintMol(self.molj)
    self.fps_tan = DataStructs.FingerprintSimilarity(
        self.fps_moli, self.fps_molj)
def evaluate_similarity_method(dataset, resultsdir):
    """Write, for every fingerprint type, the similarities of each series to its reference.

    Each item produced by ``get_rdkitmols(dataset)`` is a sequence of
    molecules: the first is the reference, the rest are compared with it.
    For every ``(name, calculator)`` pair in ``flib.fpdict`` the list of
    similarities is passed to ``Writer.write_result`` together with a flag
    that is True only for the first series.

    Fingerprints named ``ap`` or ``tt``, or starting with ``ecfc``/``fcfc``,
    are compared with ``DataStructs.DiceSimilarity``; all others with
    ``DataStructs.FingerprintSimilarity``.

    ``resultsdir`` is created if it does not exist.
    """
    # Setup results dir
    if not os.path.isdir(resultsdir):
        os.mkdir(resultsdir)
    writer = Writer(resultsdir)

    for i, d in enumerate(get_rdkitmols(dataset)):
        ref_mol = d[0]
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for fpName, fpCalculator in flib.fpdict.items():
            ref_fp = fpCalculator(ref_mol)

            use_dice = (fpName in ("ap", "tt")
                        or fpName.startswith(("ecfc", "fcfc")))
            if use_dice:
                compare = DataStructs.DiceSimilarity
            else:
                compare = DataStructs.FingerprintSimilarity

            tanimotos = [compare(ref_fp, fpCalculator(smol)) for smol in d[1:]]
            writer.write_result(fpName, tanimotos, i == 0)
def build_similarity_matrix():
    """Build and save the pairwise Tanimoto matrix of the optimised molecules.

    Reads one SMILES per line from ``all_opt_smiles.txt``, computes RDK
    fingerprints, fills a symmetric similarity matrix (diagonal = 1) and
    saves it to ``all_opt_scores.npy``; the element-wise squared matrix is
    saved to ``opt_scores_squared.npy``.

    Returns
    -------
    list of int
        Indices of the molecules that contain at least one of the "rarest"
        elements (atomic numbers 5, 14, 15, 34, 35, 53).
    """
    with open("/home/jeherr/tensorchem/tmp/all_opt_smiles.txt", "r") as f:
        opt_smiles = [line.strip("\n") for line in f]
    opt_mols = [Chem.MolFromSmiles(smile) for smile in opt_smiles]

    rarest_elements = {5, 14, 15, 34, 35, 53}
    # Compare atomic numbers: the original tested the Atom objects
    # themselves against these integers, which never matched.
    keep_idx = [
        i for i, mol in enumerate(opt_mols)
        if any(atom.GetAtomicNum() in rarest_elements
               for atom in mol.GetAtoms())
    ]

    opt_fps = [Chem.RDKFingerprint(mol) for mol in opt_mols]
    opt_scores = np.ones((len(opt_mols), len(opt_mols)))
    for i, fp1 in enumerate(opt_fps):
        # Upper triangle only; mirror each value into the lower triangle.
        for j, fp2 in enumerate(opt_fps[i + 1:], start=i + 1):
            score = DataStructs.FingerprintSimilarity(
                fp1, fp2, metric=DataStructs.TanimotoSimilarity)
            opt_scores[i, j] = opt_scores[j, i] = score

    np.save("/home/jeherr/tensorchem/tmp/all_opt_scores.npy", opt_scores)
    opt_scores = np.square(opt_scores)
    np.save("/home/jeherr/tensorchem/tmp/opt_scores_squared.npy", opt_scores)
    return keep_idx
def _mols_similarity_base_r0(ms_smiles_mid, ms_smiles_base):
    """
    Print the similarity of every base molecule to every target molecule.

    Input: dictionary type required such as {nick name: smiles code, ...}

    ``ms_smiles_mid`` holds the target molecules and ``ms_smiles_base`` the
    base molecules. Nothing is returned.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    # Materialise the labels once: dict.keys() can neither be concatenated
    # to a str nor (on Python 3) indexed.
    mid_names = list(ms_smiles_mid.keys())
    base_names = list(ms_smiles_base.keys())

    # processing for mid
    print("Target: {0}".format(mid_names))
    ms_mid = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles_mid.values()]
    fps_mid = [FingerprintMols.FingerprintMol(x) for x in ms_mid]

    # processing for base
    print("Base: {0}".format(base_names))
    ms_base = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles_base.values()]
    fps_base = [FingerprintMols.FingerprintMol(x) for x in ms_base]

    for (bx, f_b) in enumerate(fps_base):
        for (dx, f_d) in enumerate(fps_mid):
            print("Base:{0}, Target:{1}".format(base_names[bx], mid_names[dx]))
            print(DataStructs.FingerprintSimilarity(f_b, f_d))
def compute_fraggle_similarity_for_subs(inMol, qMol, qSmi, qSubs, tverskyThresh=0.8):
    """Return ``(rdkit_sim, fraggle_sim)`` for one query substructure.

    ``rdkit_sim`` is the Tanimoto similarity of the RDK fingerprints of
    ``qMol`` and ``inMol`` (generated with ``rdkitFpParams``).

    ``fraggle_sim`` is the larger of ``rdkit_sim`` and the similarity of the
    fingerprints of the two molecules after ``atomContrib`` has been applied
    with ``qSubs`` and ``tverskyThresh``. The modified query fingerprint is
    cached in ``modified_query_fps`` under the key ``"<qSubs>_<qSmi>"``.
    If fingerprinting the modified input molecule fails, a message is
    written to stderr and ``fraggle_sim`` is 0.0.
    """
    query_fp = Chem.RDKFingerprint(qMol, **rdkitFpParams)
    input_fp = Chem.RDKFingerprint(inMol, **rdkitFpParams)
    rdkit_sim = DataStructs.TanimotoSimilarity(query_fp, input_fp)

    cache_key = "%s_%s" % (qSubs, qSmi)
    if cache_key not in modified_query_fps:
        modified_query = atomContrib(qSubs, qMol, tverskyThresh)
        modified_query_fps[cache_key] = Chem.RDKFingerprint(
            modified_query, **rdkitFpParams)
    modified_query_fp = modified_query_fps[cache_key]

    rmMol = atomContrib(qSubs, inMol, tverskyThresh)

    # wrap in a try, catch
    try:
        modified_input_fp = Chem.RDKFingerprint(rmMol, **rdkitFpParams)
        modified_sim = DataStructs.FingerprintSimilarity(
            modified_query_fp, modified_input_fp)
        fraggle_sim = max(modified_sim, rdkit_sim)
    except Exception:  # pragma: nocover
        sys.stderr.write("Can't generate fp for: %s\n" % (Chem.MolToSmiles(rmMol, True)))
        fraggle_sim = 0.0

    return rdkit_sim, fraggle_sim
def morgan2_fp(SMILES, Library):
    """Compute pairwise Morgan (radius 2) similarities and summary statistics.

    Parameters
    ----------
    SMILES : iterable of str
        SMILES of the library members. At least two are required, because
        the statistics are computed over the pairwise similarities.
    Library : any
        Library label; stored as ``str(Library)`` in the statistics table.

    Returns
    -------
    dict
        ``"sim"``: the pairwise similarities sorted ascending;
        ``"y"``: ``k / n`` for ``k = 1..n`` (the empirical cumulative
        fraction matching ``sim``);
        ``"df"``: a one-row DataFrame with MIN, 1Q, MEDIAN, MEAN, 3Q, MAX,
        STD and Library. All statistics except MAX are rounded to 2
        decimals (MAX is left unrounded, as before).
    """
    ms = [Chem.MolFromSmiles(i) for i in SMILES]
    fps_Morgan = [AllChem.GetMorganFingerprintAsBitVect(x, 2) for x in ms]
    sim = sorted(
        DataStructs.FingerprintSimilarity(second, first)
        for first, second in it.combinations(fps_Morgan, 2)
    )

    # statistical values
    stat = {
        "MIN": [round(min(sim), 2)],
        # 1Q and MEDIAN were rounded to whole numbers (ndigits was missing),
        # which collapses values in [0, 1] to 0 or 1.
        "1Q": [round(np.percentile(sim, 25), 2)],
        "MEDIAN": [round(st.median(sim), 2)],
        "MEAN": [round(st.mean(sim), 2)],
        "3Q": [round(np.percentile(sim, 75), 2)],
        "MAX": [max(sim)],
        "STD": [round(st.stdev(sim), 2)],
        "Library": [str(Library)],
    }
    df = pd.DataFrame.from_dict(stat)
    fp_result = {
        "sim": sim,
        "y": np.arange(1, len(sim) + 1) / len(sim),
        "df": df,
    }
    return fp_result