def get_similarity_subset(fp1, fp2):
    """Score two fingerprints with the format-tolerant metrics only.

    Restricted to Tanimoto and Dice, the two metrics used here for
    fingerprints supplied as ExplicitBitVect or some other format.

    :param fp1: first fingerprint.
    :param fp2: second fingerprint.
    :return: list ``[tanimoto, dice]`` in that order.
    """
    tanimoto = DataStructs.TanimotoSimilarity(fp1, fp2)
    dice = DataStructs.DiceSimilarity(fp1, fp2)
    return [tanimoto, dice]
def testTorsionValues(self):
    """Compare topological-torsion fingerprints with stored reference pickles.

    Each reference is a base64-encoded serialized ``LongSparseIntVect``. The
    freshly computed fingerprint must have a Dice similarity of exactly 1.0
    to the reference and must also compare equal to it.
    """
    import base64
    testD = (
        ('CCCO', b'AQAAAAgAAAD/////DwAAAAEAAAAAAAAAIECAAAMAAAABAAAA\n'),
        ('CNc1ccco1',
         b'AQAAAAgAAAD/////DwAAAAkAAAAAAAAAIICkSAEAAAABAAAAKVKgSQEAAAABAAAAKVCgUAEAAAAB\nAAAAKVCgUQEAAAABAAAAKVCkCAIAAAABAAAAKdCkCAIAAAABAAAAKVCgSAMAAAABAAAAKVCkSAMA\nAAABAAAAIICkSAMAAAABAAAA\n'
         ),
    )
    for smi, txt in testD:
        # base64.decodestring was removed in Python 3.9. b64decode exists on
        # both Python 2 and 3 and discards the embedded newlines by default.
        pkl = base64.b64decode(txt)
        fp = rdMD.GetTopologicalTorsionFingerprint(Chem.MolFromSmiles(smi))
        fp2 = DataStructs.LongSparseIntVect(pkl)
        self.assertEqual(DataStructs.DiceSimilarity(fp, fp2), 1.0)
        self.assertEqual(fp, fp2)
def calculate_dice_similarity_distance(self, i, j):
    """Return the Dice distance between two stored fingerprints.

    :param i: index of the first fingerprint in ``self.fingerprint_list``.
    :param j: index of the second fingerprint in ``self.fingerprint_list``.
    :return: ``1 - DiceSimilarity`` of the two fingerprints.
    """
    first_fp = self.fingerprint_list[i]
    second_fp = self.fingerprint_list[j]
    similarity = DataStructs.DiceSimilarity(first_fp, second_fp)
    return 1 - similarity
def test5Dice(self):
    """Check Dice similarity on IntSparseIntVect for identity and a known pair.

    For the second case the element-wise minima sum to 9 and both vectors
    total 13, hence the expected value 2 * 9 / (13 + 13) = 18 / 26.
    """

    def build(entries):
        # Populate a length-5 sparse vector from (position, count) pairs.
        vect = ds.IntSparseIntVect(5)
        for position, count in entries:
            vect[position] = count
        return vect

    same = build([(4, 4), (0, 2), (3, 1)])
    self.assertTrue(feq(ds.DiceSimilarity(same, same), 1.0))

    left = build([(0, 2), (2, 1), (3, 4), (4, 6)])
    right = build([(1, 2), (2, 3), (3, 4), (4, 4)])
    # Checked in both argument orders: the result must be symmetric.
    self.assertTrue(feq(ds.DiceSimilarity(left, right), 18.0 / 26.))
    self.assertTrue(feq(ds.DiceSimilarity(right, left), 18.0 / 26.))
def sim_rdk_topo_fps(smiA, smisT):
    """Dice similarity of one SMILES against many, using atom-pair fingerprints.

    :param smiA: query SMILES string.
    :param smisT: list of SMILES strings to compare against.
    :return: list of Dice similarities, one per entry of ``smisT``, in order.
    """
    query_fp = Pairs.GetAtomPairFingerprint(getMolFromSmiles(smiA))
    # All target fingerprints are built before any similarity is computed.
    target_fps = [
        Pairs.GetAtomPairFingerprint(getMolFromSmiles(target_smi))
        for target_smi in smisT
    ]
    return [
        DataStructs.DiceSimilarity(query_fp, target_fp)
        for target_fp in target_fps
    ]
def morgan_similarity(smiles_1: List[str], smiles_2: List[str], radius: int,
                      sample_rate: float):
    """Print Dice-similarity statistics between two lists of SMILES strings.

    Every molecule of the (possibly sampled) first list is compared with every
    molecule of the (possibly sampled) second list using Morgan fingerprints.
    Mean, standard deviation, minimum, maximum and deciles are printed.

    :param smiles_1: A list of smiles strings.
    :param smiles_2: A list of smiles strings.
    :param radius: The radius of the morgan fingerprints.
    :param sample_rate: Rate at which to sample pairs of molecules for Morgan
        similarity (to reduce time). Values below 1.0 draw, with replacement,
        ``ceil(sqrt(sample_rate * n_pairs))`` molecules from each list.
    :raises ValueError: if there is no pair of molecules to compare.
    """
    num_pairs = len(smiles_1) * len(smiles_2)

    # Sample to improve speed
    if sample_rate < 1.0:
        sample_size = math.ceil(math.sqrt(sample_rate * num_pairs))
        sample_smiles_1 = np.random.choice(smiles_1, size=sample_size, replace=True)
        sample_smiles_2 = np.random.choice(smiles_2, size=sample_size, replace=True)
    else:
        sample_smiles_1, sample_smiles_2 = smiles_1, smiles_2

    sample_num_pairs = len(sample_smiles_1) * len(sample_smiles_2)
    if sample_num_pairs == 0:
        # np.min / np.max would otherwise fail with an opaque zero-size error.
        raise ValueError('No molecule pairs to compare: both SMILES lists must '
                         'be non-empty and the sample must not be empty.')

    # Parse and fingerprint each molecule once instead of once per pair.
    fps_1 = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smile), radius)
        for smile in sample_smiles_1
    ]
    fps_2 = [
        AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smile), radius)
        for smile in sample_smiles_2
    ]

    # Compute similarities
    similarities = np.array([
        DataStructs.DiceSimilarity(fp_1, fp_2)
        for fp_1, fp_2 in tqdm(product(fps_1, fps_2), total=sample_num_pairs)
    ])

    # Print results
    print()
    print(
        f'Average dice similarity = {np.mean(similarities):.4f} +/- {np.std(similarities):.4f}'
    )
    print(f'Minimum dice similarity = {np.min(similarities):.4f}')
    print(f'Maximum dice similarity = {np.max(similarities):.4f}')
    print()
    print('Percentiles for dice similarity')
    print(' | '.join([
        f'{i}% = {np.percentile(similarities, i):.4f}'
        for i in range(0, 101, 10)
    ]))
def rd_fingerprint_evaluation(references, candidates):
    """Compare references and candidates via RDKit path-based fingerprints.

    For every key of ``references`` five similarities (Tanimoto, Dice,
    Cosine, Sokal, McConnaughey) are computed against the candidate stored
    under the same key, each rounded to 4 decimals. A key missing from
    ``candidates`` contributes 0 to every metric. The per-metric means are
    printed.

    :param references: mapping of key to reference molecule.
    :param candidates: mapping of key to candidate molecule.
    :return: mean Tanimoto similarity, rounded to 4 decimals.
    """
    print("Calculating Similarity via RDFIngerprint Path Similarity")
    # Attribute names on DataStructs, paired with their report templates.
    metric_functions = ("TanimotoSimilarity", "DiceSimilarity",
                        "CosineSimilarity", "SokalSimilarity",
                        "McConnaugheySimilarity")
    report_templates = ("Tanimoto Similarity:{}", "Dice Similarity:{}",
                        "Cosine Similarity:{}", "Sokal Similarity:{}",
                        "McConnaughey Similarity:{}")
    collected = [[] for _ in metric_functions]

    for img in references:
        scores = [0] * len(metric_functions)
        if img in candidates:
            candidate_fp = rdmolops.RDKFingerprint(candidates[img],
                                                   fpSize=2048,
                                                   minPath=1,
                                                   maxPath=7)
            reference_fp = rdmolops.RDKFingerprint(references[img],
                                                   fpSize=2048,
                                                   minPath=1,
                                                   maxPath=7)
            scores = [
                round(getattr(DataStructs, fn_name)(reference_fp, candidate_fp), 4)
                for fn_name in metric_functions
            ]
        for bucket, score in zip(collected, scores):
            bucket.append(score)

    print("Done Calculating Similarity via RDFIngerprint Path Similarity")
    print("##########################################")
    for template, bucket in zip(report_templates, collected):
        print(template.format(round(np.mean(bucket), 4)))
    print("##########################################")
    return round(np.mean(collected[0]), 4)
def getSimilarity(self, reference, method='tanimoto', alpha=None, beta=None):
    """Return the similarity between ``reference`` and this object's IFP vector.

    :param reference: object exposing an ``IFPvector`` attribute.
    :param method: ``'tanimoto'``, ``'dice'`` or ``'tversky'``.
    :param alpha: forwarded to ``TverskySimilarity`` (``'tversky'`` only).
    :param beta: forwarded to ``TverskySimilarity`` (``'tversky'`` only).
    :return: the similarity value, or ``None`` for any other ``method``.
    """
    if method not in ('tanimoto', 'dice', 'tversky'):
        # Unrecognised methods fall through silently.
        return None
    if method == 'tanimoto':
        return DataStructs.TanimotoSimilarity(reference.IFPvector,
                                              self.IFPvector)
    if method == 'dice':
        return DataStructs.DiceSimilarity(reference.IFPvector, self.IFPvector)
    return DataStructs.TverskySimilarity(reference.IFPvector, self.IFPvector,
                                         alpha, beta)
def testPairValues(self):
    """Compare atom-pair fingerprints with stored reference pickles.

    Each reference is a base64-encoded serialized ``IntSparseIntVect``. The
    freshly computed fingerprint must have a Dice similarity of exactly 1.0
    to the reference and must also compare equal to it.
    """
    import base64
    testD = (
        ('CCCO',
         b'AQAAAAQAAAAAAIAABgAAACGECAABAAAAIoQIAAEAAABBhAgAAQAAACNEGAABAAAAQUQYAAEAAABC\nRBgAAQAAAA==\n'
         ),
        ('CNc1ccco1',
         b'AQAAAAQAAAAAAIAAEAAAACOECgABAAAAJIQKAAIAAABBhQoAAgAAAEKFCgABAAAAIsQKAAEAAABB\nxQoAAQAAAELFCgACAAAAIYQQAAEAAABChRAAAQAAAEOFEAACAAAAYYUQAAEAAAAjhBoAAQAAAEGF\nGgABAAAAQoUaAAIAAABhhRoAAQAAAEKIGgABAAAA\n'
         ),
    )
    for smi, txt in testD:
        # base64.decodestring was removed in Python 3.9. b64decode exists on
        # both Python 2 and 3 and discards the embedded newlines by default.
        pkl = base64.b64decode(txt)
        fp = rdMD.GetAtomPairFingerprint(Chem.MolFromSmiles(smi))
        fp2 = DataStructs.IntSparseIntVect(pkl)
        self.assertEqual(DataStructs.DiceSimilarity(fp, fp2), 1.0)
        self.assertEqual(fp, fp2)
def ECFP6_fp(mol, rc_names):
    """Cluster molecules by Morgan (radius 3) Dice similarity into a tree dict.

    A square similarity table is filled column by column, clustered with
    Ward linkage, and converted into a nested dict through the ``add_node``
    and ``label_tree`` helpers defined elsewhere.

    :param mol: iterable of RDKit molecules.
    :param rc_names: labels, one per molecule, in the same order.
    :return: dict with keys ``children`` and ``name`` describing the tree
        (presumably in a d3 dendrogram layout; confirm against ``add_node``).
    """
    fps = [AllChem.GetMorganFingerprint(x, 3) for x in mol]
    tc_df = pd.DataFrame(index=rc_names, columns=rc_names).fillna(0)
    for col_idx, col_fp in enumerate(fps):
        tc_df[rc_names[col_idx]] = [
            DataStructs.DiceSimilarity(col_fp, row_fp) for row_fp in fps
        ]
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray on both old and new pandas versions.
    clusters = linkage(tc_df.values, "ward")
    clust_tree = to_tree(clusters, rd=False)
    d3Dendro = dict(children=[], name=" ")
    add_node(clust_tree, d3Dendro)
    label_tree(d3Dendro["children"][0], rc_names)
    return d3Dendro
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X, simType):
    """Summarise the similarity of one example to its X nearest training examples.

    :param trainSmilesList: items passed straight to the fingerprint functions,
        one per entry of ``train``. NOTE(review): despite the name these are
        presumably RDKit molecules, not SMILES strings; confirm with callers.
    :param train: iterable of training examples, indexable by ``nameAttr``.
    :param predEx: example to predict; its ``smilesAttrName`` value is a SMILES.
    :param smilesAttrName: name of the SMILES attribute of ``predEx``.
    :param nameAttr: name of the identifier attribute of training examples.
    :param X: number of nearest neighbours to keep.
    :param simType: ``"Topological"``, ``"Morgan"`` or ``"MACCS"``.
    :return: tuple ``(medSim, stdSim, minSim, maxSim, entropy, entropyClosest)``,
        each rounded to 3 decimals; ``entropyClosest`` uses the nearest
        ``X // 2`` neighbours only.
    :raises ValueError: if ``simType`` is not one of the supported names, or
        (from ``min``) if there are no training examples.
    """
    if simType not in ("Topological", "Morgan", "MACCS"):
        # Previously this only printed a message and then crashed later on an
        # unbound variable; fail immediately with the same message instead.
        raise ValueError("This type of sim is not implemented  %s" % simType)

    if simType == "Topological":
        fpsTrain = [FingerprintMols.FingerprintMol(x) for x in trainSmilesList]
        fp = FingerprintMols.FingerprintMol(
            Chem.MolFromSmiles(predEx[smilesAttrName].value))
        similarity = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        fpsTrain = [AllChem.GetMorganFingerprint(x, 2) for x in trainSmilesList]
        fp = AllChem.GetMorganFingerprint(
            Chem.MolFromSmiles(predEx[smilesAttrName].value), 2)
        similarity = DataStructs.DiceSimilarity
    else:
        fpsTrain = [MACCSkeys.GenMACCSKeys(x) for x in trainSmilesList]
        fp = MACCSkeys.GenMACCSKeys(
            Chem.MolFromSmiles(predEx[smilesAttrName].value))
        similarity = DataStructs.FingerprintSimilarity

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = similarity(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    simList.sort(reverse=True)
    simList = simList[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)
    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an int on Python 3 as well.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)
    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
def calulate_similarities(ids, radius):
    """Build a table of Morgan/Dice similarities for the given compound ids.

    Reads the module-level ``smiles`` table (defined elsewhere): its
    ``smiles`` column supplies the library molecules and ``smiles.loc[cid]``
    the query molecule for each id. Progress is printed on a single line.

    :param ids: iterable of ids present in the index of ``smiles``.
    :param radius: Morgan fingerprint radius.
    :return: DataFrame indexed by id, with one column per row of ``smiles``.
    """
    library_mols = [Chem.MolFromSmiles(smi) for smi in smiles.smiles]
    library_fps = [
        AllChem.GetMorganFingerprint(lib_mol, radius) for lib_mol in library_mols
    ]
    rows = []
    for position, cid in enumerate(ids):
        query_mol = Chem.MolFromSmiles(smiles.loc[cid].smiles)
        query_fp = AllChem.GetMorganFingerprint(query_mol, radius)
        # The first cell carries the id so it can become the index below.
        row = [cid] + [
            DataStructs.DiceSimilarity(lib_fp, query_fp) for lib_fp in library_fps
        ]
        print(position, end='\r')
        rows.append(row)
    table = pd.DataFrame(rows).set_index(0)
    table.columns = smiles.index
    return table
def sim_rdk_morgan_fps(smiA, smisT):
    """Dice similarity of one SMILES against many, using Morgan fingerprints.

    Circular fingerprints of radius 2 are used.

    :param smiA: query SMILES string.
    :param smisT: list of SMILES strings to compare against.
    :return: list of Dice similarities, one per entry of ``smisT``, in order.
    """
    query_fp = rdk.AllChem.GetMorganFingerprint(getMolFromSmiles(smiA), 2)
    # All target fingerprints are built before any similarity is computed.
    target_fps = [
        rdk.AllChem.GetMorganFingerprint(getMolFromSmiles(target_smi), 2)
        for target_smi in smisT
    ]
    return [
        DataStructs.DiceSimilarity(query_fp, target_fp)
        for target_fp in target_fps
    ]
def get_similar_compound(condon):
    """Return the KEGG compounds most similar to a query SMILES.

    The query is drawn to ``static//compound_img//smiles_img.png`` and then
    compared (Morgan radius 2 on hydrogen-added molecules, Dice similarity
    rounded to 2 decimals) with every compound in ``data//kegg_smiles2.txt``.
    Each row of that file is ``<kegg id> <smiles>``.

    :param condon: mapping with keys ``'smiles'`` (query SMILES) and
        ``'MaxLength'`` (maximum number of hits to return).
    :return: list of ``[kegg_id, name, smiles, score]`` sorted by descending
        score, where ``name`` is ``compound_dict[kegg_id][0]``. Only compounds
        with a score above 0 are reported, so fewer than ``MaxLength`` hits
        may come back. Empty if the query SMILES cannot be parsed.
    """
    import heapq

    com = condon['smiles']
    save_img(com, 'static//compound_img//smiles_img.png', 300, 300)

    smiles_file_path = 'data//kegg_smiles2.txt'
    with open(smiles_file_path) as file:
        # Blank or malformed rows are skipped instead of raising IndexError.
        records = [
            (fields[0], fields[1])
            for fields in (line.split() for line in file)
            if len(fields) >= 2
        ]
    output_num = min(condon['MaxLength'], len(records))

    mol1 = Chem.MolFromSmiles(com)
    if mol1 is None:
        print('input smiles not exist')
        return []
    if output_num <= 0:
        return []
    mol1 = AllChem.AddHs(mol1)
    fps1 = AllChem.GetMorganFingerprint(mol1, 2)

    scored = []
    for kegg_id, item in records:
        mol2 = Chem.MolFromSmiles(item)
        if mol2 is None:
            continue
        mol2 = AllChem.AddHs(mol2)
        fps2 = AllChem.GetMorganFingerprint(mol2, 2)
        score = round(DataStructs.DiceSimilarity(fps1, fps2), 2)
        # Zero-score compounds were never admitted to the top list; keeping
        # that rule, but without the index-0 placeholders that used to leak
        # into the result when too few compounds scored above zero.
        if score > 0:
            scored.append((kegg_id, item, score))

    # nlargest is stable: among equal scores the earlier file row wins.
    best = heapq.nlargest(output_num, scored, key=lambda hit: hit[2])
    return [[kegg_id, compound_dict[kegg_id][0], item, score]
            for kegg_id, item, score in best]
def evaluate_distance(self) -> np.ndarray:
    """Return a normalized Dice distance between each walker and a random companion.

    Each walker is paired with another one through a random permutation of
    the walker indices. The distance of a pair is ``1 - DiceSimilarity`` of
    their observations. The resulting vector is passed through
    ``relativize_vector`` (defined elsewhere) and cast to ``float32``.

    In a more general scenario, any function that quantifies "how different
    two observations are" could be used, even if it is not a proper distance.
    """
    # Get random companion
    companions = np.random.permutation(np.arange(self.n_walkers, dtype=int))
    similarity = np.array([
        DataStructs.DiceSimilarity(self.observations[walker],
                                   self.observations[companions[walker]])
        for walker in range(self.n_walkers)
    ])
    distance = 1.0 - similarity
    return relativize_vector(distance).astype(np.float32)
def sort_similarity(mols, sort):
    """Greedily chain molecules so each one is the most similar to its predecessor.

    Repeatedly picks, from ``mols``, the entry whose fingerprint (element 0)
    has the highest Dice similarity to the fingerprint of the last entry of
    ``sort``, and moves it to the end of ``sort``. Ties, and the case where
    no similarity exceeds 0, resolve to the earliest entry of ``mols``.

    Both lists are modified in place: ``mols`` ends up empty.

    :param mols: list of unsorted entries; ``entry[0]`` is a fingerprint.
    :param sort: list of already ordered entries. It must hold at least one
        seed entry whenever ``mols`` is non-empty.
    :return: ``sort``, extended with every entry of ``mols``.
    """
    # Iterative on purpose: one recursion level per molecule hit Python's
    # recursion limit on large inputs, and an empty ``mols`` used to raise.
    while mols:
        anchor_fp = sort[-1][0]
        best_score, best_pos = 0, 0
        for pos, mol in enumerate(mols):
            score = DataStructs.DiceSimilarity(anchor_fp, mol[0])
            if score > best_score:
                best_score, best_pos = score, pos
        # Move the molecule from unsorted list to sorted list
        sort.append(mols.pop(best_pos))
    return sort
def atom_pairs_similarity(active_molecules1, test_molecules):
    """Best atom-pair Dice similarity of each test molecule to any active.

    :param active_molecules1: iterable of active RDKit molecules.
    :param test_molecules: iterable of RDKit molecules to score.
    :return: list with, per test molecule, the highest Dice similarity to an
        active molecule (``0`` when none exceeds zero or there are no actives).
    """
    active_fps = [Pairs.GetAtomPairFingerprint(m) for m in active_molecules1]
    test_fps = [Pairs.GetAtomPairFingerprint(m) for m in test_molecules]

    best_scores = []
    for test_fp in test_fps:
        # The leading 0 is the floor: max() returns the first maximal item,
        # so it is only replaced by a strictly greater similarity.
        scores = [0] + [
            DataStructs.DiceSimilarity(test_fp, active_fp)
            for active_fp in active_fps
        ]
        best_scores.append(max(scores))
    return best_scores
def ecfp_similarity(active_molecules1, test_molecules):
    """Best Morgan (radius 3) Dice similarity of each test molecule to any active.

    :param active_molecules1: iterable of active RDKit molecules.
    :param test_molecules: iterable of RDKit molecules to score.
    :return: list with, per test molecule, the highest Dice similarity to an
        active molecule (``0`` when none exceeds zero or there are no actives).
    """
    reference_fps = [AllChem.GetMorganFingerprint(m, 3) for m in active_molecules1]
    query_fps = [AllChem.GetMorganFingerprint(m, 3) for m in test_molecules]

    def best_match(query_fp):
        # Start from 0 and keep only strictly greater similarities; max()
        # returns the first maximal item, so the 0 floor survives ties.
        candidates = [0]
        candidates.extend(
            DataStructs.DiceSimilarity(query_fp, reference_fp)
            for reference_fp in reference_fps)
        return max(candidates)

    return [best_match(query_fp) for query_fp in query_fps]
def get_similarity_all(fp1, fp2):
    """Score two fingerprints, always supplied as SparseBitVect, with six metrics.

    :param fp1: first fingerprint.
    :param fp2: second fingerprint.
    :return: list ``[tanimoto, dice, cosine, russel, kulczynski, mcconnaughey]``.
        Sokal and Tversky are not part of the result.
    """
    # DataStructs.SokalSimilarity is deliberately left out of this tuple.
    metrics = (
        DataStructs.TanimotoSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity,
        DataStructs.RusselSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity,
    )
    return [metric(fp1, fp2) for metric in metrics]
def getTanDist(outMols):
    """Compute pairwise similarities between all molecules in ``outMols``.

    Every unordered pair is scored twice: with
    ``DataStructs.FingerprintSimilarity`` on topological fingerprints and
    with Dice similarity on Morgan (radius 2) fingerprints. The index pair
    being processed is printed as progress output.

    :param outMols: sequence of RDKit molecules.
    :return: tuple ``(tanDists, tanDistsMorgan)`` of equally long lists, each
        value rounded to 2 decimals, ordered by (outer index, inner index).
        Note that the second list holds Dice, not Tanimoto, values.
    """
    fps = [FingerprintMols.FingerprintMol(x) for x in outMols]
    # Fingerprint each molecule once instead of once per pair.
    morganFps = [AllChem.GetMorganFingerprint(x, 2) for x in outMols]

    tanDists = []
    tanDistsMorgan = []
    nMols = len(outMols)
    for outIdx in range(nMols):
        for inIdx in range(outIdx + 1, nMols):
            # Formatted so the output is the same on Python 2 and Python 3.
            print("%s %s" % (outIdx, inIdx))
            tanDist = DataStructs.FingerprintSimilarity(fps[outIdx], fps[inIdx])
            tanDistM = DataStructs.DiceSimilarity(morganFps[outIdx],
                                                  morganFps[inIdx])
            tanDists.append(round(tanDist, 2))
            tanDistsMorgan.append(round(tanDistM, 2))
    return tanDists, tanDistsMorgan
def morgan_fingerprint_evaluation(references, candidates):
    """Compare references and candidates via Morgan circular fingerprints.

    Circular fingerprints: https://doi.org/10.1021/ci100050t

    For every key of ``references`` five similarities (Tanimoto, Dice,
    Cosine, Sokal, McConnaughey) are computed on 1024-bit radius-2 Morgan
    bit vectors against the candidate stored under the same key, each
    rounded to 4 decimals. A key missing from ``candidates`` contributes 0
    to every metric. The per-metric means are printed.

    :param references: mapping of key to reference molecule.
    :param candidates: mapping of key to candidate molecule.
    :return: mean Tanimoto similarity, rounded to 4 decimals.
    """
    print("Calculating Similarity via Morgan based Circular Fingerprint")
    # Attribute names on DataStructs, paired with their report templates.
    metric_functions = ("TanimotoSimilarity", "DiceSimilarity",
                        "CosineSimilarity", "SokalSimilarity",
                        "McConnaugheySimilarity")
    report_templates = ("Tanimoto Similarity:{}", "Dice Similarity:{}",
                        "Cosine Similarity:{}", "Sokal Similarity:{}",
                        "McConnaughey Similarity:{}")
    collected = [[] for _ in metric_functions]

    for img in references:
        scores = [0] * len(metric_functions)
        if img in candidates:
            candidate_fp = AllChem.GetMorganFingerprintAsBitVect(
                candidates[img], 2, nBits=1024)
            reference_fp = AllChem.GetMorganFingerprintAsBitVect(
                references[img], 2, nBits=1024)
            scores = [
                round(getattr(DataStructs, fn_name)(reference_fp, candidate_fp), 4)
                for fn_name in metric_functions
            ]
        for bucket, score in zip(collected, scores):
            bucket.append(score)

    print("Done Calculating Similarity via Morgan based Circular Fingerprint")
    print("##########################################")
    for template, bucket in zip(report_templates, collected):
        print(template.format(round(np.mean(bucket), 4)))
    print("##########################################")
    return round(np.mean(collected[0]), 4)
def similar_molecules(self, mols):
    """
    Returns molecules from `mols` ordered by similarity.

    The most similar molecule is at index 0. Similarity is the Dice
    similarity between Morgan fingerprints of radius 4 of ``self.mol``
    and of each molecule in `mols`.

    Parameters
    ----------
    mols : :class:`iterable` of :class:`rdkit.Mol`
        A group of molecules to which similarity is compared.

    Returns
    -------
    :class:`list`
        A :class:`list` of the form,

        .. code-block:: python

            returned_list = [(0.89, mol1), (0.73, mol2), (0.34, mol3)]

        where the :class:`float` is the similarity of a given molecule
        in `mols` and the ``mol`` is the corresponding ``rdkit``
        molecule. Most similar molecule first.

    """

    def fingerprint(molecule):
        # Ring info and the property cache are refreshed before the
        # fingerprint is generated.
        rdkit.GetSSSR(molecule)
        molecule.UpdatePropertyCache(strict=False)
        return rdkit.GetMorganFingerprint(molecule, 4)

    own_fp = fingerprint(self.mol)
    scored = [
        (DataStructs.DiceSimilarity(own_fp, fingerprint(mol)), mol)
        for mol in mols
    ]
    # Sort on the score only, so molecules themselves are never compared.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored
def get_neighbour(self):
    """Find the closest training entry for every generated entry.

    Entry ``i`` of ``self.gen`` is represented by ``self.gen_fps[i]`` and
    entry ``j`` of ``self.training`` by ``self.train_fps[j]``; closeness is
    the Dice similarity of those fingerprints.

    Returns:
        List of ``(similarity, closest neighbour)`` tuples, one per generated
        entry. The tuple is ``(0, "")`` when no training entry has a
        similarity above zero.
    """
    all_neighbours = []
    for gen_idx in range(len(self.gen)):
        query_fp = self.gen_fps[gen_idx]
        best = (0, "")
        for train_idx in range(len(self.training)):
            score = DataStructs.DiceSimilarity(query_fp,
                                               self.train_fps[train_idx])
            # Strict comparison: the first best match wins on ties.
            if score > best[0]:
                best = (score, self.training[train_idx])
        all_neighbours.append(best)
    return all_neighbours
def test6BulkDice(self):
    """Check that BulkDiceSimilarity agrees with pairwise DiceSimilarity.

    Six random sparse vectors of length 10 are generated; the first one is
    compared against the rest both one by one and in bulk.
    """
    import random
    sz = 10
    nToSet = 5
    nVs = 6

    vs = []
    for _ in range(nVs):
        v = ds.IntSparseIntVect(sz)
        for _ in range(nToSet):
            # The count is drawn before the slot, matching the evaluation
            # order of ``v[randint(...)] = randint(...)``.
            count = random.randint(1, 10)
            slot = random.randint(0, sz - 1)
            v[slot] = count
        vs.append(v)

    probe, others = vs[0], vs[1:]
    baseDs = [ds.DiceSimilarity(probe, other) for other in others]
    bulkDs = ds.BulkDiceSimilarity(probe, others)
    for position, expected in enumerate(baseDs):
        self.assertTrue(feq(expected, bulkDs[position]))
def maacs_fingerprint_evaluation(references, candidates):
    """Compare references and candidates via MACCS keys.

    For every key of ``references`` five similarities (Tanimoto, Dice,
    Cosine, Sokal, McConnaughey) are computed on MACCS keys against the
    candidate stored under the same key, each rounded to 4 decimals. A key
    missing from ``candidates`` contributes 0 to every metric. The
    per-metric means are printed.

    :param references: mapping of key to reference molecule.
    :param candidates: mapping of key to candidate molecule.
    :return: mean Tanimoto similarity, rounded to 4 decimals.
    """
    print("Calculating Similarity via MACCS Keys")
    # Attribute names on DataStructs, paired with their report templates.
    metric_functions = ("TanimotoSimilarity", "DiceSimilarity",
                        "CosineSimilarity", "SokalSimilarity",
                        "McConnaugheySimilarity")
    report_templates = ("Tanimoto Similarity:{}", "Dice Similarity:{}",
                        "Cosine Similarity:{}", "Sokal Similarity:{}",
                        "McConnaughey Similarity:{}")
    collected = [[] for _ in metric_functions]

    for img in references:
        scores = [0] * len(metric_functions)
        if img in candidates:
            candidate_maccs = MACCSkeys.GenMACCSKeys(candidates[img])
            reference_maccs = MACCSkeys.GenMACCSKeys(references[img])
            scores = [
                round(getattr(DataStructs, fn_name)(reference_maccs, candidate_maccs), 4)
                for fn_name in metric_functions
            ]
        for bucket, score in zip(collected, scores):
            bucket.append(score)

    print("Done Calculating Similarity via MACCS Keys")
    print("##########################################")
    for template, bucket in zip(report_templates, collected):
        print(template.format(round(np.mean(bucket), 4)))
    print("##########################################")
    return round(np.mean(collected[0]), 4)
def compare_structure(smiles1, smiles2, fp_type="Morgan", sim_type="Dice"):
    """
    Task:
        Compare structural similarity of two compounds based on fingerprints.

    Parameters:
        smiles1: str, smiles of the compound 1
        smiles2: str, smiles of the compound 2
        fp_type: str, type of fingerprints; one of "Morgan",
            "MorganWithFeature", "MACCS", "Topological", "AtomPairs"
        sim_type: str, method for calculating similarity; one of "Dice",
            "Tanimoto", "Cosine", "Sokal", "Russel"

    Returns:
        The similarity value, or -1 when it cannot be computed: unknown
        ``fp_type`` or ``sim_type``, or any error raised while building or
        comparing the fingerprints.
    """
    # Builders take an already parsed molecule; names are resolved lazily.
    fingerprinters = {
        "Morgan": lambda mol: AllChem.GetMorganFingerprint(
            mol, 2, useFeatures=False),
        "MorganWithFeature": lambda mol: AllChem.GetMorganFingerprint(
            mol, 2, useFeatures=True),
        "MACCS": lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol),
        "Topological": lambda mol: FingerprintMols.FingerprintMol(mol),
        "AtomPairs": lambda mol: Pairs.GetAtomPairFingerprint(mol),
    }
    metric_names = {
        "Dice": "DiceSimilarity",
        "Tanimoto": "TanimotoSimilarity",
        "Cosine": "CosineSimilarity",
        "Sokal": "SokalSimilarity",
        "Russel": "RusselSimilarity",
    }
    # An unknown sim_type used to leave the result unbound and raise at the
    # return statement; both unknown names now yield the -1 sentinel.
    if fp_type not in fingerprinters or sim_type not in metric_names:
        return -1

    getfp = fingerprinters[fp_type]
    try:
        fp1 = getfp(Chem.MolFromSmiles(smiles1))
        fp2 = getfp(Chem.MolFromSmiles(smiles2))
        return getattr(DataStructs, metric_names[sim_type])(fp1, fp2)
    except Exception:
        # Deliberately broad: any failure here is reported to callers as -1
        # rather than propagated.
        return -1
def build_deltaFP(reactions):
    """Build a table of perturbation fingerprints from reaction SMILES.

    For each reaction the two members are parsed without sanitization, a
    256-entry hashed atom-pair fingerprint is generated for each, and the
    element-wise difference (second minus first) is stored together with
    their Dice similarity.

    :param reactions: iterable of lists whose element 0 is the perturbation
        name and element 1 a ``member1>>member2`` reaction SMILES; the whole
        list is copied into the output row.
    :return: list of rows. Row 0 is the header; every further row is the
        reaction list, followed by the similarity as a string, followed by
        the 256 difference values.
    """
    print("Building FPs and writing to CSV..")

    header = [
        "Perturbation",
        "Reaction_SMILES",
        "fullmember1",
        "fullmember2",
        "Member_Similarity (Dice)",
    ]
    header.extend("pfp" + str(position) for position in range(256))
    table = [header]

    for reaction_members in reactions:
        # Only needed by debugging output; kept so malformed rows fail here.
        pert = str(reaction_members[0])

        # Deconstruct reaction smiles back into members.
        reactant_smiles, _, product_smiles = reaction_members[1].partition(">>")

        # Take a mol object from each member, retaining hydrogens and
        # overriding valency discrepancies.
        member1 = Chem.MolFromSmiles(reactant_smiles, sanitize=False)
        member2 = Chem.MolFromSmiles(product_smiles, sanitize=False)
        for member in (member1, member2):
            member.UpdatePropertyCache(strict=False)

        # One 256-entry hashed atom-pair fingerprint per member.
        FP1 = rdMolDescriptors.GetHashedAtomPairFingerprint(member1, 256)
        FP2 = rdMolDescriptors.GetHashedAtomPairFingerprint(member2, 256)
        similarity = DataStructs.DiceSimilarity(FP1, FP2)

        # Subtract to obtain the reaction FP (= deltaFP).
        deltaFP = np.array(list(FP2)) - np.array(list(FP1))

        # Join all the data together into one row of the output.
        row = reaction_members + [str(similarity)] + deltaFP.tolist()
        table.append(row)

    return table
def get_similarity(mols, compounds, fps_morgan):
    """
    Calculate the pairwise molecular similarity

    Args:
        mols: list of mol files for the compounds (only its length is used)
        compounds: list of compound unique ids
        fps_morgan: list of fingerprints for the compounds

    Returns:
        A single string with one ``source,target,similarity`` line per
        unordered pair of compounds, each terminated by a newline; empty when
        there are fewer than two compounds.
    """
    n_mols = len(mols)
    lines = []
    for i in range(n_mols):
        ref_fp = fps_morgan[i]
        for j in range(i + 1, n_mols):
            morgan2_sim = DataStructs.DiceSimilarity(ref_fp, fps_morgan[j])
            # Both ids are stringified and stripped alike, so a trailing
            # newline on the source id cannot split the line in two.
            lines.append('%s,%s,%s\n' % (str(compounds[i]).rstrip(),
                                         str(compounds[j]).rstrip(),
                                         morgan2_sim))
    # Joined once at the end instead of growing a string pair by pair.
    return ''.join(lines)
def orng_sim_rdk_atompair_fps(smile_active, train_instance):
    """Dice similarity between a SMILES and an Orange instance (atom pairs).

    :param smile_active: SMILES string of the query compound.
    :param train_instance: Orange data instance carrying a SMILES attribute.
    :return: the Dice similarity of the atom-pair fingerprints, or ``None``
        when the instance has no SMILES attribute or either molecule cannot
        be built.
    """
    smilesName = getSMILESAttr(train_instance)
    if not smilesName:
        return None

    train_smiles = str(train_instance[smilesName].value)
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    # Both molecules are built before either one is checked.
    if not (active_mol and train_mol):
        return None

    active_fp = Pairs.GetAtomPairFingerprint(active_mol)
    train_fp = Pairs.GetAtomPairFingerprint(train_mol)
    return DataStructs.DiceSimilarity(active_fp, train_fp)
def orng_sim_rdk_morgan_features_fps(smile_active, train_instance):
    """Dice similarity between a SMILES and an Orange instance (feature Morgan).

    Uses circular fingerprints of radius 2 with feature-based invariants
    (FCFP-like).

    :param smile_active: SMILES string of the query compound.
    :param train_instance: Orange data instance carrying a SMILES attribute.
    :return: the Dice similarity of the two fingerprints, or ``None`` when
        the instance has no SMILES attribute or either molecule cannot be
        built.
    """
    smilesName = getSMILESAttr(train_instance)
    if not smilesName:
        return None

    train_smiles = str(train_instance[smilesName].value)
    active_mol = getMolFromSmiles(smile_active)
    train_mol = getMolFromSmiles(train_smiles)
    # Both molecules are built before either one is checked.
    if not (active_mol and train_mol):
        return None

    active_fp = rdk.AllChem.GetMorganFingerprint(active_mol, 2, useFeatures=True)
    train_fp = rdk.AllChem.GetMorganFingerprint(train_mol, 2, useFeatures=True)
    return DataStructs.DiceSimilarity(active_fp, train_fp)