def similarity(a, b):  # Tanimoto similarity
    """Return the Tanimoto similarity of two molecules given as SMILES.

    Both molecules are encoded as 2048-bit Morgan fingerprints (radius 2,
    chirality ignored). As a built-in self-check the value is also computed
    with ``jaccard_score`` on the raw bit arrays and both results are
    printed; an ``AssertionError`` is raised if they differ by more than
    1e-10.

    Args:
        a: SMILES string of the first molecule, or ``None``.
        b: SMILES string of the second molecule, or ``None``.

    Returns:
        The Tanimoto similarity, or ``0.0`` when either input is ``None``
        or cannot be parsed by RDKit.
    """
    if a is None or b is None:
        return 0.0
    amol = Chem.MolFromSmiles(a)
    bmol = Chem.MolFromSmiles(b)
    if amol is None or bmol is None:
        return 0.0
    fp1 = AllChem.GetMorganFingerprintAsBitVect(amol,
                                                2,
                                                nBits=2048,
                                                useChirality=False)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol,
                                                2,
                                                nBits=2048,
                                                useChirality=False)

    print(
        '-----------test1: confirm jaccard_score is performing same way as the DataStructs.TanimotoSimilarity-----------'
    )
    # Use dedicated names so the SMILES parameters are not overwritten.
    bits1, bits2 = np.array(fp1), np.array(fp2)
    j = jaccard_score(bits1, bits2)
    print('jaccard score is {}'.format(j))
    t = DataStructs.TanimotoSimilarity(fp1, fp2)
    print('TanimotoSimilarity is {}'.format(t))
    assert abs(
        j - t) <= 1e-10, 'j and t should be similar, j is {}, t is {}'.format(
            j, t)

    # The similarity was already computed for the check above; reuse it.
    return t
Пример #2
0
 def __call__(self, smile):
     """Score a SMILES string with the classifier held in ``self.clf``.

     The molecule is fingerprinted, the classifier prediction is zeroed
     when the hydrogen-expanded molecule does not match ``self.smarts``
     or when the fingerprint is (near-)identical to one in
     ``self.train_fps``, and the result is finally mapped to ``[.., 1]``
     by capping at 1 above 7.5 and dividing by 7.5 otherwise.

     Args:
         smile: SMILES string to score.

     Returns:
         The score as a ``float``; ``0.0`` when the SMILES cannot be
         parsed or any step raises.
     """
     mol = Chem.MolFromSmiles(smile)
     if mol:
         try:
             fp = AllChem.GetMorganFingerprintAsBitVect(mol, 4)
             fp_4 = np.array(fp).reshape(1, -1)
             score = self.clf.predict(fp_4)[0]
             # Multiplying by the boolean match result zeroes the score
             # for molecules lacking the required substructure.
             score *= Chem.AddHs(mol).HasSubstructMatch(
                 Chem.MolFromSmarts(self.smarts))
             # NOTE: max() on an empty fingerprint list raises ValueError,
             # which the handler below turns into a 0.0 score.
             if max([
                     DataStructs.TanimotoSimilarity(query_fp, fp)
                     for query_fp in self.test_fps
             ]) > 0.99:
                 print("Found original molecule: " + smile)
             if max([
                     DataStructs.TanimotoSimilarity(query_fp, fp)
                     for query_fp in self.train_fps
             ]) > 0.99:
                 score = 0
             if score > 7.5:
                 score = 1
             else:
                 score *= 1 / 7.5
             return float(score)
         except Exception:
             # Best-effort scoring: any failure yields a neutral score, but
             # KeyboardInterrupt/SystemExit are no longer swallowed.
             return 0.0
     return 0.0
Пример #3
0
  def _bulkTest(self,bvs):
    """Check that bulk similarity functions agree with the single-pair ones.

    For each of the Tanimoto, Dice, AllBit, OnBit and RogotGoldberg metrics
    the bulk result for ``bvs[0]`` against all of ``bvs`` is compared with
    the pairwise result (also with the second vector passed in its binary
    form), both as similarity and as distance. Tversky with weights (1, 1)
    is additionally checked against Tanimoto.

    Args:
        bvs: indexable collection of bit vectors; ``bvs[0]`` is the query.
    """
    for metric in 'Tanimoto','Dice','AllBit','OnBit','RogotGoldberg':
      bulk = getattr(DataStructs,f'Bulk{metric}Similarity')
      single = getattr(DataStructs,f'{metric}Similarity')
      # These checks must run once per metric; outside the loop only the
      # last metric would be exercised.
      sims = bulk(bvs[0],bvs)
      for i in range(len(bvs)):
        sim = single(bvs[0],bvs[i])
        self.assertEqual(sim,sims[i])
        self.assertEqual(sim, single(bvs[0],bvs[i].ToBinary()))
      dists = bulk(bvs[0], bvs, returnDistance=True)
      for i in range(len(bvs)):
        dist = single(bvs[0], bvs[i], returnDistance=True)
        self.assertEqual(dist, dists[i])
        self.assertEqual(dist, single(bvs[0], bvs[i].ToBinary(), returnDistance=True))

    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i in range(len(bvs)):
      sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1)
      self.assertEqual(sim, sims[i])
      sim = DataStructs.TanimotoSimilarity(bvs[0], bvs[i])
      self.assertEqual(sim, sims[i])

    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1, returnDistance=True)
    for i in range(len(bvs)):
      sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1, returnDistance=True)
      self.assertEqual(sim, sims[i])
      sim = DataStructs.TanimotoSimilarity(bvs[0], bvs[i], returnDistance=True)
      self.assertEqual(sim, sims[i])
Пример #4
0
  def testBitVectorLeader1(self):
    """Leader picking on bit vectors honours the distance threshold.

    Fingerprints are read from the ``chembl_cyps.head.fps`` test file, then
    picked with thresholds 0.8 and 0.9 and with an explicit ``pickSize``.
    Every pair of picked fingerprints must be at least ``thresh`` apart in
    Tanimoto distance, and the number of picks must match the expected count.
    """
    fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                         'chembl_cyps.head.fps')
    with open(fname) as infil:
      fps = [DataStructs.CreateFromFPSText(line.strip()) for line in infil]
    picker = rdSimDivPickers.LeaderPicker()

    def check_spread(picked, min_dist):
      # Every pick is compared with all picks made before it.
      for later in range(len(picked)):
        for earlier in range(later):
          sim = DataStructs.TanimotoSimilarity(fps[picked[later]], fps[picked[earlier]])
          self.assertGreaterEqual(1 - sim, min_dist)

    # threshold tests
    thresh = 0.8
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 146)
    check_spread(ids, thresh)

    thresh = 0.9
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 14)
    check_spread(ids, thresh)

    # same threshold, but capped number of picks
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
    self.assertEqual(len(ids), 10)
    check_spread(ids, thresh)
Пример #5
0
    def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
        """Search the latent space for a better-scoring, still-similar molecule.

        The molecule is encoded, its latent vector is moved ``num_iter``
        times along the gradient of ``self.propNN``, and a binary search
        over the visited vectors looks for the furthest step whose decoded
        molecule keeps a Morgan/Tanimoto similarity of at least
        ``sim_cutoff`` to the input.

        Args:
            smiles: SMILES string of the starting molecule.
            sim_cutoff: minimum Tanimoto similarity the result must keep.
            lr: step size applied to the gradient at each iteration.
            num_iter: number of gradient steps to take.

        Returns:
            ``(new_smiles, similarity)`` when a decodable molecule meeting
            the cutoff is found, otherwise ``(smiles, 1.0)``.
        """
        mol_tree = MolTree(smiles)
        mol_tree.recover()
        _, tree_vec, mol_vec = self.encode([mol_tree])

        mol = Chem.MolFromSmiles(smiles)
        fp1 = AllChem.GetMorganFingerprint(mol, 2)

        # Start from the concatenated outputs of the two mean layers; the
        # variance heads were computed before but never used, so they are
        # no longer evaluated.
        tree_mean = self.T_mean(tree_vec)
        mol_mean = self.G_mean(mol_vec)
        mean = torch.cat([tree_mean, mol_mean], dim=1)
        # presumably the second argument enables gradients -- see create_var
        cur_vec = create_var(mean.data, True)

        visited = []
        # `range` instead of the Python-2-only `xrange`.
        for _ in range(num_iter):
            prop_val = self.propNN(cur_vec).squeeze()
            grad = torch.autograd.grad(prop_val, cur_vec)[0]
            cur_vec = cur_vec.data + lr * grad.data
            cur_vec = create_var(cur_vec, True)
            visited.append(cur_vec)

        l, r = 0, num_iter - 1
        while l < r - 1:
            # Floor division keeps `mid` an int so it can index `visited`
            # on Python 3 as well.
            mid = (l + r) // 2
            new_vec = visited[mid]
            tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
            new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
            if new_smiles is None:
                r = mid - 1
                continue

            new_mol = Chem.MolFromSmiles(new_smiles)
            fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
            sim = DataStructs.TanimotoSimilarity(fp1, fp2)
            if sim < sim_cutoff:
                r = mid - 1
            else:
                l = mid

        tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
        new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
        if new_smiles is None:
            return smiles, 1.0
        new_mol = Chem.MolFromSmiles(new_smiles)
        fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        if sim >= sim_cutoff:
            return new_smiles, sim
        else:
            return smiles, 1.0
Пример #6
0
def compareAll(fpA_dict, fpB_dict=None, cutoff=None):
    """Build a table of pairwise Tanimoto similarities between compounds.

    With only ``fpA_dict`` every compound is compared with every other
    compound of the same set and each pair is reported in both directions.
    With ``fpB_dict`` every compound of A is compared with every compound
    of B.

    Args:
        fpA_dict: mapping ``name -> [fingerprint, smiles]``.
        fpB_dict: optional second mapping of the same shape.
        cutoff: when given, pairs with a similarity below it are left out.

    Returns:
        A ``pandas.DataFrame`` with the columns ``cmpd1``, ``cmpd2``,
        ``smiles1``, ``smiles2`` and ``similarity``. It is empty (but has
        these columns) when no pair qualifies.
    """
    #################################
    ### Get compound similarities ###
    #################################

    namesA = list(fpA_dict.keys())
    # Create every neighbour dict up front: the single-set branch writes to
    # simD[name2] before name2 has had its own turn in the outer loop.
    simD = {name: {} for name in namesA}

    # Work with only one input file
    if fpB_dict is None:
        for i, name1 in enumerate(namesA):
            fp1, smiles1 = fpA_dict[name1]
            for name2 in namesA[i + 1:]:
                fp2, smiles2 = fpA_dict[name2]
                sim = DataStructs.TanimotoSimilarity(fp1, fp2)
                if cutoff is not None and sim < cutoff:
                    continue
                simD[name1][name2] = [smiles1, smiles2, sim]
                simD[name2][name1] = [smiles2, smiles1, sim]

    # Work with two input files
    else:
        for nameA in namesA:
            fpA, smilesA = fpA_dict[nameA]
            for nameB, (fpB, smilesB) in fpB_dict.items():
                sim = DataStructs.TanimotoSimilarity(fpA, fpB)
                if cutoff is not None and sim < cutoff:
                    continue
                simD[nameA][nameB] = [smilesA, smilesB, sim]

    # Compounds with no neighbour over the cutoff have an empty dict and
    # therefore contribute no rows.
    records = [[name, other] + pair
               for name, neighbours in simD.items()
               for other, pair in neighbours.items()]

    # Passing the column names to the constructor also works for an empty
    # record list, where assigning `df.columns` afterwards would fail.
    return pd.DataFrame(
        records,
        columns=['cmpd1', 'cmpd2', 'smiles1', 'smiles2', 'similarity'])
Пример #7
0
    def test10BulkOps2(self):
        """Bulk similarity functions agree with pairwise ones on a tuple.

        Ten random 10000-bit vectors are generated and passed as a tuple.
        For Tanimoto, Dice, AllBit, OnBit and RogotGoldberg the bulk result
        against ``bvs[0]`` must match the pairwise value; Tversky(1, 1) must
        also match Tanimoto and Tversky(.5, .5) must also match Dice.
        """
        nbits = 10000
        bvs = []
        for _ in range(10):
            bv = DataStructs.ExplicitBitVect(nbits)
            for _ in range(nbits):
                bv.SetBit(random.randrange(0, nbits))
            bvs.append(bv)
        bvs = tuple(bvs)

        # assertTrue replaces the failUnless alias removed in Python 3.12.
        for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
            bulk = getattr(DataStructs, 'Bulk%sSimilarity' % metric)
            single = getattr(DataStructs, '%sSimilarity' % metric)
            sims = bulk(bvs[0], bvs)
            for i, bv in enumerate(bvs):
                self.assertTrue(feq(single(bvs[0], bv), sims[i]))

        sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
        for i, bv in enumerate(bvs):
            sim = DataStructs.TverskySimilarity(bvs[0], bv, 1, 1)
            self.assertTrue(feq(sim, sims[i]))
            sim = DataStructs.TanimotoSimilarity(bvs[0], bv)
            self.assertTrue(feq(sim, sims[i]))

        sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
        for i, bv in enumerate(bvs):
            sim = DataStructs.TverskySimilarity(bvs[0], bv, .5, .5)
            self.assertTrue(feq(sim, sims[i]))
            sim = DataStructs.DiceSimilarity(bvs[0], bv)
            self.assertTrue(feq(sim, sims[i]))
Пример #8
0
    def run(self):
        """Rank the database by Tanimoto similarity to the selected molecule.

        The SMILES stored under ``self.select_mol`` in ``self.dff`` is
        fingerprinted and compared with every entry of
        ``self.db['SMILES']``. The rounded similarities are written to
        ``self.db['similar']`` (``0`` where no fingerprint could be built)
        and the 20 best rows are emitted through ``self.res_signal`` as
        tab-separated lines.
        """
        smile = self.dff[self.select_mol]
        fp = self.get_fingerprint(smile)
        # Fingerprint every database entry once; entries without a
        # fingerprint stay None and get a similarity of 0 below.
        fps = [self.get_fingerprint(db_smile) for db_smile in self.db['SMILES']]
        self.db['similar'] = [
            0 if db_fp is None else round(
                DataStructs.TanimotoSimilarity(fp, db_fp), 2)
            for db_fp in fps
        ]
        top20 = self.db[['db', 'plate', 'Col', 'Row', 'similar',
                         'MOLENAME']].sort_values('similar',
                                                  ascending=False).head(20)
        res = '\n'.join([
            '\t'.join([str(i) for i in line])
            for index, line in top20.iterrows()
        ])
        self.res_signal.emit(res)
def similarity(a, b, chiral=True):
    """Return the Tanimoto similarity of two molecules given as SMILES.

    Both molecules are encoded as 2048-bit Morgan fingerprints of radius 2.

    Args:
        a: SMILES string of the first molecule, or ``None``.
        b: SMILES string of the second molecule, or ``None``.
        chiral: passed to RDKit as ``useChirality``.

    Returns:
        The Tanimoto similarity, or ``0.0`` when either input is ``None``
        or cannot be parsed by RDKit.
    """
    if a is None or b is None:
        return 0.0
    amol = Chem.MolFromSmiles(a)
    bmol = Chem.MolFromSmiles(b)
    # MolFromSmiles returns None for invalid SMILES; fingerprinting None
    # would raise, so treat it like a missing input.
    if amol is None or bmol is None:
        return 0.0
    fp1 = AllChem.GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=chiral)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=chiral)
    return DataStructs.TanimotoSimilarity(fp1, fp2)
Пример #10
0
 def sim_filter(self, query, cutoff=0.75):
     """Keep the records whose stored fingerprint is similar to ``query``.

     Args:
         query: a SMILES string, or an object that is deep-copied and used
             as the query molecule directly.
         cutoff: minimum Tanimoto similarity for a record to be kept.

     Returns:
         The object produced by ``self.new()`` with its ``data`` set to a
         DataFrame of the matching records, each carrying its similarity
         in a ``"Sim"`` entry.

     Raises:
         KeyError: no fingerprint name is set or the fingerprint column is
             missing from ``self.data``.
         ValueError: the query molecule is falsy after conversion.
     """
     if len(self.fp_name) == 0 or self.fp_col not in self.data.keys():
         raise KeyError("No fingerprints found. Please generate them first with add_fp().")
     n_records = len(self.data)
     show_prog = IPYTHON and n_records > 5000
     if show_prog:
         ctr = nbt.ProgCtr()
         pb = nbt.Progressbar()
     query_mol = Chem.MolFromSmiles(query) if isinstance(query, str) else deepcopy(query)
     if not query_mol:
         raise ValueError("Could not generate query mol.")
     query_fp = FPDICT[self.fp_name](query_mol)
     hits = []
     for _, rec in self.data.iterrows():
         if show_prog:
             ctr.inc()
             pb.update(100 * ctr() / n_records)
         # NOTE(review): pickle.loads is only safe if the fingerprint
         # column comes from a trusted source -- confirm.
         stored_fp = pickle.loads(b64.b64decode(rec[self.fp_col]))
         sim = DataStructs.TanimotoSimilarity(query_fp, stored_fp)
         if sim >= cutoff:
             rec["Sim"] = sim
             hits.append(rec)
     result = self.new()
     result.data = pd.DataFrame(hits)
     print_log(result.data, "sim_filter")
     if show_prog:
         pb.done()
     return result
Пример #11
0
def create_tanimoto_column(smiles_A, smiles_B):
    """Compute row-wise Tanimoto similarities between two SMILES columns.

    Rows where either entry is the literal string ``'nan'`` or a missing
    value are dropped first; the remaining pairs are compared using Morgan
    fingerprints of radius 2.

    Args:
        smiles_A: sequence of SMILES strings.
        smiles_B: sequence of SMILES strings, aligned with ``smiles_A``.

    Returns:
        A 1-D ``numpy`` array with one similarity per retained row.
    """
    df_smiles = pd.DataFrame({'A': smiles_A, 'B': smiles_B})
    is_real_text = (df_smiles['A'] != 'nan') & (df_smiles['B'] != 'nan')
    # Filter and drop missing values in one non-mutating chain instead of
    # an in-place dropna on a slice of the frame.
    df_smiles = df_smiles.loc[is_real_text].dropna()

    def fingerprints(column):
        # NOTE: an unparsable SMILES yields None from MolFromSmiles and
        # makes the fingerprint call raise, as before.
        return [
            AllChem.GetMorganFingerprint(Chem.MolFromSmiles(smi), 2)
            for smi in column
        ]

    smiles_A_fps = fingerprints(df_smiles.A)
    smiles_B_fps = fingerprints(df_smiles.B)

    return np.array([
        DataStructs.TanimotoSimilarity(fp1, fp2)
        for fp1, fp2 in zip(smiles_A_fps, smiles_B_fps)
    ])
Пример #12
0
    def similar_smiles(self, peptide_to_match):
        """
        Compare this peptide with another one through their SMILES.

        The SMILES of the other peptide is assembled residue by residue:
        before each residue is appended, the last character of the string
        built so far is removed.

        Arguments:
        peptide_to_match -- peptide sequence that will be compared

        Result:
        Nothing is returned; the Tanimoto similarity of the two 2048-bit
        Morgan fingerprints (radius 2) is stored in self.smiles_similarity
        """

        # Molecule of this peptide
        own_mol = Chem.MolFromSmiles(self.smiles)
        own_mol.SetProp("_Name", self.sequence)

        # Molecule of the peptide to compare against
        chain = 'O'
        for residue in peptide_to_match:
            chain = chain[:-1] + aminoacidSMILES(residue)
        other_mol = Chem.MolFromSmiles(chain)
        other_mol.SetProp("_Name", peptide_to_match)

        # Fingerprints and similarity
        own_fp, other_fp = [
            AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
            for m in (own_mol, other_mol)
        ]
        self.smiles_similarity = DataStructs.TanimotoSimilarity(own_fp, other_fp)
Пример #13
0
def doSimilarityWeightedAdAnalysis(model_name):
    """Find input molecules inside the applicability domain of a model.

    Each molecule in the global ``rdkit_mols`` is compared with the
    training instances returned by ``getAdData(model_name)``. Its
    similarity to an instance, divided by the product of that instance's
    entries 2 and 3, is the weight; the molecule is accepted as soon as one
    weight reaches the ``ad_settings`` percentile of column 5 of the
    training data. When ``options.known`` is set, the first training
    instance with similarity exactly 1.0 is also recorded.

    Args:
        model_name: passed to ``getAdData``.

    Returns:
        ``(ad_idx, known)``: the list of accepted molecule indices and an
        array of ``[molecule index, training_instance[1]]`` rows.
    """
    global rdkit_mols, ad_settings
    ad_idx, known = [], []
    ad_data = getAdData(model_name)
    required_threshold = np.percentile(ad_data[:, 5], ad_settings)
    for mol_idx, m in enumerate(rdkit_mols):
        ad_pending = True
        # only look for known compounds if requested in the options
        known_pending = True if options.known else False
        for training_instance in ad_data:
            sim = DataStructs.TanimotoSimilarity(m, training_instance[0])
            # identical to a training compound and still looking for one
            if sim == 1.0 and known_pending:
                known.append([mol_idx, training_instance[1]])
                known_pending = False
            weight = sim / (training_instance[2] * training_instance[3])
            # first instance that places the compound inside the domain
            if weight >= required_threshold and ad_pending:
                ad_idx.append(mol_idx)
                ad_pending = False
            # nothing left to look for: skip the remaining instances
            if not (known_pending or ad_pending):
                break
    return ad_idx, np.array(known)
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score molecules by topological-torsion similarity to the actives.

     Every line of ``fragments_file`` is a JSON object with ``"smiles"``
     and ``"name"``. For each one the highest Tanimoto similarity to any
     active molecule of the model is written to ``output_file`` as
     ``{"name": ..., "score": ...}``, one object per line with no trailing
     newline.

     Args:
         model_configuration: model dictionary; the actives are read from
             ``["data"]["active"]`` and the fingerprint size argument from
             ``["configuration"]["fragments"][0]["size"]``.
         fragments_file: path of the JSON-lines input file.
         descriptors_file: not used by this method.
         output_file: path of the file to write.
     """
     inputoutput_utils.create_parent_directory(output_file)
     model_data = model_configuration["data"]
     radius = int(
         model_configuration["configuration"]["fragments"][0]["size"])
     active_molecules_tt = [
         Torsions.GetTopologicalTorsionFingerprintAsIntVect(
             Chem.MolFromSmiles(active.strip("\"")), radius)
         for active in model_data["active"]
     ]
     with open(output_file, "w", encoding="utf-8") as output_stream, \
             open(fragments_file, "r", encoding="utf-8") as input_stream:
         for position, new_line in enumerate(input_stream):
             line = json.loads(new_line)
             test_molecule = Chem.MolFromSmiles(line["smiles"].strip("\""))
             test_mol_fingerprint = Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                 test_molecule, radius)
             max_sim = max([
                 DataStructs.TanimotoSimilarity(test_mol_fingerprint,
                                                fingerprint)
                 for fingerprint in active_molecules_tt
             ])
             score = {"name": line["name"], "score": max_sim}
             # separate records with a newline, but never end with one
             if position > 0:
                 output_stream.write("\n")
             json.dump(score, output_stream)
    def pair_similiar(self, valid_smiles):
        """Summarise the pairwise similarity within a set of SMILES.

        All unordered pairs are compared using 1024-bit chiral Morgan
        fingerprints of radius 2.

        Args:
            valid_smiles: sequence of SMILES strings.

        Returns:
            ``(0, 0)`` when two or fewer SMILES are given; otherwise a pair
            of strings: the fraction of pairs with similarity above 0.75
            and the mean pairwise similarity.
        """
        if len(valid_smiles) <= 2:
            return 0, 0

        mols = [Chem.MolFromSmiles(smi) for smi in valid_smiles]
        fingerprints = [
            AllChem.GetMorganFingerprintAsBitVect(mol,
                                                  2,
                                                  nBits=1024,
                                                  useChirality=True)
            for mol in mols
        ]
        similarities = [
            DataStructs.TanimotoSimilarity(fp_a, fp_b)
            for pos, fp_a in enumerate(fingerprints)
            for fp_b in fingerprints[pos + 1:]
        ]

        n_pairs = len(similarities)
        as_array = np.array(similarities)
        high_rate = as_array[as_array > 0.75].shape[0] / n_pairs
        mean_similarity = sum(similarities) / n_pairs
        return str(high_rate), str(mean_similarity)
Пример #16
0
 def getSimilarity(self, reference, method='tanimoto', alpha=None, beta=None):
     """Compare ``reference.IFPvector`` with this object's ``IFPvector``.

     Args:
         reference: object exposing an ``IFPvector`` attribute.
         method: ``'tanimoto'``, ``'dice'`` or ``'tversky'``.
         alpha: first Tversky weight (used by ``'tversky'`` only).
         beta: second Tversky weight (used by ``'tversky'`` only).

     Returns:
         The similarity value, or ``None`` for any other ``method``.
     """
     if method == 'tanimoto':
         return DataStructs.TanimotoSimilarity(reference.IFPvector, self.IFPvector)
     if method == 'dice':
         return DataStructs.DiceSimilarity(reference.IFPvector, self.IFPvector)
     if method == 'tversky':
         return DataStructs.TverskySimilarity(reference.IFPvector, self.IFPvector, alpha, beta)
     return None
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Score molecules by hashed atom-pair similarity to the actives.

        Every line of ``fragments_file`` is a JSON object with ``"smiles"``
        and ``"name"``. For each one the highest Tanimoto similarity to any
        active molecule of the model is written to ``output_file`` as
        ``{"name": ..., "score": ...}``, one object per line with no
        trailing newline.

        Args:
            model_configuration: model dictionary; the actives are read
                from ``["data"]["active"]`` and the fingerprint length from
                ``["configuration"]["nbits"]``.
            fragments_file: path of the JSON-lines input file.
            descriptors_file: not used by this method.
            output_file: path of the file to write.
        """
        inputoutput_utils.create_parent_directory(output_file)
        model_data = model_configuration["data"]
        nbits = model_configuration["configuration"]["nbits"]
        active_molecules_ap = [
            Pairs.GetHashedAtomPairFingerprint(
                Chem.MolFromSmiles(active.strip("\"")), nBits=nbits)
            for active in model_data["active"]
        ]

        with open(output_file, "w", encoding="utf-8") as output_stream, \
                open(fragments_file, "r", encoding="utf-8") as input_stream:
            for position, new_line in enumerate(input_stream):
                line = json.loads(new_line)
                test_molecule = Chem.MolFromSmiles(line["smiles"].strip("\""))
                test_mol_fingerprint = Pairs.GetHashedAtomPairFingerprint(
                    test_molecule, nBits=nbits)
                max_sim = max([
                    DataStructs.TanimotoSimilarity(test_mol_fingerprint,
                                                   fingerprint)
                    for fingerprint in active_molecules_ap
                ])
                score = {"name": line["name"], "score": max_sim}
                # separate records with a newline, but never end with one
                if position > 0:
                    output_stream.write("\n")
                json.dump(score, output_stream)
Пример #18
0
def _print_progress(i):
    """Print a progress tick: '.' per 100k rows, 'm' per 1M, 'M' per 10M."""
    if i % 100000 == 0 and i > 0:
        print('.', end='', flush=True)
    if i % 1000000 == 0 and i > 0:
        print('M' if i % 10000000 == 0 else 'm', end='', flush=True)


def search_files(files, fingerprints):
    """Rank the molecules of pickled tables against target fingerprints.

    For every file the base64-encoded fingerprints are decoded once, then
    each target fingerprint is compared with every row. The 200 best
    matches per target are appended to ``score.csv`` and the sorted score
    curves of all targets are saved as one ``fig<random>.pdf`` per file.

    Args:
        files: paths of pickled tables with ``'fingerprint'`` and
            ``'canonical_smile'`` columns.
        fingerprints: iterable of ``(smiles, fingerprint)`` targets.
    """
    # newline='' is what the csv module asks for on files it writes to.
    with open('score.csv', 'w', newline='') as csvfile:
        score_writer = csv.writer(csvfile, delimiter=' ')
        for file in files:
            print('Processing file %s at %s'%(file, str(datetime.datetime.now())))
            # NOTE(review): unpickling runs arbitrary code; only use on
            # files from a trusted source.
            with open(file, 'rb') as pickled:
                smiles = pickle.load(pickled)

            # Precompute bitvectors
            fps = []
            for i, row in smiles.iterrows():
                try:
                    fps.append(DataStructs.ExplicitBitVect(base64.b64decode(row['fingerprint'])))
                except Exception:
                    # Undecodable fingerprints are kept as None so the
                    # column stays aligned with the rows.
                    fps.append(None)
                    print('None')
                _print_progress(i)

            smiles['fp'] = fps
            print('\n  Precomputed at %s '%(str(datetime.datetime.now())), end='')

            plt.rcParams["figure.figsize"] = (12,10)
            plt.figure()
            plt.title(file, fontsize=12)
            # For each of our target SMILE strings
            for (insmile, fp2) in fingerprints:
                print('\n  %s '%insmile, sep='')
                scores = []
                # For each row in the file we are comparing against
                for i, row in smiles.iterrows():
                    try:
                        score = DataStructs.TanimotoSimilarity(row['fp'], fp2)
                        scores.append((row['canonical_smile'], score))
                    except Exception:
                        # Deliberately best-effort: rows that cannot be
                        # scored (e.g. fingerprint is None) are skipped.
                        pass
                    _print_progress(i)
                print(' %s'%(str(datetime.datetime.now())))
                # Add line to graph
                sorted_scores = sorted(scores, key=itemgetter(1))
                scores_only = [x[1] for x in sorted_scores]
                plt.step(np.arange(len(sorted_scores)), np.array(scores_only), label=insmile, linewidth=0.5)

                # Select top 200, best first
                lastN = sorted_scores[-200:]
                lastN.reverse()
                for (smile, score) in lastN:
                    score_writer.writerow([file, '%.6f'%score, insmile, smile])
            plt.legend(fontsize=6)
            plt.savefig('fig%d.pdf'%(random.randint(1,100000)))
            # Release the figure; otherwise one stays open per input file.
            plt.close()
Пример #19
0
 def computeSimilarityFP(self, c_chem, typeFP, typeMetric):
     """Compare one fingerprint of this chemical with that of ``c_chem``.

     Args:
         c_chem: other chemical; must expose the same ``d_FP`` mapping.
         typeFP: key of the fingerprint in ``d_FP``.
         typeMetric: one of 'Tanimoto', 'Dice', 'Cosine', 'Sokal',
             'Russel', 'RogotGoldberg', 'AllBit', 'Kulczynski',
             'McConnaughey', 'Asymmetric' or 'BraunBlanquet'.

     Returns:
         The similarity value; ``"NA"`` when the computation fails (the
         failure is also printed and appended to ``self.log``); ``None``
         when ``typeMetric`` is not one of the names above.
     """
     supported_metrics = ('Tanimoto', "Dice", "Cosine", "Sokal", "Russel",
                          "RogotGoldberg", "AllBit", "Kulczynski",
                          "McConnaughey", "Asymmetric", "BraunBlanquet")
     if typeMetric not in supported_metrics:
         return None

     try:
         # All supported metrics follow the <Name>Similarity naming scheme.
         metric = getattr(DataStructs, typeMetric + "Similarity")
         return metric(self.d_FP[typeFP], c_chem.d_FP[typeFP])
     except Exception:
         print("Combination %s and %s not supported"%(typeFP, typeMetric))
         self.log = "%sCombination %s and %s not supported\n"%(self.log, typeFP, typeMetric)
         return "NA"
Пример #20
0
    def get_similarity(self, smiles):
        """Return the Tanimoto similarity between ``smiles`` and the target.

        Args:
            smiles: SMILES string of the molecule to compare.

        Returns:
            The similarity to ``self._target_mol_fingerprint``, or ``0.0``
            when the SMILES cannot be parsed.
        """
        candidate = Chem.MolFromSmiles(smiles)
        if candidate is not None:
            candidate_fp = self.get_fingerprint(candidate)
            return DataStructs.TanimotoSimilarity(self._target_mol_fingerprint, candidate_fp)
        return 0.0
Пример #21
0
def diversity(smiles_list):
    """
    Function that takes as input a list containing SMILES strings to compute
    its internal diversity
    Parameters
    ----------
    smiles_list: List with valid SMILES strings
    Returns
    -------
    This function returns the internal diversity of the list given as input,
    i.e. the mean Tanimoto distance (1 - similarity) over all ordered pairs
    of Morgan fingerprints (radius 6), self-pairs included. Entries that
    cannot be fingerprinted are reported and skipped; 0.0 is returned when
    no fingerprint could be computed at all.
    """
    fps_A = []
    for smiles in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smiles)
            fps_A.append(AllChem.GetMorganFingerprint(mol, 6))
        except Exception:
            print('ERROR: Invalid SMILES!')

    # Nothing to compare: avoid dividing by zero below.
    if not fps_A:
        return 0.0

    td = 0
    for fp_i in fps_A:
        for fp_j in fps_A:
            td += 1 - DataStructs.TanimotoSimilarity(fp_i, fp_j)

    return td / len(fps_A)**2
Пример #22
0
def cal_sim(q, ref_data, return_dict_sim):
    """Worker loop: find the most similar reference for queued molecules.

    Items are taken from ``q`` until the string ``'DONE'`` is received.
    Each item is an ``(idx, smiles)`` pair; molecules that cannot be parsed
    or sanitized are skipped.

    Args:
        q: queue-like object providing ``get()``.
        ref_data: indexable collection whose entries hold the reference
            fingerprint at position 1.
        return_dict_sim: mapping that receives
            ``idx -> [best similarity, index of best reference]``.
    """
    Nref = len(ref_data)
    nbits = 1024
    while True:
        item = q.get()
        if item == 'DONE':
            break
        idx, smi = item

        if idx % 10000 == 0:
            print(idx)
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        # With catchErrors=True a failed sanitization is reported through a
        # truthy return value instead of an exception.
        if Chem.SanitizeMol(mol, catchErrors=True):
            continue

        com_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)
        similarity = np.array([
            DataStructs.TanimotoSimilarity(com_fp, ref_data[j][1])
            for j in range(Nref)
        ])
        j_max = similarity.argmax()
        return_dict_sim[idx] = [similarity[j_max], j_max]
Пример #23
0
def calculate_internal_diversity(smiles, ref_smiles,radius=4):
    """
    Builds the pairwise Tanimoto-distance matrix of the given compounds.
    See http://arxiv.org/abs/1708.08227

    Arguments:
    ------------
    :param smiles: list or tuple
        Compounds to be used for calculating internal diversity
    :param ref_smiles:
        Not used by this function
    :param radius: int
        The circular fingerprint radius (NB: 2 corresponds to ECFP4)
    :return: numpy.ndarray
        Square matrix with one row/column per entry of ``smiles``. Only the
        leading block covering the compounds that could be parsed is
        filled with ``1 - Tanimoto similarity``; the rest stays zero.
    """
    diversity = np.zeros((len(smiles), len(smiles)))
    parsed = (Chem.MolFromSmiles(s) for s in smiles)
    mols = [mol for mol in parsed if mol is not None]
    compounds = [AllChem.GetMorganFingerprint(m, radius) for m in mols]
    # Distances are cached under both orderings of a pair so each pair is
    # computed only once.
    cache = {}
    for i, c1 in enumerate(compounds):
        for j, c2 in enumerate(compounds):
            key = (c1, c2)
            if key not in cache:
                distance = 1 - DataStructs.TanimotoSimilarity(c1, c2)
                cache[key] = cache[(c2, c1)] = distance
            diversity[i, j] = cache[key]
    return diversity
Пример #24
0
def _smilarity_between_two_mols(mol1, mol2):
    """Return the Tanimoto similarity of two molecules.

    Both arguments are passed straight to the fingerprint function (no
    SMILES parsing happens here) and encoded as 512-bit Morgan
    fingerprints of radius 4.
    """
    fingerprints = [
        rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, 4, nBits=512)
        for mol in (mol1, mol2)
    ]
    return DataStructs.TanimotoSimilarity(fingerprints[0], fingerprints[1])
Пример #25
0
def pharmacophore(mol, target):
    """Return the 2D-pharmacophore Tanimoto similarity of two molecules.

    Both objects are standardized in place and converted with ``str()``;
    the nitro / N-oxide patterns in the resulting text are rewritten to
    their charge-separated form before RDKit parses it. Fingerprints use
    2-3 point pharmacophores with the distance bins (0, 2), (2, 5), (5, 8).

    Args:
        mol: molecule object providing ``standardize()`` and ``str()``.
        target: molecule object of the same kind.

    Returns:
        The Tanimoto similarity, or ``-100`` (after printing a message)
        when either molecule cannot be parsed.
    """
    failures = 0
    print('mol/target', mol, target)
    mol.standardize()
    target.standardize()

    def _charge_separate(text):
        for neutral, charged in (('N(=O)O', '[N+](=O)[O-]'),
                                 ('N(O)=O', '[N+]([O-])=O'),
                                 ('n(O)', '[n+]([O-])')):
            text = text.replace(neutral, charged)
        return text

    mol = _charge_separate(str(mol))
    target = _charge_separate(str(target))

    featfactory = load_factory()
    sigfactory = SigFactory(featfactory,
                            minPointCount=2,
                            maxPointCount=3,
                            trianglePruneBins=False)
    sigfactory.SetBins([(0, 2), (2, 5), (5, 8)])
    sigfactory.Init()

    mol1 = Chem.MolFromSmiles(mol)
    mol2 = Chem.MolFromSmiles(target)
    if not (mol1 and mol2):
        failures = failures + 1
        print('ошибка', failures, mol)
        return -100

    fp1 = Generate.Gen2DFingerprint(mol1, sigfactory)
    fp2 = Generate.Gen2DFingerprint(mol2, sigfactory)
    return DataStructs.TanimotoSimilarity(fp1, fp2)
def struct_score(SMILES1, SMILES2):
    """Return the Tanimoto similarity of two SMILES strings.

    Both strings are parsed with RDKit and compared through their RDKit
    fingerprints (``Chem.RDKFingerprint``).
    """
    from rdkit import Chem, DataStructs
    first, second = [Chem.MolFromSmiles(smi) for smi in (SMILES1, SMILES2)]
    first_fp = Chem.RDKFingerprint(first)
    second_fp = Chem.RDKFingerprint(second)
    return DataStructs.TanimotoSimilarity(first_fp, second_fp)
Пример #27
0
  def test6BulkTversky(self):
    """Bulk Tversky matches pairwise Tversky, Dice and Tanimoto.

    Six random sparse int vectors are built. With weights (.5, .5) the bulk
    Tversky values must equal the pairwise Tversky and the Dice values;
    with weights (1, 1) they must equal the pairwise and the bulk Tanimoto
    values.
    """
    sz = 10
    nToSet = 5
    nVs = 6
    import random
    vs = []
    for _ in range(nVs):
      v = ds.IntSparseIntVect(sz)
      for _ in range(nToSet):
        v[random.randint(0, sz - 1)] = random.randint(1, 10)
      vs.append(v)

    query = vs[0]
    others = vs[1:]

    baseDs = [ds.TverskySimilarity(query, other, .5, .5) for other in others]
    bulkDs = ds.BulkTverskySimilarity(query, others, 0.5, 0.5)
    diceDs = [ds.DiceSimilarity(query, other) for other in others]
    for i, base in enumerate(baseDs):
      self.assertTrue(feq(base, bulkDs[i]))
      self.assertTrue(feq(base, diceDs[i]))

    bulkDs = ds.BulkTverskySimilarity(query, others, 1.0, 1.0)
    taniDs = [ds.TanimotoSimilarity(query, other) for other in others]
    for i, bulk in enumerate(bulkDs):
      self.assertTrue(feq(bulk, taniDs[i]))
    taniDs = ds.BulkTanimotoSimilarity(query, others)
    for i, bulk in enumerate(bulkDs):
      self.assertTrue(feq(bulk, taniDs[i]))
def create_tanimoto_index(similarity_value, aglycon_formulas, fps,
                          df_Without_Double_or_Triple):
    """
    Collect every aglycon pair whose Tanimoto index reaches a threshold.

    All unordered pairs of entries in ``fps`` are generated with
    ``itertools.combinations`` and compared with
    ``DataStructs.TanimotoSimilarity``.  When the similarity of the k-th pair
    is greater than or equal to ``similarity_value``, the two labels stored
    at ``aglycon_formulas[k][0]`` and ``aglycon_formulas[k][1]`` are recorded
    together with that similarity.  ``aglycon_formulas`` must therefore be
    indexable in the same order in which the pairs are produced.

    The hits are assembled into a three-column data frame ("aglycon1",
    "aglycon2", "tanimoto_index"), which is passed on, together with
    ``df_Without_Double_or_Triple``, to ``create_df_with_tanimoto_index``.
    Nothing is returned from this function.

    According to the original description, ``fps`` holds the Morgan
    fingerprint of each aglycon and ``df_Without_Double_or_Triple`` is a data
    frame with only single entries in the taxonomy row.
    """
    hits = []
    for position, (left_fp, right_fp) in enumerate(
            itertools.combinations(fps, 2)):
        score = DataStructs.TanimotoSimilarity(left_fp, right_fp)
        if score >= similarity_value:
            labels = aglycon_formulas[position]
            hits.append((labels[0], labels[1], score))

    df_comparison = pd.DataFrame({
        "aglycon1": [hit[0] for hit in hits],
        "aglycon2": [hit[1] for hit in hits],
        "tanimoto_index": [hit[2] for hit in hits],
    })
    create_df_with_tanimoto_index(df_comparison, df_Without_Double_or_Triple)
Пример #29
0
def compute_fraggle_similarity_for_subs(inMol,
                                        qMol,
                                        qSmi,
                                        qSubs,
                                        tverskyThresh=0.8):
    """Compute the plain and the Fraggle similarity for one query substructure.

    Args:
        inMol: molecule being compared against the query.
        qMol: query molecule.
        qSmi: query identifier; only used here as part of the cache key.
        qSubs: query substructure description, forwarded to ``atomContrib``
            and used as part of the cache key.
        tverskyThresh: threshold forwarded unchanged to ``atomContrib``.

    Returns:
        A ``(rdkit_sim, fraggle_sim)`` tuple.  ``rdkit_sim`` is the Tanimoto
        similarity of the RDKit fingerprints of ``qMol`` and ``inMol``.
        ``fraggle_sim`` is the larger of ``rdkit_sim`` and the fingerprint
        similarity of the two ``atomContrib``-modified molecules, or ``0.0``
        if fingerprinting/comparing the modified input molecule fails.

    Side effects:
        The fingerprint of the modified query is stored in the module-level
        ``modified_query_fps`` mapping under ``"<qSubs>_<qSmi>"`` so it is
        computed only once per key.  On failure a message is written to
        ``sys.stderr``.
    """
    query_fp = Chem.RDKFingerprint(qMol, **rdkitFpParams)
    input_fp = Chem.RDKFingerprint(inMol, **rdkitFpParams)
    whole_mol_sim = DataStructs.TanimotoSimilarity(query_fp, input_fp)

    # Reuse the modified-query fingerprint when this (qSubs, qSmi)
    # combination has been seen before.
    cache_key = "%s_%s" % (qSubs, qSmi)
    if cache_key not in modified_query_fps:
        modified_query = atomContrib(qSubs, qMol, tverskyThresh)
        modified_query_fp = Chem.RDKFingerprint(modified_query,
                                                **rdkitFpParams)
        modified_query_fps[cache_key] = modified_query_fp
    else:
        modified_query_fp = modified_query_fps[cache_key]

    modified_input = atomContrib(qSubs, inMol, tverskyThresh)

    # Fingerprinting the modified input may fail; in that case report the
    # molecule on stderr and fall back to a Fraggle similarity of 0.0.
    try:
        modified_input_fp = Chem.RDKFingerprint(modified_input,
                                                **rdkitFpParams)
        modified_sim = DataStructs.FingerprintSimilarity(
            modified_query_fp, modified_input_fp)
        fraggle_sim = max(modified_sim, whole_mol_sim)
    except Exception:  # pragma: nocover
        failed_smiles = Chem.MolToSmiles(modified_input, True)
        sys.stderr.write("Can't generate fp for: %s\n" % (failed_smiles))
        fraggle_sim = 0.0

    return whole_mol_sim, fraggle_sim
Пример #30
0
    def test10BulkOps3(self):
        """Check each Bulk*Similarity call against its pairwise counterpart.

        Ten ExplicitBitVect objects of 10000 bits are filled with randomly
        chosen bits and stored in a numpy array.  For every similarity
        measure exercised below, the bulk function is called with the first
        vector as the query and the whole array as targets, and each returned
        value is compared via ``feq`` with the value from the matching
        pairwise function.
        """
        nbits = 10000
        # The array is created with ExplicitBitVect passed as the dtype
        # argument and is filled element by element below.
        bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
        for bvi in range(10):
            bv = DataStructs.ExplicitBitVect(nbits)
            # nbits random draws; the same bit index may be drawn repeatedly.
            for j in range(nbits):
                x = random.randrange(0, nbits)
                bv.SetBit(x)
            bvs[bvi] = bv
        # Tanimoto: bulk vs. pairwise.
        sims = DataStructs.BulkTanimotoSimilarity(bvs[0], bvs)
        for i in range(len(bvs)):
            sim = DataStructs.TanimotoSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))

        # Dice: bulk vs. pairwise.
        sims = DataStructs.BulkDiceSimilarity(bvs[0], bvs)
        for i in range(len(bvs)):
            sim = DataStructs.DiceSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))

        # AllBit: bulk vs. pairwise.
        sims = DataStructs.BulkAllBitSimilarity(bvs[0], bvs)
        for i in range(len(bvs)):
            sim = DataStructs.AllBitSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))

        # OnBit: bulk vs. pairwise.
        sims = DataStructs.BulkOnBitSimilarity(bvs[0], bvs)
        for i in range(len(bvs)):
            sim = DataStructs.OnBitSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))

        # Rogot-Goldberg: bulk vs. pairwise.
        sims = DataStructs.BulkRogotGoldbergSimilarity(bvs[0], bvs)
        for i in range(len(bvs)):
            sim = DataStructs.RogotGoldbergSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))

        # Tversky with weights (1, 1): must match pairwise Tversky and is
        # additionally asserted to equal Tanimoto.
        sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
        for i in range(len(bvs)):
            sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1)
            self.assertTrue(feq(sim, sims[i]))
            sim = DataStructs.TanimotoSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))

        # Tversky with weights (.5, .5): must match pairwise Tversky and is
        # additionally asserted to equal Dice.
        sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
        for i in range(len(bvs)):
            sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], .5, .5)
            self.assertTrue(feq(sim, sims[i]))
            sim = DataStructs.DiceSimilarity(bvs[0], bvs[i])
            self.assertTrue(feq(sim, sims[i]))