예제 #1
0
    def test__init__(self):
        """Exercise FingerprintSimilarity on default, mixed-size and folded fingerprints."""
        from rdkit.Chem.Fingerprints import FingerprintMols

        mols = [Chem.MolFromSmiles(smiles) for smiles in ('CCOC', 'CCO', 'COC')]

        # Fingerprints built with the default arguments.
        default_fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]
        self.assertAlmostEqual(
            FingerprintSimilarity(default_fps[0], default_fps[1]), 0.6, places=2)

        # Fingerprints requested with a different fpSize per molecule
        # (16, 32, 48); the score must be the same in both argument orders.
        details = FingerprinterDetails()
        options = details.__dict__
        sized_fps = []
        for multiplier, mol in enumerate(mols, start=1):
            options['fpSize'] = 16 * multiplier
            sized_fps.append(FingerprintMols.FingerprintMol(mol, **options))
        for left, right in ((0, 1), (1, 0)):
            self.assertAlmostEqual(
                FingerprintSimilarity(sized_fps[left], sized_fps[right]),
                0.555,
                places=2)

        # Request 1024 bits with a target density of 0.8; the resulting
        # fingerprint is expected to be 64 bits long, and 4 bits long after
        # an explicit FoldToTargetDensity call.
        options['fpSize'] = 1024
        options['tgtDensity'] = 0.8
        dense_fp = FingerprintMols.FingerprintMol(mols[0], **options)
        self.assertEqual(len(dense_fp), 64)
        dense_fp = DataStructs.FoldToTargetDensity(dense_fp, density=0.1, minLength=2)
        self.assertEqual(len(dense_fp), 4)
예제 #2
0
 def calc_similarity(similarity_method, fp1, fp2):
     """Return the similarity of two fingerprints.

     When ``similarity_method`` is ``"dice"`` the Dice metric is passed to
     ``FingerprintSimilarity``; any other value uses that function's
     default metric.
     """
     if similarity_method != "dice":
         return FingerprintSimilarity(fp1, fp2)
     return FingerprintSimilarity(fp1, fp2, metric=DataStructs.DiceSimilarity)
예제 #3
0
 def get_best_structural_score(self, target_fp, metric):
     """Score ``target_fp`` against the structural fingerprint of every reaction.

     ``self.fingerprints`` maps a reaction id to a 2-item entry; only the
     first item of each entry is compared.

     :param target_fp: fingerprint to compare against each stored entry
     :param metric: similarity metric forwarded to ``FingerprintSimilarity``
     :return: dict mapping each reaction id to its similarity score
     """
     def structural_part(rxn_id):
         # Unpack exactly two items so a malformed entry still raises.
         stru_comp, _ = self.fingerprints[rxn_id]
         return stru_comp

     return {
         rxn_id: FingerprintSimilarity(fp1=target_fp,
                                       fp2=structural_part(rxn_id),
                                       metric=metric)
         for rxn_id in self.fingerprints
     }
예제 #4
0
    def test_fingerprint_bit_string_readwrite(self):
        """
        Test that the similarity of two molecules isn't changed when we
        write and then read the fingerprint (as a BitString) from the database.

        Each pair of SMILES in ``self.pairs`` is parsed, sanitized and
        fingerprinted. Both fingerprints are stored via ``self.c`` and read
        back. Pairs whose similarity moved by more than 0.01 are printed and
        counted; the test fails if any were found.
        """
        total = 0
        for samples in self.pairs:
            mol_x = Chem.MolFromSmiles(samples[0], sanitize=False)
            mol_y = Chem.MolFromSmiles(samples[1], sanitize=False)

            sanitize_without_hypervalencies(mol_x)
            sanitize_without_hypervalencies(mol_y)

            fp_x = Chem.RDKFingerprint(mol_x)
            fp_y = Chem.RDKFingerprint(mol_y)

            similarity_before = FingerprintSimilarity(fp_x, fp_y)

            bitString_x = BitVectToText(fp_x)
            bitString_y = BitVectToText(fp_y)

            self.c.execute("INSERT INTO data VALUES (?)", (bitString_x, ))
            self.c.execute("INSERT INTO data VALUES (?)", (bitString_y, ))

            # Bind the value with a placeholder, as the INSERTs above do,
            # instead of interpolating it into the SQL text.
            select_sql = "SELECT * FROM data WHERE fingerprints=?"
            read_x = self.c.execute(select_sql, (bitString_x, )).fetchone()[0]
            read_y = self.c.execute(select_sql, (bitString_y, )).fetchone()[0]

            new_fp_x = CreateFromBitString(read_x)
            new_fp_y = CreateFromBitString(read_y)

            similarity_after = FingerprintSimilarity(new_fp_x, new_fp_y)

            if abs(similarity_after - similarity_before) > 0.01:
                print(
                    f"similarity before = {similarity_before}, similarity after = {similarity_after}"
                )
                total += 1

        # assertEqual reports both values on failure, unlike assertTrue.
        self.assertEqual(total, 0,
                         f"Some Similarities were not equal, total={total}")
예제 #5
0
 def similarity_between_two_smiles(smile_1: str, smile_2: str) -> float:
     """Return the similarity of the MACCS-key fingerprints of two SMILES strings.

     :param smile_1: first SMILES string
     :param smile_2: second SMILES string
     :return: the similarity rounded to 3 decimals, or ``-1`` when parsing,
         fingerprinting or comparison raises an error
     """
     try:
         mod_1 = MolFromSmiles(smile_1)
         mod_2 = MolFromSmiles(smile_2)
         ebv_1 = GenMACCSKeys(mod_1)
         ebv_2 = GenMACCSKeys(mod_2)
         similarity = FingerprintSimilarity(ebv_1, ebv_2)
     except Exception:
         # Best-effort: any ordinary failure maps to the -1 sentinel.
         # KeyboardInterrupt / SystemExit are deliberately not caught, so
         # the process can still be interrupted.
         return -1
     return round(similarity, 3)
예제 #6
0
def get_tanimoto(list1, list2):
    """Return the similarity of every (fp1, fp2) pair as a flat list.

    Pairs are visited with ``list1`` as the outer loop and ``list2`` as the
    inner loop. A pair where either fingerprint is ``None`` contributes
    ``None`` rather than a score.
    """
    return [
        None if (first is None or second is None)
        else FingerprintSimilarity(first, second)
        for first in list1
        for second in list2
    ]
예제 #7
0
def match_substrates(fragments, library):
    """Collect the library entries whose fingerprint closely matches a fragment.

    Each ``library`` item is indexable: item ``[0]`` is fingerprinted with
    ``FingerprintMol`` and item ``[1]`` is what gets collected on a match.
    A match is a ``FingerprintSimilarity`` strictly greater than 0.9.

    Every fragment and every template is fingerprinted exactly once, not
    once per (fragment, template) pair. ``library`` may therefore also be a
    one-shot iterable.

    :param fragments: iterable of molecules accepted by ``FingerprintMol``
    :param library: iterable of ``(molecule, value)``-like items
    :return: list of matched values, grouped by fragment in library order
    """
    fragments = list(fragments)
    templates = list(library)
    if not fragments or not templates:
        # Nothing to compare; skip fingerprinting altogether.
        return []

    template_fps = [(FingerprintMol(template[0]), template[1])
                    for template in templates]

    matches = []
    for fragment in fragments:
        fragment_fp = FingerprintMol(fragment)
        matches.extend(
            value for template_fp, value in template_fps
            if FingerprintSimilarity(template_fp, fragment_fp) > 0.9
        )
    return matches
예제 #8
0
 def get_best_structural_score(self, smarts, metric):
     """Score a reaction SMARTS against every stored structural fingerprint.

     The target fingerprint is the first item returned by
     ``ReactionFingerprintMatcher.make_fingerprints(smarts)``. Only the
     first item of each ``self.fingerprints`` entry is compared.

     :param smarts: reaction SMARTS to fingerprint
     :param metric: similarity metric forwarded to ``FingerprintSimilarity``
     :return: ``(scores, best)``: a dict of reaction id to score, and the
         highest score seen (``0`` when no score exceeds zero)
     """
     target_fp, _ = ReactionFingerprintMatcher.make_fingerprints(smarts)

     best_so_far = 0
     scores = {}
     for rxn_id in self.fingerprints:
         stru_comp, _ = self.fingerprints[rxn_id]
         current = FingerprintSimilarity(fp1=target_fp,
                                         fp2=stru_comp,
                                         metric=metric)
         scores[rxn_id] = current
         # max() keeps its first argument unless the second is strictly greater.
         best_so_far = max(best_so_far, current)
     return scores, best_so_far
예제 #9
0
 def test_keep_similar_samples(self):
     """Without similarity filtering, not every sampled pair scores below 0.85.

     Draws 100 samples with ``filter_similar=False``, scores every unordered
     pair of Morgan bit-vector fingerprints (radius 4, 1024 bits) and asserts
     that the scores are not all under 0.85.
     """
     samp = self.sampler.sample(100, filter_similar=False, verbose=False)
     scores = []
     for first in range(len(samp) - 1):
         fp1 = GetMorganFingerprintAsBitVect(
             Chem.MolFromSmiles(samp[first]), 4, nBits=1024)
         for second in range(first + 1, len(samp)):
             fp2 = GetMorganFingerprintAsBitVect(
                 Chem.MolFromSmiles(samp[second]), 4, nBits=1024)
             scores.append(FingerprintSimilarity(fp1, fp2))
     self.assertFalse(all(score < 0.85 for score in scores))
예제 #10
0
def internal_diversity(fps, sample_size=1e4, summarise=True):
    """
    Estimate the internal diversity of a set of fingerprints by sampling.

    Two positions are drawn at random, with replacement, from ``fps`` and
    their ``FingerprintSimilarity`` is recorded. This repeats until
    ``sample_size`` values have been collected. The same position may be
    drawn twice in one pair.

    Returns the mean of the sampled values when ``summarise`` is true,
    otherwise the list of sampled values.
    """
    sampled = []
    while len(sampled) < sample_size:
        first = fps[random.randint(0, len(fps) - 1)]
        second = fps[random.randint(0, len(fps) - 1)]
        sampled.append(FingerprintSimilarity(first, second))
    return np.mean(sampled) if summarise else sampled
예제 #11
0
 def test_correct_filter_similar_samples(self):
     """With ``filter_similar=True`` every sampled pair must score below 0.3.

     Draws 60 samples with ``threshold=0.3``, scores every unordered pair of
     Morgan bit-vector fingerprints (radius 4, 2048 bits) and asserts that
     all scores are under the threshold.
     """
     samp = self.sampler.sample(60,
                                filter_similar=True,
                                threshold=0.3,
                                verbose=False)
     scores = []
     for first in range(len(samp) - 1):
         fp1 = GetMorganFingerprintAsBitVect(
             Chem.MolFromSmiles(samp[first]), 4, nBits=2048)
         for second in range(first + 1, len(samp)):
             fp2 = GetMorganFingerprintAsBitVect(
                 Chem.MolFromSmiles(samp[second]), 4, nBits=2048)
             scores.append(FingerprintSimilarity(fp1, fp2))
     self.assertTrue(all(score < 0.3 for score in scores))
예제 #12
0
def external_nn(fps1, fps2, sample_size=1e3, summarise=True):
    """
    Estimate the nearest-neighbour similarity of ``fps1`` against ``fps2``.

    A fingerprint is drawn at random (with replacement) from ``fps1`` and
    compared with every entry of ``fps2``; the largest similarity is kept.
    This repeats until ``sample_size`` values have been collected.

    Returns the mean of the kept values when ``summarise`` is true,
    otherwise the list itself.
    """
    nearest = []
    while len(nearest) < sample_size:
        query = fps1[random.randint(0, len(fps1) - 1)]
        similarities = [
            FingerprintSimilarity(query, fps2[position])
            for position in range(len(fps2))
        ]
        nearest.append(np.max(similarities))
    return np.mean(nearest) if summarise else nearest
예제 #13
0
def compare_lig_db(fp, lig_datasets, method='tanimoto', same=None, same_db=''):
    '''
    Compares pairwise similarity between molecules from two given sets.

    :param fp: column label holding the fingerprint in every dataset
    :param lig_datasets: mapping of dataset name to a table exposing
        ``.index`` and ``.loc[row, column]``, with a ``'mol_rdk'`` column
    :param method: currently unused; the Tanimoto metric is always applied
    :param same: when truthy, compare the single dataset ``same_db`` against
        itself instead of every pair of distinct datasets
    :param same_db: name of the dataset used when ``same`` is truthy
    :return: dict keyed by ``'<name_i>-<name_j>'``; each value is the list
        of matches (similarity >= 0.90) found for that dataset pair
    '''
    matched_ligands = {}
    if same:
        # One (name, name) pair. It must be wrapped in a list: iterating the
        # bare tuple would make the loop below unpack the characters of
        # the dataset name instead of the pair.
        combs = [(same_db, same_db)]
    else:
        combs = combinations(lig_datasets.keys(), 2)

    for key_i, key_j in combs:
        print('\n' + '='*20)
        print(key_i, '\t', key_j)
        print('='*20)
        d_i = lig_datasets[key_i]
        d_j = lig_datasets[key_j]

        # Create the list
        matched = []
        for k in d_i.index:
            for p in d_j.index:
                try:
                    fp_sim = FingerprintSimilarity(
                        d_i.loc[k, fp],
                        d_j.loc[p, fp], metric=DataStructs.TanimotoSimilarity)

                    if fp_sim >= 0.90:
                        # Add to the list
                        matched.append({'match_mols': (d_i.loc[k, 'mol_rdk'],
                                                       d_j.loc[p, 'mol_rdk']),
                                        'match_names': (k, p),
                                        'tanimoto': fp_sim})
                    if fp_sim >= 0.98:
                        print(k, '\t', p)
                except AttributeError as e:
                    # Report the failure and abandon the remaining
                    # comparisons for row k only; the outer loop continues.
                    print(e, k, '\t', p)
                    break
        # add to the dict
        matched_ligands[F'{key_i}-{key_j}'] = matched
    return matched_ligands
예제 #14
0
def fp_similarity(fp1, fp2, metric='tanimoto'):
    """ Calculate the similarity between two fingerprints

    The type of ``fp1`` selects the implementation. RDKit bit vectors are
    always compared with the Tanimoto metric and ``metric`` is ignored for
    them. Numpy arrays are dispatched on ``metric``.

    :param fp1: {numpy array / RDKit fingerprint} Fingerprint 1
    :param fp2: {numpy array / RDKit fingerprint} Fingerprint 2
    :param metric: {str} which similarity metric to use (case-insensitive), default: tanimoto;
        available for numpy fingerprints: tanimoto, cosine, euclidean
    :return: the value computed by the selected metric
    :raises NotImplementedError: numpy input with an unknown ``metric``
    :raises TypeError: ``fp1`` is neither a numpy array nor an ExplicitBitVect
    """
    if isinstance(fp1, cDataStructs.ExplicitBitVect):
        return FingerprintSimilarity(fp1, fp2, metric=TanimotoSimilarity)

    if not isinstance(fp1, np.ndarray):
        raise TypeError("Fingerprints must be of type numpy.ndarray or rdkit.DataStructs.cDataStructs.ExplicitBitVect")

    chosen = metric.lower()
    if chosen == 'tanimoto':
        return tanimoto(fp1, fp2)
    if chosen == 'cosine':
        return cosine_dist(fp1, fp2)
    if chosen == 'euclidean':
        return euclidean_dist(fp1, fp2)
    raise NotImplementedError('Only the following distance metrics are available: tanimoto, cosine, euclidean')
예제 #15
0
def internal_nn(fps, sample_size=1e3, summarise=True):
    """
    Estimate the nearest-neighbour similarity within one set of fingerprints.

    A position is drawn at random (with replacement) from ``fps`` and its
    fingerprint is compared with the fingerprint at every other position;
    the largest similarity is kept. This repeats until ``sample_size``
    values have been collected.

    Returns the mean of the kept values when ``summarise`` is true,
    otherwise the list itself.
    """
    nearest = []
    while len(nearest) < sample_size:
        picked = random.randint(0, len(fps) - 1)
        query = fps[picked]
        similarities = [
            FingerprintSimilarity(query, fps[other])
            for other in range(len(fps))
            if other != picked
        ]
        nearest.append(np.max(similarities))
    return np.mean(nearest) if summarise else nearest
plt.legend()
plt.show()


# Keep the first n rows of `result`; column 0 holds identifiers that are
# split on '_' below. The builtin `str` replaces the `np.str` alias, which
# was removed in NumPy 1.24.
topn = result[:n, :]
topn_idx = topn[:, 0].astype(str)

# The number after the underscore selects a row of the CSV; column 0 of that
# row is used as the SMILES string.
chem_map = pd.read_csv("data/chem_all.csv").to_numpy()
topn_smile = [chem_map[int(idx.split('_')[1]), 0] for idx in topn_idx]

# Fingerprint every molecule once instead of once per (i, j) pair.
topn_fps = [Chem.RDKFingerprint(Chem.MolFromSmiles(smile)) for smile in topn_smile]

# Fill the upper triangle (diagonal included) with pairwise similarities.
sm = np.zeros((n, n))
for i in range(n):
    for j in range(i, n):
        sm[i, j] = FingerprintSimilarity(topn_fps[i], topn_fps[j])

# Mirror into the lower triangle. Adding the transpose doubles the diagonal,
# so subtract the identity once. This assumes each self-similarity is 1.
sm = sm + sm.T - np.eye(n)

from sklearn.cluster import AffinityPropagation

# NOTE(review): `sm` is a similarity matrix, but AffinityPropagation is built
# with default arguments. Confirm whether affinity='precomputed' was intended.
af = AffinityPropagation().fit(sm)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_

n_clusters_ = len(cluster_centers_indices)
print("{} clusters: ".format(n_clusters_))
print("    Center index: {}".format(cluster_centers_indices.tolist()))
print("    Labels: {}".format(labels.tolist()))

send = {'1EVZ': ['CHEM_833', 'CHEM_84524', 'CHEM_6372', 'CHEM_28096', 'CHEM_16023'],
예제 #17
0
        for line in f:
            zinc = line.split()[-2]
            if zinc in exclude:
                continue
            smile = line.split()[0]
            chem = Chem.MolFromSmiles(smile)
            chems.append(chem)
            mol_weights.append(ExactMolWt(chem))
            sssrs.append(Chem.GetSSSR(chem))
            balabanjs.append(BalabanJ(chem))
            bertzcts.append(BertzCT(chem))

    print_stat('Exact mol weight', mol_weights)
    print_stat('SSSR', sssrs)
    print_stat('BalabanJ', balabanjs)
    print_stat('BertzCT', bertzcts)

    fps = [Chem.RDKFingerprint(chem) for chem in chems]
    tanimotos = []
    for i, (fp1, fp2) in enumerate(combinations(fps, 2)):
        tanimotos.append(FingerprintSimilarity(fp1, fp2))

    print_stat('Tanimoto (RDK Fingerprint)', tanimotos)

    fps = [GetMorganFingerprintAsBitVect(chem, 2) for chem in chems]
    tanimotos = []
    for i, (fp1, fp2) in enumerate(combinations(fps, 2)):
        tanimotos.append(FingerprintSimilarity(fp1, fp2))

    print_stat('Tanimoto (Morgan Fingerprint)', tanimotos)
예제 #18
0
            chem = Chem.MolFromSmiles(smile)
            #train2fp[name] = Chem.RDKFingerprint(chem)
            train2fp[name] = GetMorganFingerprintAsBitVect(chem, 2)
            train2common[name] = fields[0]
    with open('data/prediction_results.txt') as f:
        for line in f:
            fields = line.rstrip().split('\t')
            name = fields[3]
            train2fp[name] = chem2fp[name]
            train2common[name] = fields[4]

    seen_predict = set()
    with open(sys.argv[1]) as f:
        for line in f:
            fields = line.rstrip().split('\t')
            name = fields[3]
            if name in seen_predict:
                continue
            else:
                seen_predict.add(name)
            common = fields[4]
            fp = chem2fp[name]
            max_tan, max_chem = 0, None
            for chem in train2fp:
                tan = FingerprintSimilarity(fp, train2fp[chem])
                if tan > max_tan:
                    max_tan = tan
                    max_chem = chem
            ofields = [name, common, max_chem, train2common[max_chem], max_tan]
            print('\t'.join([str(ofield) for ofield in ofields]))
예제 #19
0
from rdkit.DataStructs import FingerprintSimilarity

# Get contents of Mainland cabinet (any dataframe could do)
cabinet = get_mainland()
# Get rid of any molecules without a valid SMILES string
cabinet = cabinet.dropna(subset=['SMILES'])
# rdkit can't make a mol from this one (index label 24766), so drop that, too
cabinet = cabinet.drop(24766)
cids = list(cabinet.index)
cabinet.head()

# Compute all fingerprints
fps = cabinet['SMILES'].apply(Chem.MolFromSmiles).apply(Chem.RDKFingerprint)
# And then the full pairwise FingerprintSimilarity matrix, labelled by the
# cabinet index on both axes. NOTE(review): these are similarities, not
# distances; presumably Tanimoto, as the variable name suggests.
tanimoto = pd.DataFrame(index=cids, columns=cids)
tanimoto[:] = [[FingerprintSimilarity(fp1, fp2) for fp1 in fps] for fp2 in fps]
tanimoto.head()


# This could be any function whose first argument is integer indices (not CIDs!) into a dataframe
# of info about odorants (one odorant per row)
def mean_dist(indices, sim):
    """Return the mean of ``sim.iloc[i, j]`` over all unordered pairs of ``indices``.

    ``indices`` are integer positions (not labels) into ``sim``.
    """
    pair_values = [sim.iloc[row, col] for row, col in combinations(indices, 2)]
    return np.mean(pair_values)


# Prettier printing
np.set_printoptions(precision=2, suppress=True)
# Some weight you can make up, each tuple is one weight under consideration.
# Tuple item 1 is a cabinet dataframe column or a custom name
예제 #20
0
# Load one molecule per entry of `filenames`; each path is built by
# appending the file name to `conv_dir`.
molecules = []
for file in filenames:
    molecules.append(Chem.MolFromMol2File(conv_dir+file))
    
#%%
# One fingerprint per molecule, using FingerprintMol with default arguments.
from rdkit.Chem.Fingerprints import FingerprintMols
fingerpints = [FingerprintMols.FingerprintMol(mol) for mol in molecules]
#%%
from rdkit.DataStructs import FingerprintSimilarity
import numpy as np

# Pairwise similarity matrix. Only the strict upper triangle (j > i) is
# filled; the diagonal and the lower triangle keep their initial 0.
simmat = np.zeros(shape=(len(molecules), len(molecules)))

for i in range(len(molecules)):
    for j in range(i + 1, len(molecules)):
        simmat[i,j] = FingerprintSimilarity(fingerpints[i], fingerpints[j])

#%%
import matplotlib.pyplot as plt
plt.imshow(simmat)

#%%
# Print the file names of every pair whose similarity is exactly 1.
hi_sim_ids0 = np.where( simmat ==1)
for mol1, mol2 in zip(hi_sim_ids0[0],hi_sim_ids0[1]):
    print(filenames[mol1])
    print(filenames[mol2])
    print('=======')
    
#%%
# Second set of fingerprints for the same molecules, built from MACCS keys.
from rdkit.Chem import MACCSkeys
fingerpints2 = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]