示例#1
0
    def similarity(self, smiles1, smiles2):
        """Compare two SMILES strings under several fingerprint metrics.

        Numbered attachment points (``[*:n]``) are rewritten to explicit
        hydrogens before parsing; both molecules are then encoded with
        ``Chem.RDKFingerprint`` and compared.

        Args:
            smiles1: First SMILES string.
            smiles2: Second SMILES string.

        Returns:
            A dict mapping the metric name ("Tanimoto", "Dice", "Cosine",
            "Sokal", "Russel", "Kulczynski", "McConnaughey") to the value
            returned by ``DataStructs.FingerprintSimilarity``, or ``None``
            when either input is empty or cannot be parsed into a molecule.
        """
        if not smiles1 or not smiles2:
            return None

        attachment_point = r'\[\*:[0-9]*\]'
        mol1 = Chem.MolFromSmiles(re.sub(attachment_point, "[H]", smiles1))
        mol2 = Chem.MolFromSmiles(re.sub(attachment_point, "[H]", smiles2))

        # MolFromSmiles signals a parse failure by returning None; treat an
        # unparsable input like a missing one instead of crashing below.
        if mol1 is None or mol2 is None:
            return None

        fp1 = Chem.RDKFingerprint(mol1)
        fp2 = Chem.RDKFingerprint(mol2)

        # Order is kept so the resulting dict iterates as before.
        metrics = (
            ("Tanimoto", DataStructs.TanimotoSimilarity),
            ("Dice", DataStructs.DiceSimilarity),
            ("Cosine", DataStructs.CosineSimilarity),
            ("Sokal", DataStructs.SokalSimilarity),
            ("Russel", DataStructs.RusselSimilarity),
            ("Kulczynski", DataStructs.KulczynskiSimilarity),
            ("McConnaughey", DataStructs.McConnaugheySimilarity),
        )
        return {
            name: DataStructs.FingerprintSimilarity(fp1, fp2, metric=metric)
            for name, metric in metrics
        }
示例#2
0
  def test5Dice(self):
    """Check DiceSimilarity on IntSparseIntVect for identity and symmetry."""
    # A vector compared with itself must score exactly 1.
    vect = ds.IntSparseIntVect(5)
    for idx, count in ((4, 4), (0, 2), (3, 1)):
      vect[idx] = count
    self.assertTrue(feq(ds.DiceSimilarity(vect, vect), 1.0))

    left = ds.IntSparseIntVect(5)
    for idx, count in ((0, 2), (2, 1), (3, 4), (4, 6)):
      left[idx] = count
    right = ds.IntSparseIntVect(5)
    for idx, count in ((1, 2), (2, 3), (3, 4), (4, 4)):
      right[idx] = count

    # The same value is expected regardless of argument order.
    expected = 18.0 / 26.
    self.assertTrue(feq(ds.DiceSimilarity(left, right), expected))
    self.assertTrue(feq(ds.DiceSimilarity(right, left), expected))
def get_fitness(genes, target):
    """Score a candidate SMILES against a target SMILES.

    Both strings are parsed with ``Chem.MolFromSmiles`` and fingerprinted
    with ``FingerprintMols.FingerprintMol``.

    Returns:
        A 2-tuple ``(similarity, None)`` where ``similarity`` is the value of
        ``DataStructs.FingerprintSimilarity`` (default metric) for target vs.
        candidate.
    """
    target_mol = Chem.MolFromSmiles(target)
    candidate_mol = Chem.MolFromSmiles(genes)
    target_fp = FingerprintMols.FingerprintMol(target_mol)
    candidate_fp = FingerprintMols.FingerprintMol(candidate_mol)
    score = DataStructs.FingerprintSimilarity(target_fp, candidate_fp)
    return score, None
示例#4
0
def eval_similarity(fp_list, dim, evaluator):
    """Return mean and standard deviation of all pairwise scores.

    Args:
        fp_list: Sequence of per-item sequences; ``fp_list[k][dim]`` is the
            value compared for item ``k``.
        dim: Index selecting which entry of each item is compared.
        evaluator: Callable taking two entries and returning a number.

    Returns:
        ``(mean, std)`` over every unordered pair ``(i, j)`` with ``i < j``.
    """
    count = len(fp_list)
    scores = np.array([
        evaluator(fp_list[first][dim], fp_list[second][dim])
        for first in range(count - 1)
        for second in range(first + 1, count)
    ])
    return np.mean(scores), np.std(scores)


if __name__ == '__main__':
    # The first command-line argument is handed to get_fp_list as its input.
    f = sys.argv[1]

    # One fingerprint generator per "dimension" of the comparison.
    fp_func_list = [
        lambda x: AllChem.GetMorganFingerprint(x, 2),
        MACCSkeys.GenMACCSKeys,
        Pairs.GetAtomPairFingerprint,
        FingerprintMols.FingerprintMol,
    ]

    # evaluators[k] scores fingerprints produced by fp_func_list[k].
    dice = DataStructs.DiceSimilarity
    default_similarity = DataStructs.FingerprintSimilarity
    evaluators = [dice, default_similarity, dice, default_similarity]

    fp_list = get_fp_list(f, fp_func_list)

    for dim in range(len(fp_func_list)):
        mean_sim, std_sim = eval_similarity(fp_list, dim, evaluators[dim])
        # Report the mean as a distance (1 - similarity) next to the spread.
        print(1 - mean_sim, std_sim)
    try:
        a=[rec[1][0],make_fingerprint(Chem.MolFromInchi(rec[1][9]))]
        fps_lincs.append(a)
        print(rec[1][0])
    except Exception:
        continue 
    
#%% calculate similarity

# For every entry of fps_list2 find the entries of fps_list1 with the highest
# Tanimoto similarity. Entries are indexed as [name, fingerprint].
matches = []
for query_idx, query in enumerate(fps_list2):
    best_similarity = 0
    best_names = []
    for candidate in fps_list1:
        similarity = DataStructs.FingerprintSimilarity(
            query[1], candidate[1], DataStructs.TanimotoSimilarity)
        if similarity > best_similarity:
            best_similarity = similarity
            best_names = [candidate[0]]
        elif similarity == best_similarity:
            # Ties are stored as bare names, like the first hit, so every
            # row of `matches` has the same [query, match, score] layout.
            best_names.append(candidate[0])
    # Progress indicator: index of the query just processed.
    print(query_idx)
    for m in best_names:
        matches.append([query[0], m, best_similarity])


#%%

matches_df = pd.DataFrame(matches)
示例#6
0
for filename in glob.glob('models/*.pkl'):
    # Strip the 7-character 'models/' prefix and the 4-character '.pkl'
    # suffix to obtain the identifier used in the query and in u_name.
    uniprot = filename[7:-4]
    querynn = []
    cur = conn.cursor()
    cur.execute("SELECT stdsmiles FROM actives WHERE UNIPROT = '" + uniprot +
                "';")
    # Fingerprint every SMILES returned for this target.
    pool = [calcNormalFingerprints(row[0]) for row in cur]
    with open(filename, 'rb') as fid:
        firstcols.append([u_name[uniprot], uniprot])
        bnb = cPickle.load(fid)
    # for each wombat compound get nn
    # Neighbour count is 10% of the model's class_count_[1], rounded.
    ncount = int(round(bnb.class_count_[1] * 0.1))
    sim_array = [DataStructs.BulkTanimotoSimilarity(f, pool) for f in fp]
    # Average the ncount largest similarities for each query fingerprint.
    # NOTE: when ncount is 0 the slice [-0:] selects the whole array.
    sims = [np.average(np.sort(sim)[-ncount:]) for sim in sim_array]
    matrix.append(sims)
    count += 1
    # update percent finished
    percent = (float(count) / float(t_count)) * 100
    sys.stdout.write(' Performing NN search on Query Molecule: %3d%%\r' %
                     percent)
    sys.stdout.flush()

# Prepend the name/identifier columns to the similarity matrix.
matrix = np.concatenate((firstcols, matrix), axis=1)
headings = ['Name', 'Uniprot']
headings.extend("C" + str(i + 1) for i in range(len(fp)))
file.write('\t'.join(headings))
示例#7
0
 def partialSimilarity(atomID):
     """Return the Tversky similarity for the bits associated with atomID."""
     # Start from an empty 1024-bit vector and switch on only the bits
     # recorded for this atom in aBits.
     atom_fp = DataStructs.ExplicitBitVect(1024)
     atom_fp.SetBitsFromList(aBits[atomID])
     # Tversky weights are fixed at 0 and 1.
     score = DataStructs.TverskySimilarity(subsFp, atom_fp, 0, 1)
     return score
示例#8
0
  for i, mol in enumerate(suppl):
    if not mol:
      continue
    smi = Chem.MolToSmiles(mol, True)
    nm = mol.GetProp(nameField)
    property = float(mol.GetProp(propField))
    fp = GetMolFingerprint(mol, maxPathLength)
    data.append((nm, smi, property, fp))

  logger.info('  got %d molecules' % len(data))

  logger.info('calculating pairs')
  pairs = []
  for i in range(len(data)):
    for j in range(i + 1, len(data)):
      if DataStructs.DiceSimilarity(data[i][-1], data[j][-1]) > similarityThreshold:
        pairs.append((i, j))
    if not (i + 1) % 100:
      logger.info('Done %d molecules' % (i + 1))

  logger.info('  got %d reasonable pairs' % len(pairs))

  logger.info('creating output file')
  print >> outF, 'nameA|nameB|nameAB|smilesA|smilesB|smilesAB|actA|actB|dAct|dist|disparity'
  for i, j in pairs:
    if data[i][2] < data[j][2]:
      i, j = j, i
    nmi, smii, propi, fpi = data[i]
    nmj, smij, propj, fpj = data[j]
    dAct = propi - propj
    dist = 1. - DataStructs.DiceSimilarity(fpi, fpj)
示例#9
0
    def test10BulkOps3(self):
        """Bulk similarity functions must agree with their pairwise versions.

        Ten random 10000-bit vectors are stored in a numpy object array; for
        each metric the bulk result against ``bvs[0]`` is compared element by
        element with the single-pair function. The Tversky checks also
        verify that weights (1, 1) reproduce Tanimoto and (.5, .5)
        reproduce Dice.
        """
        nbits = 10000
        bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
        for bvi in range(10):
            bv = DataStructs.ExplicitBitVect(nbits)
            for _ in range(nbits):
                bv.SetBit(random.randrange(0, nbits))
            bvs[bvi] = bv

        # (bulk function, matching pairwise function)
        paired_metrics = (
            (DataStructs.BulkTanimotoSimilarity,
             DataStructs.TanimotoSimilarity),
            (DataStructs.BulkDiceSimilarity, DataStructs.DiceSimilarity),
            (DataStructs.BulkAllBitSimilarity, DataStructs.AllBitSimilarity),
            (DataStructs.BulkOnBitSimilarity, DataStructs.OnBitSimilarity),
            (DataStructs.BulkRogotGoldbergSimilarity,
             DataStructs.RogotGoldbergSimilarity),
        )
        for bulk_fn, pair_fn in paired_metrics:
            sims = bulk_fn(bvs[0], bvs)
            for i in range(len(bvs)):
                self.assertTrue(feq(pair_fn(bvs[0], bvs[i]), sims[i]))

        # (alpha, beta, metric the Tversky result must coincide with)
        tversky_cases = (
            (1, 1, DataStructs.TanimotoSimilarity),
            (.5, .5, DataStructs.DiceSimilarity),
        )
        for alpha, beta, equivalent_fn in tversky_cases:
            sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, alpha, beta)
            for i in range(len(bvs)):
                sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], alpha,
                                                    beta)
                self.assertTrue(feq(sim, sims[i]))
                sim = equivalent_fn(bvs[0], bvs[i])
                self.assertTrue(feq(sim, sims[i]))
示例#10
0
 def __compute_diversity(mol, fps):
     """Return the mean Tanimoto distance between mol and each entry of fps.

     mol is encoded as a 2048-bit Morgan fingerprint (radius 4); fps is the
     collection of fingerprints it is compared against.
     """
     query_fp = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(
         mol, 4, nBits=2048)
     # returnDistance=True requests distances instead of similarities.
     distances = DataStructs.BulkTanimotoSimilarity(
         query_fp, fps, returnDistance=True)
     return np.mean(distances)
示例#11
0
    def test7MultiFPBReaderContains(self):
        """GetContainingNeighbors over four FPB readers, with and without threads.

        Four readers are added to a MultiFPBReader (AddReader must return the
        running reader count), then a fixed query is searched and the nine
        expected result pairs are checked for both the default call and
        ``numThreads=4``.
        """
        basen = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs',
                             'testData')
        mfpbr = DataStructs.MultiFPBReader()
        fpb_files = (
            "zinc_random200.1.patt.fpb",
            "zinc_random200.2.patt.fpb",
            "zinc_random200.3.patt.fpb",
            "zinc_random200.4.patt.fpb",
        )
        for expected_count, fpb_file in enumerate(fpb_files, 1):
            reader = DataStructs.FPBReader(os.path.join(basen, fpb_file))
            self.assertEqual(mfpbr.AddReader(reader), expected_count)
        mfpbr.Init()
        self.assertEqual(mfpbr.GetNumBits(), 1024)
        self.assertEqual(len(mfpbr), 4)

        fps = ("40081010824820021000500010110410003000402b20285000a4040240010030050000"
               "080001420040009000003d04086007080c03b31d920004220400074008098010206080"
               "00488001080000c64002a00080000200024c2000602410049200340820200002400010"
               "02200106090401056801080182006088101000088a0048")
        ebv = DataStructs.CreateFromFPSText(fps)
        query = DataStructs.BitVectToBinaryText(ebv)

        # Expected (nbrs[k][0], nbrs[k][1]) for k = 0..8.
        expected = [
            (160, 0),
            (163, 0),
            (170, 0),
            (180, 2),
            (182, 3),
            (185, 0),
            (189, 0),
            (192, 3),
            (193, 0),
        ]

        # Same expectations for the default call and the multi-threaded one.
        for extra_args in ({}, {'numThreads': 4}):
            nbrs = mfpbr.GetContainingNeighbors(query, **extra_args)
            self.assertEqual(len(nbrs), 9)
            for pos, (first, second) in enumerate(expected):
                self.assertEqual(nbrs[pos][0], first)
                self.assertEqual(nbrs[pos][1], second)
示例#12
0
 def get_prep_value(self, value):
     """Convert an ExplicitBitVect to the form handed to the db driver.

     Values that are not ExplicitBitVect instances are returned unchanged.
     """
     if not isinstance(value, ExplicitBitVect):
         return value
     # Serialise the bit vector and wrap the bytes in a memoryview.
     binary_text = DataStructs.BitVectToBinaryText(value)
     return six.memoryview(binary_text)
示例#13
0
    def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
        """Push a molecule's latent vector along the property gradient.

        The molecule is encoded, its latent mean is moved ``num_iter`` times
        by ``lr`` times the gradient of ``self.propNN``, and every
        intermediate vector is recorded. A binary search over the recorded
        steps then looks for a late step whose decoded molecule still has a
        Tanimoto similarity (Morgan fingerprint, radius 2) of at least
        ``sim_cutoff`` to the input.

        Args:
            smiles: SMILES string of the starting molecule.
            sim_cutoff: Minimum Tanimoto similarity the result must keep.
            lr: Step size applied to the gradient.
            num_iter: Number of gradient steps to take.

        Returns:
            ``(new_smiles, similarity)`` when a decodable step satisfying the
            cutoff is found, otherwise ``(smiles, 1.0)``.
        """
        mol_tree = MolTree(smiles)
        mol_tree.recover()
        _, tree_vec, mol_vec = self.encode([mol_tree])

        mol = Chem.MolFromSmiles(smiles)
        fp1 = AllChem.GetMorganFingerprint(mol, 2)

        tree_mean = self.T_mean(tree_vec)
        tree_log_var = -torch.abs(
            self.T_var(tree_vec))  #Following Mueller et al.
        mol_mean = self.G_mean(mol_vec)
        mol_log_var = -torch.abs(
            self.G_var(mol_vec))  #Following Mueller et al.
        mean = torch.cat([tree_mean, mol_mean], dim=1)
        # The log-variance is computed but not used below; only the mean
        # seeds the search.
        log_var = torch.cat([tree_log_var, mol_log_var], dim=1)
        cur_vec = create_var(mean.data, True)

        # Gradient ascent on the predicted property, recording each step.
        visited = []
        for step in range(num_iter):
            prop_val = self.propNN(cur_vec).squeeze()
            grad = torch.autograd.grad(prop_val, cur_vec)[0]
            cur_vec = cur_vec.data + lr * grad.data
            cur_vec = create_var(cur_vec, True)
            visited.append(cur_vec)

        # No steps were taken (num_iter <= 0): nothing to decode.
        if not visited:
            return smiles, 1.0

        l, r = 0, num_iter - 1
        while l < r - 1:
            # Floor division keeps mid an int so it can index `visited`
            # (true division would yield a float on Python 3).
            mid = (l + r) // 2
            new_vec = visited[mid]
            tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
            new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
            if new_smiles is None:
                r = mid - 1
                continue

            new_mol = Chem.MolFromSmiles(new_smiles)
            fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
            sim = DataStructs.TanimotoSimilarity(fp1, fp2)
            if sim < sim_cutoff:
                r = mid - 1
            else:
                l = mid

        # Re-decode the step the search settled on and validate it.
        tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
        new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
        if new_smiles is None:
            return smiles, 1.0
        new_mol = Chem.MolFromSmiles(new_smiles)
        fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        if sim >= sim_cutoff:
            return new_smiles, sim
        else:
            return smiles, 1.0
示例#14
0
 def from_db_value(self, value, expression, connection, context):
     """Rebuild a bit vector from the binary value read from the database.

     A None value is passed through as None; expression, connection and
     context are not used.
     """
     if value is not None:
         return DataStructs.CreateFromBinaryText(bytes(value))
     return None
示例#15
0
# sort canonical SMILES by similarity
print("\n".join(f7(canonical_SMILES)), end="")
import sys
sys.exit()
# NOTE: everything below is unreachable because of the sys.exit() above.
canonical_SMILES = sorted(set(canonical_SMILES))  # remove duplicates
N = len(canonical_SMILES)
fingerprints = [
    FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smiles))
    for smiles in canonical_SMILES
]

# Pairwise similarity matrix; the diagonal is -1 so that a molecule is never
# selected as its own nearest neighbour.
matrice = [[0 for __ in range(N)] for _ in range(N)]
for i in range(N):
    for j in range(N):
        matrice[i][j] = (-1 if i == j else DataStructs.FingerprintSimilarity(
            fingerprints[i], fingerprints[j]))

# Greedy chain: emit the current molecule, hop to its most similar remaining
# neighbour, then mask the emitted molecule's column in every row.
result = []
current = N - 1  # start with a last one, for no reason
for _step in range(N):
    result.append(canonical_SMILES[current])
    row = matrice[current]
    next_index = row.index(max(row))
    for other_row in matrice:
        other_row[current] = -1
    current = next_index

print("\n".join(result), end="")
示例#16
0
def ClusterOnFingerprint(filename, mols=None, fingerprint=0, cutoff=0.8, metric='Tanimoto', outMatrix=False):
    '''Cluster structures, or build a similarity matrix, from RDKit fingerprints.

    filename: SMILES file holding the molecules. If falsy, the given "mols" are used.
    mols: Input molecules. Ignored when "filename" is given.
    fingerprint: Fingerprint to use:
        0 or anything else: RDKit Topological Fingerprint
        1: MACCS Fingerprint
        2: Atom Pair Fingerprint (AP)
        3: Topological Torsion Fingerprint (TT)
        4: Morgan Fingerprint similar to ECFP4 Fingerprint
        5: Morgan Fingerprint similar to FCFP4 Fingerprint
    cutoff: Similarity cutoff; "1 - cutoff" is passed to ClusterFps.
    metric: Similarity metric for the matrix output (case-insensitive):
            Tanimoto, Dice, Cosine, Sokal, Russel, RogotGoldberg, AllBit,
            Kulczynski, McConnaughey, Asymmetric, BraunBlanquet.
            An unknown name prints a warning and falls back to Tanimoto.
            The clustering branch always asks ClusterFps for 'Tanimoto'.
    outMatrix: When true, return the similarity matrix instead of clusters.
    Return:
        outMatrix true: the similarity matrix as a list of row lists
            (also printed, one row per line).
        otherwise "clusters, clusterOut":
            clusters: Clusters containing molecule number.
            clusterOut: 1-based cluster number for each molecule, in input order.
    '''

    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions

    if filename:
        suppl = Chem.SmilesMolSupplier(filename)
        mols = [mol for mol in suppl]
    molnums = len(mols)

    ### Calculate Molecular Fingerprint
    ## MACCS Fingerprint
    if fingerprint == 1:
        fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
    ## Atom Pair Fingerprint (AP)
    elif fingerprint == 2:
        fps = [Pairs.GetAtomPairFingerprint(mol) for mol in mols]
    ## Topological Torsion Fingerprint (TT)
    elif fingerprint == 3:
        fps = [Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) for mol in mols]
    ## Morgan Fingerprint similar to ECFP4 Fingerprint
    elif fingerprint == 4:
        fps = [AllChem.GetMorganFingerprint(mol, 2) for mol in mols]
    ## Morgan Fingerprint similar to FCFP4 Fingerprint
    elif fingerprint == 5:
        fps = [AllChem.GetMorganFingerprint(mol, 2, useFeatures=True) for mol in mols]
    ## RDKit Topological Fingerprint
    else:  # fingerprint == 0
        fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    if outMatrix:
        ### Output the Fingerprint similarity Matrix
        # Keys are all lower-case because the lookup uses metric.lower().
        metricsAvailable = {
            "tanimoto": DataStructs.TanimotoSimilarity,
            "dice": DataStructs.DiceSimilarity,
            "cosine": DataStructs.CosineSimilarity,
            "sokal": DataStructs.SokalSimilarity,
            "russel": DataStructs.RusselSimilarity,
            "rogotgoldberg": DataStructs.RogotGoldbergSimilarity,
            "allbit": DataStructs.AllBitSimilarity,
            "kulczynski": DataStructs.KulczynskiSimilarity,
            "mcconnaughey": DataStructs.McConnaugheySimilarity,
            "asymmetric": DataStructs.AsymmetricSimilarity,
            "braunblanquet": DataStructs.BraunBlanquetSimilarity,
        }

        metricKey = metric.lower()
        if metricKey not in metricsAvailable:
            print("The given metric is unknown!")
            metricKey = 'tanimoto'
        simMetrics = metricsAvailable[metricKey]

        ### Calculate Fingerprint similarity Matrix
        # Build each row separately; "[[0.0]*n]*n" would alias one row n times.
        simdm = [[0.0] * molnums for _ in range(molnums)]
        for i in range(molnums):
            simdm[i][i] = 1.0
            for j in range(i + 1, molnums):
                simdm[i][j] = DataStructs.FingerprintSimilarity(fps[i], fps[j], metric=simMetrics)
                simdm[j][i] = DataStructs.FingerprintSimilarity(fps[j], fps[i], metric=simMetrics)

        for row in simdm:
            print(' '.join('%3.2f' % value for value in row))
        return simdm

    else:
        clusters = ClusterFps(fps, cutoff=1 - cutoff, metric='Tanimoto')
        clusterOut = [0] * len(mols)
        # Cluster ids are 1-based, in the order ClusterFps returns them.
        for clusterID, cluster in enumerate(clusters, 1):
            for idx in cluster:
                clusterOut[idx] = clusterID
        return clusters, clusterOut
示例#17
0
 def fp_distance(i, j):
     """Return the Dice distance (1 - similarity) between fps[i] and fps[j]."""
     dice = DataStructs.DiceSimilarity(fps[i], fps[j])
     return 1 - dice
 indexes = picker.LazyPick(fp_distance, n_fps, n_pick, seed=seed)
示例#18
0
def _parse_valid(smiles_list):
    # Parse each SMILES and keep only the molecules that parsed.
    parsed = [Chem.MolFromSmiles(s) for s in smiles_list]
    return [x for x in parsed if x is not None]


def _morgan_bits(mol_list):
    # 2048-bit Morgan fingerprints with radius 3.
    return [AllChem.GetMorganFingerprintAsBitVect(x, 3, 2048) for x in mol_list]


true_mols = _parse_valid(true_mols)
true_fps = _morgan_bits(true_mols)

pred_mols = _parse_valid(pred_mols)
pred_fps = _morgan_bits(pred_mols)

# Number of predictions whose best Tanimoto match among the true molecules
# reaches 0.4.
fraction_similar = sum(
    1 for fp in pred_fps
    if max(DataStructs.BulkTanimotoSimilarity(fp, true_fps)) >= 0.4)

print('novelty:', 1 - fraction_similar / len(pred_mols))

# Sum of Tanimoto similarities over every unordered pair of predictions.
similarity = sum(
    sum(DataStructs.BulkTanimotoSimilarity(pred_fps[i], pred_fps[:i]))
    for i in range(len(pred_fps)))

n = len(pred_fps)
n_pairs = n * (n - 1) / 2
diversity = 1 - similarity / n_pairs
print('diversity:', diversity)
示例#19
0
 def setUp(self):
     """Open and initialise the FPB reader on the zim.head100 test file."""
     data_dir = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs',
                             'testData')
     self.dirname = data_dir
     self.filename = os.path.join(data_dir, 'zim.head100.fpb')
     # Store the reader on the instance before initialising it.
     self.fpbr = reader = DataStructs.FPBReader(self.filename)
     reader.Init()
示例#20
0
    def testBitVectorMaxMin2(self):
        """LazyBitVectorPick must return the same picks with and without useCache=False.

        One hundred 20-bit vectors are built from bit strings and five are
        picked; the default call is checked against a fixed expected list
        and against the ``useCache=False`` call.
        """
        bit_strings = [
            "11110010101000000000", "00000000000010010000",
            "11001010000000000001", "00100110101000001000",
            "01010110000100011001", "11000110101001000011",
            "00000000001100001111", "00011110110000001101",
            "00000011011110100010", "11000010110001000000",
            "00000100010000010000", "10000001000010110010",
            "00010010000000010100", "00011100100110101000",
            "10001001100110100000", "10000110100110010000",
            "00101110000101000000", "11011101100011100000",
            "10000110000100101000", "00101000100000010001",
            "01000001000010000000", "00101101010100000110",
            "10001000100110110001", "00011000010100000001",
            "00101000001000100011", "00010000100010011001",
            "01100001000100010001", "10000101000001101101",
            "00001000011001011000", "11110000100100100000",
            "10100110000000011010", "00110100010110010010",
            "00000000000001010010", "00100000000010100001",
            "11110011000010001000", "10110001010100001000",
            "00001100100110011011", "00010010100100001110",
            "10100101100010100010", "01100100010100000001",
            "10101110011100000000", "01011000000001000001",
            "00000011100110100010", "01100001010001001001",
            "00001000000001001100", "10011001110000000100",
            "10110000001001100100", "00011000000001001011",
            "11001011010001100010", "10010000000001001011",
            "00010000100111100000", "00001000001110001000",
            "11010000010001100110", "01101001100000111000",
            "01001000001110111000", "10000000000100010010",
            "11001000010010000000", "01010010000100110001",
            "00010001010100100001", "01110010000000010000",
            "10001010000011000001", "00000110000000100100",
            "00010000010001000000", "11101100011010000011",
            "00000010100001010001", "00010000110010000101",
            "00010001001000111001", "01000010001100100110",
            "00110110000000100001", "00100010010110110010",
            "01000000110011001111", "00011000001000110010",
            "01111010101000110100", "00001010000010110110",
            "00110011000011011010", "00111010111010000110",
            "00010011101010000011", "00000001011000010000",
            "00011011101110110000", "00010001101000000001",
            "00010000001010011010", "00000010100100100010",
            "00000010001011000100", "11010000000001011100",
            "00001000110101000001", "00000010000000110010",
            "10000000010011000001", "11110110100100010000",
            "10001111000110001001", "00100110000110000100",
            "00000100100000100100", "00110000101100010100",
            "00001010100000100000", "01011000000011000111",
            "00010000100001010001", "10000010100000010000",
            "00001000000000110010", "00001000101011010001",
            "00011110000100100000", "11001001010001010100"
        ]
        n_pick = 5
        bitvects = [DataStructs.CreateFromBitString(bits) for bits in bit_strings]
        pool_size = len(bitvects)
        picker = rdSimDivPickers.MaxMinPicker()

        default_picks = picker.LazyBitVectorPick(bitvects, pool_size, n_pick)
        self.assertEqual(len(default_picks), n_pick)
        self.assertEqual(list(default_picks), [37, 1, 43, 38, 16])

        uncached_picks = picker.LazyBitVectorPick(bitvects, pool_size, n_pick,
                                                  useCache=False)
        self.assertEqual(len(uncached_picks), n_pick)
        self.assertEqual(list(default_picks), list(uncached_picks))
示例#21
0
    def test6MultiFPBReaderTani(self):
        """GetTanimotoNeighbors over four FPB readers, with and without threads.

        Four readers are added to a MultiFPBReader (AddReader must return the
        running reader count), then a fixed query is searched with
        ``threshold=0.6`` and the six expected result triples are checked
        for both the default call and ``numThreads=4``.
        """
        basen = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs',
                             'testData')
        mfpbr = DataStructs.MultiFPBReader()
        fpb_files = (
            "zinc_random200.1.patt.fpb",
            "zinc_random200.2.patt.fpb",
            "zinc_random200.3.patt.fpb",
            "zinc_random200.4.patt.fpb",
        )
        for expected_count, fpb_file in enumerate(fpb_files, 1):
            reader = DataStructs.FPBReader(os.path.join(basen, fpb_file))
            self.assertEqual(mfpbr.AddReader(reader), expected_count)
        mfpbr.Init()
        self.assertEqual(mfpbr.GetNumBits(), 1024)
        self.assertEqual(len(mfpbr), 4)

        fps = ("0000000000404000100000001000040000300040222000002004000240000020000000"
               "8200010200000090000024040860070044003214820000220401054008018000226000"
               "4800800140000042000080008008020482400000200410800000300430200800400000"
               "0000080a0000800400010c800200648818100010880040")
        ebv = DataStructs.CreateFromFPSText(fps)
        query = DataStructs.BitVectToBinaryText(ebv)

        # Expected (nbrs[k][0] to 4 places, nbrs[k][1], nbrs[k][2]) per hit.
        expected = [
            (0.66412, 0, 3),
            (0.65289, 1, 2),
            (0.64341, 2, 1),
            (0.61940, 1, 0),
            (0.61905, 0, 0),
            (0.61344, 0, 1),
        ]

        # The second pass tests multi-threaded search (won't do anything if
        # the RDKit isn't compiled with threads support).
        for extra_args in ({}, {'numThreads': 4}):
            nbrs = mfpbr.GetTanimotoNeighbors(query, threshold=0.6,
                                              **extra_args)
            self.assertEqual(len(nbrs), 6)
            for pos, (score, second, third) in enumerate(expected):
                self.assertAlmostEqual(nbrs[pos][0], score, 4)
                self.assertEqual(nbrs[pos][1], second)
                self.assertEqual(nbrs[pos][2], third)
示例#22
0
    def split(self,
              dataset,
              seed=None,
              frac_train=.8,
              frac_valid=.1,
              frac_test=.1,
              log_every_n=1000):
        """Split compounds into train/validation/test by fingerprint distance.

        Every entry of ``dataset.ids`` is parsed as SMILES (without
        sanitization) and fingerprinted; a full pairwise distance matrix
        (1 - fingerprint similarity) then drives a greedy selection that
        relies on ``self.update_distances`` to refresh the running distances.

        Args:
            dataset: Object supporting ``len()`` and exposing ``ids``.
            seed: Not used by this method.
            frac_train: Fraction of the compounds placed in the training set.
            frac_valid: Fraction of the compounds placed in the validation set.
            frac_test: Only used in the check that the fractions sum to 1.
            log_every_n: Not used by this method.

        Returns:
            ``(train_inds, valid_inds, test_inds)`` as lists of indices.
        """
        np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
        n_compounds = len(dataset)

        from rdkit import Chem
        from rdkit import DataStructs
        from rdkit.Chem.Fingerprints import FingerprintMols

        mols, fingerprints = [], []
        for smiles in dataset.ids:
            parsed = Chem.MolFromSmiles(smiles, sanitize=False)
            mols.append(parsed)
            fingerprints.append(FingerprintMols.FingerprintMol(parsed))

        # Full pairwise distance matrix.
        distances = np.ones(shape=(n_compounds, n_compounds))
        for row in range(n_compounds):
            for col in range(n_compounds):
                similarity = DataStructs.FingerprintSimilarity(
                    fingerprints[row], fingerprints[col])
                distances[row][col] = 1 - similarity

        train_cutoff = int(frac_train * n_compounds)
        valid_cutoff = int(frac_valid * n_compounds)

        # Pick the mol closest to everything as the first element of training
        first_train = np.argmin(np.sum(distances, axis=1))
        train_inds = [first_train]
        cur_distances = [float('inf')] * n_compounds
        self.update_distances(first_train, cur_distances, distances,
                              train_inds)
        for _ in range(1, train_cutoff):
            nearest = np.argmin(cur_distances)
            train_inds.append(nearest)
            self.update_distances(nearest, cur_distances, distances,
                                  train_inds)

        # Pick the closest mol from what is left
        index, best_dist = 0, float('inf')
        for candidate in range(n_compounds):
            if candidate not in train_inds:
                total = np.sum(distances[candidate])
                if total < best_dist:
                    index, best_dist = candidate, total
        valid_inds = [index]

        # Validation picks must avoid both training and earlier validation
        # indices, so track them together in a separate list.
        leave_out_indexes = train_inds + valid_inds
        cur_distances = [float('inf')] * n_compounds
        self.update_distances(index, cur_distances, distances,
                              leave_out_indexes)
        for _ in range(1, valid_cutoff):
            nearest = np.argmin(cur_distances)
            valid_inds.append(nearest)
            leave_out_indexes.append(nearest)
            self.update_distances(nearest, cur_distances, distances,
                                  leave_out_indexes)

        # Test is everything else
        test_inds = [
            idx for idx in range(n_compounds) if idx not in leave_out_indexes
        ]
        return train_inds, valid_inds, test_inds
示例#23
0
 def compute_similarity(self, fp_ref):
     """Store fp_ref and the similarity between it and self.fp; return the similarity."""
     self.fp_ref = fp_ref
     # Default metric of FingerprintSimilarity; reference first, own second.
     self.similarity = DataStructs.FingerprintSimilarity(self.fp_ref,
                                                         self.fp)
     score = self.similarity
     return score
示例#24
0
 def distance(i, j):
     """Return the Dice distance (1 - similarity) between fps[i] and fps[j]."""
     dice = DataStructs.DiceSimilarity(fps[i], fps[j])
     return 1 - dice
示例#25
0
 def test5FromBitString(self):
     """CreateFromBitString('1010') gives a 4-bit vector with bits 0 and 2 on."""
     s1 = '1010'
     bv = DataStructs.CreateFromBitString(s1)
     # assertEqual replaces the removed failUnless alias and reports both
     # values on failure.
     self.assertEqual(len(bv), 4)
     self.assertEqual(list(bv.GetOnBits()), [0, 2])
示例#26
0
def cluster_from_mol_list(mol_list,
                          cutoff=0.8,
                          fp="ecfp6",
                          activity_prop=None,
                          summary_only=True,
                          generate_cores=False,
                          align_to_core=False):
    """Cluster the input Mol_List with Butina clustering and print a size summary.

    Parameters:
        mol_list (tools.Mol_List): the input molecule list.
        cutoff (float): passed unchanged to Butina.ClusterData as the
            threshold on the ``1 - Tanimoto`` distances.
        fp (str): key into FPDICT selecting the fingerprint function.
        activity_prop: forwarded to add_cores when generate_cores is set.
        summary_only (bool): when true, only print the summary and return None.
        generate_cores (bool): when true, pass the result through add_cores.
        align_to_core: forwarded to add_cores when generate_cores is set.

    Returns:
        None when the fingerprint name is unknown or summary_only is true;
        otherwise a new Mol_List with the clustered molecules (largest
        cluster first), optionally extended by add_cores."""

    try:
        fp_func = FPDICT[fp]
    except KeyError:
        available = ", ".join(sorted(FPDICT.keys()))
        print(
            "Fingerprint {} not found. Available fingerprints are: {}".format(
                fp, available))
        return

    # generate the fingerprints
    fp_list = [fp_func(mol) for mol in mol_list]
    num_of_fps = len(fp_list)

    # distance values for each fingerprint against all earlier ones
    dists = []
    for i in range(1, num_of_fps):
        row_sims = DataStructs.BulkTanimotoSimilarity(fp_list[i], fp_list[:i])
        dists.extend(1 - sim for sim in row_sims)

    # now cluster the data:
    cluster_idx_list = Butina.ClusterData(dists,
                                          num_of_fps,
                                          cutoff,
                                          isDistData=True)

    # histogram: cluster size -> number of clusters of that size
    counter = Counter(len(cluster) for cluster in cluster_idx_list)
    print("    fingerprint:", fp)
    print("    clustersize  num_of_clusters")
    print("    ===========  ===============")
    for length in sorted(counter, reverse=True):
        print("        {:4d}            {:3d}".format(length, counter[length]))
    print()

    if summary_only:
        return None

    cluster_list = tools.Mol_List()

    # go over each list of indices, largest cluster first, numbering from 1
    by_size = sorted(cluster_idx_list, key=len, reverse=True)
    for cl_id, idx_list in enumerate(by_size, 1):
        cluster = get_mol_list_from_index_list(mol_list, idx_list, cl_id)
        # The first compound in a cluster is the representative
        cluster[0].SetProp("is_repr", "yes")
        cluster_list.extend(cluster)

    if generate_cores:
        cluster_list = add_cores(cluster_list, activity_prop, align_to_core)

    return cluster_list
示例#27
0
def _tanimoto_worker(k, fps):
    """Return Tanimoto distances from fps[k] to every later fingerprint.

    Returns:
        ``(distances, 0)`` where ``distances`` is a numpy array of
        ``1 - similarity`` for ``fps[k + 1:]``.
    """
    # pylint: disable=no-member
    later_fps = fps[(k + 1):]
    similarities = DataStructs.BulkTanimotoSimilarity(fps[k], later_fps)
    return np.array([1. - sim for sim in similarities]), 0
示例#28
0
 def taniFunc(i, j, bvs=vs):
   """Return 1 - FingerprintSimilarity for entries i and j of bvs."""
   return 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
示例#29
0
 def testHashedTopologicalTorsions(self):
     """Two differently written SMILES must give identical hashed torsion fingerprints."""
     fingerprints = [
         rdMD.GetHashedTopologicalTorsionFingerprint(Chem.MolFromSmiles(smi))
         for smi in ("c1ncccc1", "n1ccccc1")
     ]
     dice = DataStructs.DiceSimilarity(fingerprints[0], fingerprints[1])
     self.assertEqual(dice, 1.0)
示例#30
0
 def func(i, j, bvs=vs):
   """Return the Tanimoto distance between entries i and j of bvs."""
   # returnDistance=True makes TanimotoSimilarity report a distance.
   return DataStructs.TanimotoSimilarity(bvs[i], bvs[j], returnDistance=True)