def test7FPS(self):
    """Round-trip an ExplicitBitVect through its FPS hex-text form.

    Checks that a 32-bit vector with bits 0, 1, 17, 23 and 31 set is written
    as "03008280", that parsing that text gives back an equal vector, that a
    nine-character hex string is rejected with ValueError, and that the empty
    string parses to a vector with zero bits.
    """
    original = DataStructs.ExplicitBitVect(32)
    for bit_index in (0, 1, 17, 23, 31):
        original.SetBit(bit_index)
    self.assertEqual(DataStructs.BitVectToFPSText(original), "03008280")

    parsed = DataStructs.CreateFromFPSText("03008280")
    self.assertEqual(original, parsed)

    # One hex digit too many: the text no longer describes whole bytes.
    with self.assertRaises(ValueError):
        DataStructs.CreateFromFPSText("030082801")

    parsed = DataStructs.CreateFromFPSText("")
    self.assertEqual(parsed.GetNumBits(), 0)
def testBitVectorMaxMin4(self):
    """Threshold tests for MaxMinPicker.LazyBitVectorPickWithThreshold.

    With a threshold of -1.0 the full 20 picks are returned together with a
    threshold of about 0.8977; with a threshold of 0.91 only the first 13 of
    those picks are returned and the reported threshold is at least 0.91.
    """
    # threshold tests
    fps_path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                            'chembl_cyps.head.fps')
    with open(fps_path) as infil:
        fps = [DataStructs.CreateFromFPSText(row.strip()) for row in infil]

    expected_picks = [
        374, 720, 690, 339, 875, 842, 404, 725, 120, 385,
        115, 868, 630, 881, 516, 497, 412, 718, 869, 407,
    ]
    picker = rdSimDivPickers.MaxMinPicker()

    ids, threshold = picker.LazyBitVectorPickWithThreshold(fps, len(fps), 20, -1.0, seed=42)
    self.assertEqual(list(ids), expected_picks)
    self.assertAlmostEqual(threshold, 0.8977, 4)

    ids, threshold = picker.LazyBitVectorPickWithThreshold(fps, len(fps), 20, 0.91, seed=42)
    # Same seed, so the truncated run is a prefix of the unrestricted one.
    self.assertEqual(list(ids), expected_picks[:13])
    self.assertTrue(threshold >= 0.91)
def testBitVectorMaxMin3(self):
    """Check MaxMinPicker.LazyBitVectorPick with and without firstPicks.

    Seeding the picker with the first five results of an unseeded run (via
    firstPicks) is expected to reproduce the same 20 picks.
    """
    fps_path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                            'chembl_cyps.head.fps')
    with open(fps_path) as infil:
        fps = [DataStructs.CreateFromFPSText(row.strip()) for row in infil]

    expected_picks = [
        374, 720, 690, 339, 875, 842, 404, 725, 120, 385,
        115, 868, 630, 881, 516, 497, 412, 718, 869, 407,
    ]
    picker = rdSimDivPickers.MaxMinPicker()

    unseeded = picker.LazyBitVectorPick(fps, len(fps), 20, seed=42)
    self.assertEqual(list(unseeded), expected_picks)

    seeded = picker.LazyBitVectorPick(fps, len(fps), 20, firstPicks=[374, 720, 690, 339, 875],
                                      seed=42)
    self.assertEqual(list(seeded), expected_picks)
def test9MultiFPBReaderEdges(self):
    """A MultiFPBReader with no readers added returns no Tanimoto neighbors.

    The reader is initialised without any AddReader() call and then queried
    with a fingerprint; the search is expected to return an empty result
    rather than fail.
    """
    mfpbr = DataStructs.MultiFPBReader()
    mfpbr.Init()
    fps = ("0000000000404000100000001000040000300040222000002004000240000020000000"
           "8200010200000090000024040860070044003214820000220401054008018000226000"
           "4800800140000042000080008008020482400000200410800000300430200800400000"
           "0000080a0000800400010c800200648818100010880040")
    ebv = DataStructs.CreateFromFPSText(fps)
    # Named query_bytes (not `bytes`) so the builtin type is not shadowed.
    query_bytes = DataStructs.BitVectToBinaryText(ebv)
    nbrs = mfpbr.GetTanimotoNeighbors(query_bytes, threshold=0.6)
    self.assertEqual(len(nbrs), 0)
def test8MultiFPBReaderContainsInitOnSearch(self):
    """Containment search on a MultiFPBReader built with initOnSearch=True.

    Four FPB files are added without an explicit Init() call, then
    GetContainingNeighbors is run with four threads and the nine expected
    hits are checked in order.
    """
    basen = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs', 'testData')
    mfpbr = DataStructs.MultiFPBReader(initOnSearch=True)

    fpb_names = (
        "zinc_random200.1.patt.fpb",
        "zinc_random200.2.patt.fpb",
        "zinc_random200.3.patt.fpb",
        "zinc_random200.4.patt.fpb",
    )
    # AddReader returns the running count of readers held.
    for reader_count, fpb_name in enumerate(fpb_names, 1):
        reader = DataStructs.FPBReader(os.path.join(basen, fpb_name))
        self.assertEqual(mfpbr.AddReader(reader), reader_count)

    fps = ("40081010824820021000500010110410003000402b20285000a4040240010030050000"
           "080001420040009000003d04086007080c03b31d920004220400074008098010206080"
           "00488001080000c64002a00080000200024c2000602410049200340820200002400010"
           "02200106090401056801080182006088101000088a0048")
    ebv = DataStructs.CreateFromFPSText(fps)
    query_bytes = DataStructs.BitVectToBinaryText(ebv)
    nbrs = mfpbr.GetContainingNeighbors(query_bytes, numThreads=4)
    self.assertEqual(len(nbrs), 9)

    # Expected (nbrs[i][0], nbrs[i][1]) pairs; presumably (fingerprint index,
    # reader index) -- TODO confirm against the MultiFPBReader documentation.
    expected_hits = (
        (160, 0),
        (163, 0),
        (170, 0),
        (180, 2),
        (182, 3),
        (185, 0),
        (189, 0),
        (192, 3),
        (193, 0),
    )
    for pos, (first_field, second_field) in enumerate(expected_hits):
        self.assertEqual(nbrs[pos][0], first_field)
        self.assertEqual(nbrs[pos][1], second_field)
def testBitVectorLeader1(self):
    """Threshold tests for LeaderPicker.LazyBitVectorPick.

    For each run the number of picks is checked, and every pair of picked
    fingerprints must be at least `thresh` apart in Tanimoto distance
    (1 - similarity). The last run additionally caps the result at
    pickSize=10.
    """
    # threshold tests
    fps_path = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap', 'test_data',
                            'chembl_cyps.head.fps')
    with open(fps_path) as infil:
        fps = [DataStructs.CreateFromFPSText(row.strip()) for row in infil]

    def assert_pairwise_distant(picked, min_distance):
        # Compare each pick against every earlier pick.
        for later in range(len(picked)):
            for earlier in range(later):
                self.assertGreaterEqual(
                    1 - DataStructs.TanimotoSimilarity(fps[picked[later]], fps[picked[earlier]]),
                    min_distance)

    picker = rdSimDivPickers.LeaderPicker()

    thresh = 0.8
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 146)
    assert_pairwise_distant(ids, thresh)

    thresh = 0.9
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 14)
    assert_pairwise_distant(ids, thresh)

    ids = picker.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
    self.assertEqual(len(ids), 10)
    assert_pairwise_distant(ids, thresh)
def test6MultiFPBReaderTani(self):
    """Tanimoto neighbor search across four FPB files in a MultiFPBReader.

    After adding four readers and calling Init(), the reader reports 1024
    bits and a length of 4. A search at threshold 0.6 must return the same
    six hits, in the same order, both single-threaded and with numThreads=4.
    """
    basen = os.path.join(RDConfig.RDBaseDir, 'Code', 'DataStructs', 'testData')
    mfpbr = DataStructs.MultiFPBReader()

    fpb_names = (
        "zinc_random200.1.patt.fpb",
        "zinc_random200.2.patt.fpb",
        "zinc_random200.3.patt.fpb",
        "zinc_random200.4.patt.fpb",
    )
    # AddReader returns the running count of readers held.
    for reader_count, fpb_name in enumerate(fpb_names, 1):
        reader = DataStructs.FPBReader(os.path.join(basen, fpb_name))
        self.assertEqual(mfpbr.AddReader(reader), reader_count)
    mfpbr.Init()
    self.assertEqual(mfpbr.GetNumBits(), 1024)
    self.assertEqual(len(mfpbr), 4)

    fps = ("0000000000404000100000001000040000300040222000002004000240000020000000"
           "8200010200000090000024040860070044003214820000220401054008018000226000"
           "4800800140000042000080008008020482400000200410800000300430200800400000"
           "0000080a0000800400010c800200648818100010880040")
    ebv = DataStructs.CreateFromFPSText(fps)
    query_bytes = DataStructs.BitVectToBinaryText(ebv)

    # Expected (nbrs[i][0], nbrs[i][1], nbrs[i][2]) triples; presumably
    # (similarity, fingerprint index, reader index) -- TODO confirm against
    # the MultiFPBReader documentation.
    expected_hits = (
        (0.66412, 0, 3),
        (0.65289, 1, 2),
        (0.64341, 2, 1),
        (0.61940, 1, 0),
        (0.61905, 0, 0),
        (0.61344, 0, 1),
    )

    def check_hits(found):
        self.assertEqual(len(found), 6)
        for pos, (similarity, second_field, third_field) in enumerate(expected_hits):
            self.assertAlmostEqual(found[pos][0], similarity, 4)
            self.assertEqual(found[pos][1], second_field)
            self.assertEqual(found[pos][2], third_field)

    nbrs = mfpbr.GetTanimotoNeighbors(query_bytes, threshold=0.6)
    check_hits(nbrs)

    # test multi-threaded (won't do anything if the RDKit isn't compiled with threads support)
    nbrs = mfpbr.GetTanimotoNeighbors(query_bytes, threshold=0.6, numThreads=4)
    check_hits(nbrs)
def transform(infile, outfile, outformat, delimiter, numericID, numlines, verbose, debug):
    """Convert an RDKit FPS fingerprint file into a bit-string file.

    Each non-comment line of `infile` is split on `delimiter`; the first
    field is parsed as FPS hex text and written to `outfile` as a bit string
    together with an identifier.

    Args:
        infile: path of the fingerprint file to read.
        outfile: path of the file to write (created or truncated).
        outformat: if truthy, write a tagged four-line record per fingerprint
            ("$SMI<C>", "PCN<id>", "FPRDK<bits>", "|"); otherwise write one
            "<bits> <id>" line per fingerprint.
        delimiter: field separator passed to str.split().
        numericID: first identifier to assign, incremented per fingerprint.
            The sentinel -99999999 means "take the identifier from the second
            field of each line" instead.
        numlines: progress-report interval used when `verbose` is set; a
            value of 0 disables progress messages.
        verbose: print start, progress and summary messages.
        debug: print start and summary messages, skipped comment lines and
            the length of each bit string.

    Returns:
        None.
    """
    nid = numericID
    ctr = 0

    # Context managers guarantee both files are flushed and closed, including
    # when a malformed fingerprint raises part-way through.
    with open(infile) as ins, open(outfile, 'w') as outs:
        # inform
        if debug or verbose:
            print("About to read rdkit fingerprint file {} and transform to bivector file ... {}"
                  .format(infile, outfile))

        # run
        for line in ins:
            # if comment line ignore
            if line.startswith('#'):
                if debug:
                    print(line)
                continue

            # Drop the line terminator (LF or CRLF) before splitting so the
            # last field never carries a stray '\n' or '\r'.
            line = line.rstrip('\r\n')
            chunky_line = line.split(delimiter)

            # get the fp needed
            fp = DataStructs.CreateFromFPSText(chunky_line[0])
            bs = fp.ToBitString()

            if nid != -99999999:
                pcn = nid
                nid += 1
            else:
                try:
                    pcn = chunky_line[1]
                except IndexError:
                    # Best effort: report the record and carry on.
                    print("Line " + str(ctr) + " missing id element...")
                    continue

            # write
            if outformat:
                outs.write("$SMI<C>\n")
                outs.write("PCN<" + str(pcn) + ">\n")
                outs.write("FPRDK<" + bs + ">\n")
                outs.write("|\n")
            else:
                outs.write(bs + " " + str(pcn) + "\n")

            if debug:
                print(len(bs))
            # Guard against numlines == 0, which would raise ZeroDivisionError.
            if verbose and numlines and ctr % numlines == 0:
                print("Processed {} fingerprints...".format(ctr))
            ctr += 1

    # inform
    if debug or verbose:
        print("\nRead/transformed/wrote {} fingerprints.".format(ctr))
    return