def SBDFP_Calc(DF, FP="ECFP4", FORMAT="RDKit"):
    """Compute the SB-DFP of a dataset against pre-computed reference bit counts.

    For every bit position, the number of fingerprints in ``DF`` with that bit
    set is compared with the reference count using a one-sided two-proportion
    z-test (``alternative='smaller'``: reference proportion < dataset
    proportion). The bit is set to 1 in the SB-DFP when the p-value is below
    0.01, otherwise 0.

    Args:
        DF: pandas DataFrame with a column named "ECFP4FP" or "MACCSFP"
            (depending on ``FP``) holding the fingerprints as RDKit objects.
            The original notes say it can be taken from LoadDatasetFromCSV.
        FP: "ECFP4" or "MACCS"; selects both the DataFrame column and the
            reference file ("ECFP4.counts" or "MACCS.counts", read from the
            current working directory). The file's first line must contain
            the comma-separated "1" bit counts of the reference set.
        FORMAT: "RDKit" to return an RDKit bit vector, "TEXT" to return the
            bit string.

    Returns:
        The SB-DFP as an RDKit object or as a string of "0"/"1" characters.
        Any other ``FORMAT`` value returns None (kept for backward
        compatibility).

    Raises:
        ValueError: if ``FP`` is not a supported fingerprint, ``DF`` has no
            rows, or the reference file lists more bits than the fingerprints
            in ``DF`` have.
    """
    # Fingerprint kind -> (DataFrame column, reference counts file).
    sources = {
        "ECFP4": ("ECFP4FP", "ECFP4.counts"),
        "MACCS": ("MACCSFP", "MACCS.counts"),
    }
    if FP not in sources:
        raise ValueError('FP must be "ECFP4" or "MACCS", got %r' % (FP,))
    column, counts_file = sources[FP]

    fp_texts = [DataStructs.BitVectToText(fp) for fp in DF[column]]
    if not fp_texts:
        raise ValueError("DF contains no fingerprints")

    # Per-bit number of fingerprints in DF that have the bit set.
    df_counts = [0] * len(fp_texts[0])
    for text in fp_texts:
        df_counts = [total + int(bit) for total, bit in zip(df_counts, text)]

    # Context manager so the reference file is closed even if parsing fails.
    with open(counts_file) as ref:
        ref_counts = [int(x) for x in ref.readline().split(",")]

    if len(ref_counts) > len(df_counts):
        raise ValueError(
            "%s lists %d bits but the fingerprints in DF have only %d"
            % (counts_file, len(ref_counts), len(df_counts))
        )

    # Number of observations behind the reference counts; presumably the size
    # of the reference compound set -- confirm against the *.counts files.
    n_ref = 15403690
    n_df = DF.shape[0]

    bits = []
    for ref_count, df_count in zip(ref_counts, df_counts):
        _, pval = proportions_ztest(
            [ref_count, df_count], [n_ref, n_df], alternative='smaller'
        )
        # A NaN p-value compares False and therefore yields "0".
        bits.append("1" if pval < 0.01 else "0")
    sbdfp = "".join(bits)

    if FORMAT == "RDKit":
        return DataStructs.CreateFromBitString(sbdfp)
    if FORMAT == "TEXT":
        return sbdfp
    return None
def DFP_Calc(DF, FP="ECFP4", FORMAT="RDKit"):
    """Compute the DFP of a dataset: the bits set in more than half of it.

    For every bit position, the fraction of fingerprints in ``DF`` with that
    bit set is computed; the bit is 1 in the DFP when the fraction is strictly
    greater than 0.5, otherwise 0.

    Args:
        DF: pandas DataFrame with a column named "ECFP4FP" or "MACCSFP"
            (depending on ``FP``) holding the fingerprints as RDKit objects.
            The original notes say it can be taken from LoadDatasetFromCSV.
        FP: "ECFP4" or "MACCS"; selects the DataFrame column to read.
        FORMAT: "RDKit" to return an RDKit bit vector, "TEXT" to return the
            bit string.

    Returns:
        The DFP as an RDKit object or as a string of "0"/"1" characters.
        Any other ``FORMAT`` value returns None (kept for backward
        compatibility).

    Raises:
        ValueError: if ``FP`` is not a supported fingerprint or ``DF`` has no
            rows.
    """
    columns = {"ECFP4": "ECFP4FP", "MACCS": "MACCSFP"}
    if FP not in columns:
        raise ValueError('FP must be "ECFP4" or "MACCS", got %r' % (FP,))

    fp_texts = [DataStructs.BitVectToText(fp) for fp in DF[columns[FP]]]
    if not fp_texts:
        raise ValueError("DF contains no fingerprints")

    # Per-bit number of fingerprints in DF that have the bit set.
    df_counts = [0] * len(fp_texts[0])
    for text in fp_texts:
        df_counts = [total + int(bit) for total, bit in zip(df_counts, text)]

    n_rows = DF.shape[0]
    dfp = "".join(
        "1" if float(count) / n_rows > 0.5 else "0" for count in df_counts
    )

    if FORMAT == "RDKit":
        return DataStructs.CreateFromBitString(dfp)
    if FORMAT == "TEXT":
        return dfp
    return None
def testPattern(ptfile, bit=False):
    """Reload stored fingerprints from an ``.npz`` archive and exercise them.

    Args:
        ptfile: path of the archive; it must contain the arrays ``'x'``
            (fingerprints) and ``'y'``.
        bit: when true, every entry of ``'x'`` is treated as a bit string and
            converted with ``DataStructs.CreateFromBitString``; otherwise the
            array is used as loaded.

    Returns:
        The fingerprints: a list of converted objects when ``bit`` is true,
        else the ``'x'`` array itself.
    """
    print('Validating fingerprint...')
    # NOTE(security): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code -- only use this on archives from a trusted source.
    # The context manager closes the archive; indexed arrays are already
    # fully read into memory and stay valid afterwards.
    with np.load(ptfile, allow_pickle=True) as data:
        fps = data['x']
        # Read only so that a missing 'y' array still fails here as before.
        _ = data['y']

    if bit:
        fp = [DataStructs.CreateFromBitString(z) for z in fps]
    else:
        fp = fps

    # The similarities are discarded; the call is kept because it presumably
    # serves as the check that the reloaded fingerprints are usable.
    DataStructs.BulkTanimotoSimilarity(fp[0], list(fp))
    return fp
def loadFingerprint(datadir, fpid):
    """Load the stored fingerprints for one entry of the ``fingerprint()`` table.

    The table entry ``fingerprint()[fpid]`` is indexed as: ``[0]`` base name of
    the ``.npz`` file inside ``datadir``, ``[1]`` fingerprint parameters,
    ``[2]`` flag telling whether the data is stored as bit strings, ``[3]``
    fingerprint function.

    Args:
        datadir: directory containing the ``<name>.npz`` archives.
        fpid: key into the ``fingerprint()`` table.

    Returns:
        Tuple ``(fp, fpn, fpparam, fpfun)``: the ``'x'`` array (converted to a
        list of ``DataStructs.CreateFromBitString`` results when the bit-string
        flag is set), the ``'y'`` array, and entries ``[1]`` and ``[3]`` of the
        table row.
    """
    fpi = fingerprint()[fpid]
    fpfile = os.path.join(datadir, fpi[0] + '.npz')
    # NOTE(security): allow_pickle=True unpickles object arrays -- only load
    # archives from a trusted source.
    # The context manager guarantees the archive is closed even when a lookup
    # fails; the arrays are fully read on access and stay valid afterwards.
    with np.load(fpfile, allow_pickle=True) as data:
        fp = data['x']
        fpn = data['y']
    fpparam = fpi[1]
    # Some fingerprints are stored as bit strings.
    # The explicit comparison is kept on purpose: the table contents are not
    # visible here, and plain truthiness would accept more values.
    if fpi[2] == True:
        fp = [DataStructs.CreateFromBitString(z) for z in fp]
    fpfun = fpi[3]
    return fp, fpn, fpparam, fpfun
def testBitVectorMaxMin2(self):
    # For a fixed seed, LazyBitVectorPick must return the expected picks and
    # give the same answer with and without its cache.
    bit_strings = [
        "11110010101000000000", "00000000000010010000", "11001010000000000001",
        "00100110101000001000", "01010110000100011001", "11000110101001000011",
        "00000000001100001111", "00011110110000001101", "00000011011110100010",
        "11000010110001000000", "00000100010000010000", "10000001000010110010",
        "00010010000000010100", "00011100100110101000", "10001001100110100000",
        "10000110100110010000", "00101110000101000000", "11011101100011100000",
        "10000110000100101000", "00101000100000010001", "01000001000010000000",
        "00101101010100000110", "10001000100110110001", "00011000010100000001",
        "00101000001000100011", "00010000100010011001", "01100001000100010001",
        "10000101000001101101", "00001000011001011000", "11110000100100100000",
        "10100110000000011010", "00110100010110010010", "00000000000001010010",
        "00100000000010100001", "11110011000010001000", "10110001010100001000",
        "00001100100110011011", "00010010100100001110", "10100101100010100010",
        "01100100010100000001", "10101110011100000000", "01011000000001000001",
        "00000011100110100010", "01100001010001001001", "00001000000001001100",
        "10011001110000000100", "10110000001001100100", "00011000000001001011",
        "11001011010001100010", "10010000000001001011", "00010000100111100000",
        "00001000001110001000", "11010000010001100110", "01101001100000111000",
        "01001000001110111000", "10000000000100010010", "11001000010010000000",
        "01010010000100110001", "00010001010100100001", "01110010000000010000",
        "10001010000011000001", "00000110000000100100", "00010000010001000000",
        "11101100011010000011", "00000010100001010001", "00010000110010000101",
        "00010001001000111001", "01000010001100100110", "00110110000000100001",
        "00100010010110110010", "01000000110011001111", "00011000001000110010",
        "01111010101000110100", "00001010000010110110", "00110011000011011010",
        "00111010111010000110", "00010011101010000011", "00000001011000010000",
        "00011011101110110000", "00010001101000000001", "00010000001010011010",
        "00000010100100100010", "00000010001011000100", "11010000000001011100",
        "00001000110101000001", "00000010000000110010", "10000000010011000001",
        "11110110100100010000", "10001111000110001001", "00100110000110000100",
        "00000100100000100100", "00110000101100010100", "00001010100000100000",
        "01011000000011000111", "00010000100001010001", "10000010100000010000",
        "00001000000000110010", "00001000101011010001", "00011110000100100000",
        "11001001010001010100",
    ]
    n_picks = 5
    expected_picks = [37, 1, 43, 38, 16]

    vectors = list(map(DataStructs.CreateFromBitString, bit_strings))
    pool_size = len(vectors)
    picker = rdSimDivPickers.MaxMinPicker()

    # Default call (no useCache argument).
    cached_picks = picker.LazyBitVectorPick(vectors, pool_size, n_picks, seed=42)
    self.assertEqual(len(cached_picks), n_picks)
    self.assertEqual(list(cached_picks), expected_picks)

    # Same pick with the cache switched off must agree with the first run.
    uncached_picks = picker.LazyBitVectorPick(
        vectors, pool_size, n_picks, useCache=False, seed=42
    )
    self.assertEqual(len(uncached_picks), n_picks)
    self.assertEqual(list(cached_picks), list(uncached_picks))
def test5FromBitString(self):
    # CreateFromBitString('1010') must give a 4-bit vector with bits 0 and 2 on.
    # assertEqual replaces failUnless, a deprecated alias that was removed in
    # Python 3.12; it also reports both values when the check fails.
    s1 = '1010'
    bv = DataStructs.CreateFromBitString(s1)
    self.assertEqual(len(bv), 4)
    self.assertEqual(list(bv.GetOnBits()), [0, 2])
def fpList_to_bit(fp_list):
    """Join the string items of ``fp_list`` and pass the result to
    ``DataStructs.CreateFromBitString``, returning what it builds."""
    bit_string = "".join(fp_list)
    return DataStructs.CreateFromBitString(bit_string)
def convert_to_fp(bitstring):
    """Return the object ``DataStructs.CreateFromBitString`` builds from
    ``bitstring``."""
    bit_vector = DataStructs.CreateFromBitString(bitstring)
    return bit_vector