def test_output_descriptor_duplicates(self):
    """Expect no descriptor duplicates for the reference-set CSV.

    Reads the reference CSV, fingerprints its ``smiles`` column, passes the
    result through ``output_processed_descriptors`` and asserts that
    ``output_descriptor_duplicates`` returns something of length zero.
    """
    self.defineConfig()
    reference_set = tuner.helper.read_csv(
        "tests/structure_preparation_test/reference_set.csv"
    )
    fingerprints = tuner.run_fingerprint(reference_set['smiles'], 1)
    processed = tuner.output_processed_descriptors(fingerprints, reference_set)
    duplicates = tuner.output_descriptor_duplicates(processed)
    self.assertEqual(len(duplicates), 0)
def test_output_descriptor_duplicates_ref_file_ecfp1(self):
    """Expect 19 descriptor duplicates for the duplicates reference file.

    The configuration is set up with ``fp=1`` (the fuzzier fingerprint
    variant). Each whitespace-stripped line of the reference file becomes
    one entry of the ``smiles`` column.
    """
    self.defineConfig(fp=1)
    with open(self.referenceFilePathDuplicates, "r") as handle:
        stripped_lines = list(map(str.strip, handle))
    frame = pd.DataFrame(stripped_lines, columns=["smiles"])
    fingerprints = tuner.run_fingerprint(frame['smiles'], 1)
    processed = tuner.output_processed_descriptors(fingerprints, frame)
    duplicates = tuner.output_descriptor_duplicates(processed)
    self.assertEqual(len(duplicates), 19)
def test_lsh_folding(self):
    """Compare LSH folding results against pickled reference results.

    Fingerprints are computed for the whitespace-stripped lines of the
    duplicates reference file. Both the output of
    ``calc_highest_entropy_bits`` and of ``run_lsh_calculation`` must equal
    the reference objects stored under ``unit_test/output/``.

    To regenerate the references, dump the freshly computed objects with
    ``to_pickle`` to the same two paths.
    """
    self.defineConfig()
    with open(self.referenceFilePathDuplicates, "r") as h:
        smiles = [line.strip() for line in h]
    structure_data = pd.DataFrame(smiles, columns=["smiles"])
    ecfp = tuner.run_fingerprint(structure_data['smiles'], 1)
    lsh_folding = tuner.LSHFolding()

    # Keep the original call order: entropy bits first, then the folds.
    df_high_entropy_bits = lsh_folding.calc_highest_entropy_bits(ecfp)
    df_high_entropy_bits_ref = pd.read_pickle(
        "unit_test/output/df_high_entropy_bits.pkl"
    )

    df_folds = lsh_folding.run_lsh_calculation(ecfp)
    df_folds_ref = pd.read_pickle("unit_test/output/df_folds.pkl")

    # Two separate assertions so a failure names the artifact that diverged.
    self.assertTrue(
        df_high_entropy_bits.equals(df_high_entropy_bits_ref),
        "highest-entropy bits differ from the pickled reference",
    )
    self.assertTrue(
        df_folds.equals(df_folds_ref),
        "LSH folds differ from the pickled reference",
    )