def test__parallel_fingerprinter(self): fps_from_sgl = \ CircularFPFeaturizer(n_jobs=1, fp_mode="binary_folded", output_format="dense").fit_transform(self.smis) fps_from_par = \ CircularFPFeaturizer(n_jobs=4, fp_mode="binary_folded", output_format="dense").fit_transform(self.smis) self.assertEqual((self.n_mols, 2048), fps_from_par.shape) np.testing.assert_equal(fps_from_sgl, fps_from_par)
def test__determine_not_fitted_yet(self) -> None: fprinter = CircularFPFeaturizer(only_freq_subs=True) with self.assertRaises(NotFittedError): len(fprinter) with self.assertRaises(NotFittedError): fprinter.transform(self.mols) self.assertEqual(len(fprinter.fit(self.mols)), 86)
def test__string_output_format__only_freq_subs(self) -> None: fprintr = CircularFPFeaturizer(output_format="sparse_string", only_freq_subs=True) fps_str = fprintr.fit_transform(self.smis) # using SMILES # Output shape self.assertEqual(self.n_mols, len(fps_str)) # Fingerprint matrix structure for i, mol in enumerate(self.mols): for k in eval("{" + fps_str[i] + "}"): self.assertTrue(0 <= k < len(fprintr))
def test__count_and_filter_hashes__ints_vs_floats(self): d = [ {"A": 0, "B": 0, "C": 0, "D": 0}, {"A": 0 , "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, {"A": 0, "B": 0, "C": 0, "D": 0}, { "D": 0} ] # A = 3 / 8, B = 6 / 8 and C = 7 / 8 self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 1)[0]), 4) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 1.0)[0]), 1)
def test__error_when_parsing_smiles(self) -> None: with self.assertRaises(RuntimeError): CircularFPFeaturizer().fit_transform([ "O=C(O)C1OC(Oc2c(-c3ccc(O)c(O)c3)oc3cc(O)cc(O)c3c2=O)C(O)C(O)C1O", "Oc1cc(O)c2c(c1)OC1(c3ccc(O)c(O)c3)Oc3cc(O)c4c(c3C2C1O)OC(c1ccc(O)c(O)c1)C(O)C4", "COc1cc(O)c2c(=O)c(O)c(-c3ccc(O)c(O)c3)oc2c1", "CaC1asfOC(O)C(O)C(O)C1O"])
def test__string_output_format(self) -> None: fprintr = CircularFPFeaturizer(output_format="sparse_string") fps_str = fprintr.fit_transform(self.smis) # using SMILES # Output shape self.assertEqual(self.n_mols, len(fps_str)) # Fingerprint matrix structure for i, mol in enumerate(self.mols): fps_ref = GetMorganFingerprint(mol, radius=fprintr.radius, useFeatures=fprintr.use_features_, useChirality=fprintr.use_chirality, useCounts=fprintr.use_counts_) fp_i_from_str = eval("{" + fps_str[i] + "}") for hash, cnt in fps_ref.GetNonzeroElements().items(): self.assertEqual(fp_i_from_str[hash], cnt)
def test__to_dense_output(self) -> None: # Output to large to be converted to a dense matrix fpr_mat = CircularFPFeaturizer(fp_mode="binary_folded", output_format="dense", n_bits_folded=2048, max_n_bits_for_dense_output=2048).fit_transform(self.mols) self.assertFalse(isspmatrix_csr(fpr_mat)) self.assertTrue(isinstance(fpr_mat, np.ndarray)) # Output to large to be converted to a dense matrix fpr_mat = CircularFPFeaturizer(fp_mode="binary_folded", output_format="dense", n_bits_folded=2048, max_n_bits_for_dense_output=100).fit_transform(self.mols) self.assertTrue(isspmatrix_csr(fpr_mat)) self.assertFalse(isinstance(fpr_mat, np.ndarray)) # Save-guard works for hashed fingerprints fpr_mat = CircularFPFeaturizer(output_format="dense").fit_transform(self.mols) self.assertTrue(isspmatrix_csr(fpr_mat)) self.assertFalse(isinstance(fpr_mat, np.ndarray))
def test__string_output_format__binary(self) -> None: fprintr = CircularFPFeaturizer(output_format="sparse_string", fp_mode="binary_folded") fps_str = fprintr.fit_transform(self.smis) # using SMILES # Output shape self.assertEqual(self.n_mols, len(fps_str)) # Fingerprint matrix structure for i, mol in enumerate(self.mols): fps_ref = GetMorganFingerprintAsBitVect( mol, radius=fprintr.radius, useFeatures=fprintr.use_features_, useChirality=fprintr.use_chirality, nBits=fprintr.n_bits_ ) fp_i_from_str = eval("{" + fps_str[i] + "}") for idx in fps_ref.GetOnBits(): self.assertIn(idx, fp_i_from_str)
def test__hashed_counting_fingerprints__fcfp(self) -> None: fprintr = CircularFPFeaturizer(fp_type="FCFP") fps_mat_smi = fprintr.fit_transform(self.smis) # using SMILES fps_mat_mol = fprintr.fit_transform(self.mols) # using Mol objects # Output shape self.assertEqual(fps_mat_smi.shape[0], self.n_mols) self.assertEqual(fps_mat_smi.shape[1], fprintr.max_hash_value_) self.assertEqual(fps_mat_mol.shape[0], self.n_mols) self.assertEqual(fps_mat_mol.shape[1], fprintr.max_hash_value_) # Fingerprint matrix structure for i, mol in enumerate(self.mols): fps_ref = GetMorganFingerprint(mol, radius=fprintr.radius, useFeatures=fprintr.use_features_, useChirality=fprintr.use_chirality, useCounts=fprintr.use_counts_) for hash, cnt in fps_ref.GetNonzeroElements().items(): self.assertEqual(fps_mat_smi[i, hash], cnt) self.assertEqual(fps_mat_mol[i, hash], cnt)
def test__hashed_binary_fingerprints__ecfp(self) -> None: fprintr = CircularFPFeaturizer(fp_mode="binary") fps_mat_smi = fprintr.fit_transform(self.smis) # using SMILES fps_mat_mol = fprintr.fit_transform(self.mols) # using Mol objects # Output shape self.assertEqual(fps_mat_smi.shape[0], self.n_mols) self.assertEqual(fps_mat_smi.shape[1], fprintr.max_hash_value_) self.assertEqual(fps_mat_mol.shape[0], self.n_mols) self.assertEqual(fps_mat_mol.shape[1], fprintr.max_hash_value_) # Fingerprint matrix structure for i, mol in enumerate(self.mols): fps_ref = GetMorganFingerprint(mol, radius=fprintr.radius, useFeatures=fprintr.use_features_, useChirality=fprintr.use_chirality, useCounts=fprintr.use_counts_) for hash in fps_ref.GetNonzeroElements(): self.assertTrue(fps_mat_smi[i, hash]) self.assertTrue(fps_mat_mol[i, hash]) # No other elements are set self.assertEqual(np.sum(fps_mat_smi[i, :].data), len(fps_ref.GetNonzeroElements())) self.assertEqual(np.sum(fps_mat_mol[i, :].data), len(fps_ref.GetNonzeroElements()))
def test__folded_binary_fingerprints__ecfp(self) -> None: fprintr = CircularFPFeaturizer(fp_mode="binary_folded", n_bits_folded=512) fps_mat_smi = fprintr.fit_transform(self.smis) # using SMILES fps_mat_mol = fprintr.fit_transform(self.mols) # using Mol objects # Output shape self.assertEqual(fps_mat_smi.shape[0], self.n_mols) self.assertEqual(fps_mat_smi.shape[1], fprintr.n_bits_folded) self.assertEqual(fps_mat_mol.shape[0], self.n_mols) self.assertEqual(fps_mat_mol.shape[1], fprintr.n_bits_folded) # Fingerprint matrix structure for i, mol in enumerate(self.mols): fps_ref = GetMorganFingerprintAsBitVect(mol, radius=fprintr.radius, useFeatures=fprintr.use_features_, useChirality=fprintr.use_chirality, nBits=fprintr.n_bits_folded) on_bits = list(fps_ref.GetOnBits()) for j in range(fprintr.n_bits_folded): if j in on_bits: self.assertTrue(fps_mat_smi[i, j]) self.assertTrue(fps_mat_mol[i, j]) else: self.assertFalse(fps_mat_smi[i, j]) self.assertFalse(fps_mat_mol[i, j])
def test__train_with_frequent_substrset(self) -> None: # appears on ALL molecules self.assertEqual(len(CircularFPFeaturizer(only_freq_subs=True, min_subs_freq=1.0).fit(self.mols)), 0) # All data for fit and transform n_old = np.inf for freq in np.arange(0, 1.01, 0.01): fprinter = CircularFPFeaturizer(only_freq_subs=True, min_subs_freq=freq).fit(self.mols) # set of frequent pattern should get smaller when we require the patterns to be appear in more molecules n_new = len(fprinter) self.assertTrue(n_new <= n_old) n_old = n_new # Check dimension of transformed output fps_mat = fprinter.transform(self.mols) self.assertEqual(len(fprinter), fps_mat.shape[1]) # Check frequency of substructures in the output freq_hash_set_inv = {v: k for k, v in fprinter.freq_hash_set_.items()} for j in range(len(fprinter)): h = freq_hash_set_inv[j] self.assertTrue(np.sum(fps_mat[:, j].data > 0) / self.n_mols >= fprinter.hash_cnts_filtered_[h][1]) # Half of the data for fit and the other half for transform n_old = np.inf for freq in np.arange(0, 1.01, 0.01): fprinter = CircularFPFeaturizer(only_freq_subs=True, min_subs_freq=freq).fit(self.mols[:7]) # set of frequent pattern should get smaller when we require the patterns to be appear in more molecules n_new = len(fprinter) self.assertTrue(n_new <= n_old) n_old = n_new # Check dimension of transformed output fps_mat = fprinter.transform(self.mols[7:]) self.assertEqual(len(fprinter), fps_mat.shape[1])
def test__rdkit_parameters_are_correct(self) -> None: fprintr = CircularFPFeaturizer(fp_type="ECFP", fp_mode="count").fit(None) self.assertTrue(fprintr.use_counts_) self.assertFalse(fprintr.use_features_) fprintr = CircularFPFeaturizer(fp_type="FCFP", fp_mode="count").fit(None) self.assertTrue(fprintr.use_counts_) self.assertTrue(fprintr.use_features_) fprintr = CircularFPFeaturizer(fp_type="ECFP", fp_mode="binary").fit(None) self.assertFalse(fprintr.use_counts_) self.assertFalse(fprintr.use_features_) fprintr = CircularFPFeaturizer(fp_type="FCFP", fp_mode="binary").fit(None) self.assertFalse(fprintr.use_counts_) self.assertTrue(fprintr.use_features_) fprintr = CircularFPFeaturizer(fp_type="ECFP", fp_mode="binary_folded").fit(None) self.assertFalse(fprintr.use_counts_) self.assertFalse(fprintr.use_features_) fprintr = CircularFPFeaturizer(fp_type="FCFP", fp_mode="binary_folded").fit(None) self.assertFalse(fprintr.use_counts_) self.assertTrue(fprintr.use_features_)
def test__sklearn_clone(self): fprinter = CircularFPFeaturizer() _ = clone(fprinter)
def test__sklearn_get_params(self): fprinter = CircularFPFeaturizer() print(fprinter.get_params())
def test__count_and_filter_hashes__with_ints(self) -> None: d = [ {"A": 0, "B": 0, "C": 0, "D": 0}, {"A": 0 , "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, {"A": 0, "B": 0, "C": 0, "D": 0}, { "D": 0} ] # A = 3 / 8, B = 6 / 8 and C = 7 / 8 # Must appear in at least 1 time --> result should contain all keys self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 1)[0]), 4) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 1)[0]["A"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 1)[0]["B"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 1)[0]["C"], 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 1)[0]["D"], 3) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 2)[0]), 4) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2)[0].keys(), {"A", "B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2)[0]["A"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2)[0]["B"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2)[0]["C"], 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2)[0]["D"], 3) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 3)[0]), 4) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3)[0].keys(), {"A", "B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3)[0]["A"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3)[0]["B"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3)[0]["C"], 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3)[0]["D"], 3) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 4)[0]), 3) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4)[0].keys(), {"B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4)[0]["B"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4)[0]["C"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4)[0]["D"], 2) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 7)[0]), 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 7)[0].keys(), {"C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 7)[0]["C"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 7)[0]["D"], 1)
"COc1cc(C=Cc2cc(O)cc(OC3OC(CO)C(O)C(O)C3O)c2)ccc1O", "O=c1c(OC2OC(CO)C(O)C(O)C2O)c(-c2ccc(O)cc2)oc2cc(O)cc(O)c12", "CC1OC(OCC2OC(Oc3c(-c4ccc(O)cc4)oc4cc(O)cc(O)c4c3=O)C(O)C(O)C2O)C(O)C(O)C1O", "O=c1cc(-c2ccc(O)c(O)c2)oc2cc(OC3OC(CO)C(O)C(O)C3O)cc(O)c12", "COC(=O)c1cc(O)c(O)c(O)c1", "O=c1c(O)c(-c2cc(O)c(O)c(O)c2)oc2cc(O)cc(O)c12", "O=C1CC(c2ccc(O)cc2)Oc2cc(O)cc(O)c21", "O=C1CC(c2ccc(O)c(O)c2)c2c(cc(O)c3c2OC(c2ccc(O)c(O)c2)C(O)C3)O1", "Oc1ccc(C=Cc2cc(O)cc3c2C(c2cc(O)cc(O)c2)C(c2ccc(O)cc2)O3)cc1", "O=C1CC(c2ccc(O)cc2)Oc2cc(OC3OC(CO)C(O)C(O)C3O)cc(O)c21", "O=C(O)C=Cc1ccccc1O", "COc1cc(-c2oc3cc(O)cc4oc(=O)cc(c2OC2OC(CO)C(O)C(O)C2O)c34)cc(OC)c1O", "Oc1ccc(C2c3c(O)cc(O)cc3C3C(c4ccc(O)cc4)c4c(O)cc(O)cc4C23)cc1", "O=C(CCc1ccc(O)cc1)c1c(O)cc(O)cc1OC1OC(CO)C(O)C(O)C1O", "Oc1cc(O)cc(C=Cc2ccc(O)c(O)c2)c1", "Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2c1c(O)cc(O)c2c1OC(c1ccc(O)c(O)c1)C(O)C2"] # Get kernel matrix from fingerprints without substructure learning fps_mat = CircularFPFeaturizer(fp_mode="count").transform(smis) print("Is instance of 'csr_matrix': %d" % sp.isspmatrix_csr(fps_mat)) print(fps_mat.shape) times = timeit.repeat(lambda: _min_max_sparse_csr(fps_mat, fps_mat, n_jobs=4), number=1, repeat=3) print("min time:", np.min(times)) # Now with substructure learning fps_mat = CircularFPFeaturizer(fp_mode="count", only_freq_subs=True, output_format=True).fit_transform(smis) print("Is instance of 'csr_matrix': %d" % sp.isspmatrix_csr(fps_mat)) print(fps_mat.shape) times = timeit.repeat(lambda: _min_max_dense(fps_mat, fps_mat), number=1, repeat=3) print("min time:", np.min(times))
def test__count_and_filter_hashes(self) -> None: d = [ {"A": 0, "B": 0, "C": 0, "D": 0}, {"A": 0 , "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, { "B": 0, "C": 0, "D": 0}, {"A": 0, "B": 0, "C": 0, "D": 0}, { "D": 0} ] # A = 3 / 8, B = 6 / 8 and C = 7 / 8 # Must appear in at least 101% --> should result in an empty output self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 1.01)[0]), 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 1.01)[0], OrderedDict()) # Must appear in at least 0% self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 0)[0]), 4) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 0)[0].keys(), {"A", "B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 0)[0]["A"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 0)[0]["B"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 0)[0]["C"], 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 0)[0]["D"], 3) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 2 / 8)[0]), 4) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2 / 8)[0].keys(), {"A", "B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2 / 8)[0]["A"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2 / 8)[0]["B"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2 / 8)[0]["C"], 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 2 / 8)[0]["D"], 3) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 3 / 8)[0]), 4) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3 / 8)[0].keys(), {"A", "B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3 / 8)[0]["A"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3 / 8)[0]["B"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3 / 8)[0]["C"], 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 3 / 8)[0]["D"], 3) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 4 / 8)[0]), 3) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4 / 8)[0].keys(), {"B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4 / 8)[0]["B"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4 / 8)[0]["C"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 4 / 8)[0]["D"], 2) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 6 / 8)[0]), 3) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 6 / 8)[0].keys(), {"B", "C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 6 / 8)[0]["B"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 6 / 8)[0]["C"], 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 6 / 8)[0]["D"], 2) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 7 / 8)[0]), 2) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 7 / 8)[0].keys(), {"C", "D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 7 / 8)[0]["C"], 0) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 7 / 8)[0]["D"], 1) self.assertEqual(len(CircularFPFeaturizer._count_and_filter_hashes(d, 8 / 8)[0]), 1) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 8 / 8)[0].keys(), {"D"}) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes(d, 8 / 8)[0]["D"], 0) # Empty input self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes({}, 0)[0], OrderedDict()) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes({}, 0.5)[0], OrderedDict()) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes({}, 1)[0], OrderedDict()) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes({}, 0)[1], OrderedDict()) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes({}, 0.5)[1], OrderedDict()) self.assertEqual(CircularFPFeaturizer._count_and_filter_hashes({}, 1)[1], OrderedDict())