def predict(self, X):
    y_preds = []
    X_copy = X.copy()
    X_copy.index = range(len(X_copy))
    ecfp_transformer = ECFPEncoder(radius=self.radius,
                                   dim=self.ecfp_dim,
                                   sparse_output=True)
    sparse_ecfp = ecfp_transformer.transform(X_copy)
    for i, row in X_copy.iterrows():
        target_id = row['target_id']
        if target_id in self.store:
            # Known target: rank the compounds measured on this target by
            # cosine similarity and aggregate the activities of the k
            # nearest ones.
            known_ecfps = self.store[target_id][0]
            sim = cosine_similarity(known_ecfps, sparse_ecfp[i])[:, 0]
            sorted_indexes = np.argsort(sim, axis=0)
            ys = self.store[target_id][1][sorted_indexes[-self.k:]]
            if self.weights == 'uniform':
                y_preds.append(np.mean(ys))
            elif self.weights == 'average':
                # Weight each neighbour's activity by its similarity.
                y_preds.append(
                    np.average(ys, weights=sim[sorted_indexes[-self.k:]]))
            else:
                # Fail loudly rather than silently skipping a prediction,
                # which would misalign the output with the input rows.
                raise ValueError(
                    f"Unsupported weights option: {self.weights!r}")
        else:
            # Unknown target: fall back to the activity of the single most
            # similar compound across the whole training set.
            ecfp = ecfp_transformer.transform(row.to_frame().T)
            sim = cosine_similarity(ecfp, self.full_ecfp)[0]
            y_preds.append(self.full_y[np.argmax(sim)])
    return np.array(y_preds).reshape((len(y_preds), 1))
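# A minimal standalone sketch (illustration only, not part of the estimator)
# of the top-k selection used in predict above: np.argsort ranks the
# similarities in ascending order, so the last k indexes point at the k most
# similar neighbours. The toy numbers below are assumptions.
def _topk_similarity_demo():
    sim = np.array([0.1, 0.9, 0.4, 0.7])  # cosine similarities to the query
    ys = np.array([5.0, 6.0, 7.0, 8.0])   # activities of the stored compounds
    k = 2
    top_k = np.argsort(sim)[-k:]          # the 2 most similar: indexes [3, 1]
    uniform = np.mean(ys[top_k])                          # 'uniform' -> 7.0
    weighted = np.average(ys[top_k], weights=sim[top_k])  # 'average' -> 6.875
    return uniform, weighted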
def fit(self, X, y):
    y = y.reshape(-1)
    X.index = range(len(X))
    self.store = {}
    ecfp_transformer = ECFPEncoder(radius=self.radius,
                                   dim=self.ecfp_dim,
                                   sparse_output=True)
    # For each target seen during training, keep the fingerprints and the
    # activities of the compounds measured on it.
    for target_id, group in X.groupby('target_id'):
        self.store[target_id] = (ecfp_transformer.fit_transform(group),
                                 y[group.index])
    # Also keep the whole training set as a fallback for unseen targets.
    self.full_ecfp = ecfp_transformer.transform(X)
    self.full_y = y
    return self
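# Hypothetical usage of the estimator these two methods belong to. The class
# name KNNBaseline and its constructor signature are assumptions made for
# illustration; the data columns match the ones fit/predict read.
import numpy as np
import pandas as pd

train = pd.DataFrame({
    'standard_inchi': ["InChI=1S/CO2/c2-1-3"] * 4,
    'target_id': ['T1', 'T1', 'T2', 'T2'],
})
y_train = np.array([5.2, 6.1, 7.3, 4.8])

model = KNNBaseline(k=2, radius=2, ecfp_dim=2**20, weights='uniform')
model.fit(train, y_train)
preds = model.predict(train)  # shape (4, 1): one predicted activity per row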
class TestECFPEncoder(unittest.TestCase):
    transformer = ECFPEncoder(radius=4)

    def get_X(self):
        return pd.DataFrame([
            ["InChI=1S/CO2/c2-1-3"],
            [
                "InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-10(12)13/h2-6,11H,1H3,(H,12,13)/b5-3+"
            ]
        ], columns=['standard_inchi'])

    def test_transform(self):
        X = self.get_X()
        X_transformed = self.transformer.fit_transform(X)
        pd.testing.assert_frame_equal(
            X_transformed,
            pd.DataFrame([
                ["InChI=1S/CO2/c2-1-3", [633848, 899457, 899746, 916106]],
                [
                    "InChI=1S/C10H10O4/c1-14-9-6-7(2-4-8(9)11)3-5-10(12)13/h2-6,11H,1H3,(H,12,13)/b5-3+",
                    [
                        1773, 9728, 20034, 57369, 57588, 78979, 88049, 95516,
                        107971, 123721, 134214, 167638, 204359, 349540,
                        356383, 378749, 390288, 397092, 431546, 435051,
                        439248, 459409, 495384, 515018, 528633, 529834,
                        547430, 614225, 624875, 635687, 647863, 650023,
                        650051, 654006, 678945, 726962, 830972, 846213,
                        874176, 911985, 916106, 923641, 942272
                    ]
                ]
            ], columns=['standard_inchi', 'ecfp_encoding']))

    def test_transform_with_sparse_output(self):
        X = self.get_X()
        transformer = ECFPEncoder(radius=4, sparse_output=True)
        Xt = transformer.fit_transform(X)
        # Row and column indexes of the non-zero entries of the sparse
        # matrix: row 0 has 4 set bits, row 1 has 43.
        expected_nonzeros = (
            np.array([0] * 4 + [1] * 43, dtype=np.int32),
            np.array([
                633848, 899457, 899746, 916106, 1773, 9728, 20034, 57369,
                57588, 78979, 88049, 95516, 107971, 123721, 134214, 167638,
                204359, 349540, 356383, 378749, 390288, 397092, 431546,
                435051, 439248, 459409, 495384, 515018, 528633, 529834,
                547430, 614225, 624875, 635687, 647863, 650023, 650051,
                654006, 678945, 726962, 830972, 846213, 874176, 911985,
                916106, 923641, 942272
            ], dtype=np.int32))
        for i, elem in enumerate(Xt.nonzero()):
            np.testing.assert_array_equal(expected_nonzeros[i], elem)
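# A rough sketch, under assumptions, of how an encoder like ECFPEncoder could
# produce the bit indexes asserted above: fold RDKit Morgan fingerprint
# identifiers into a fixed-size space. Illustrative only; the project's
# actual hashing scheme may differ.
from rdkit import Chem
from rdkit.Chem import AllChem

def ecfp_indexes(inchi, radius=4, dim=2**20):
    """Return the sorted hashed bit indexes of a molecule's ECFP."""
    mol = Chem.MolFromInchi(inchi)
    fp = AllChem.GetMorganFingerprint(mol, radius)  # sparse count vector
    return sorted({bit_id % dim for bit_id in fp.GetNonzeroElements()})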
def get_steps(self, kmer_size=3, radius=2, ecfp_dim=2**10, embedding_dim=10,
              lr=0.1, max_epochs=5, device=None, train_split=None,
              optimizer=SGD, weight_decay=0, dropout=0):
    """
    This pipeline is a neural net baseline using sparse input fingerprints
    for both the compound (ECFP) and the enzyme (k-mers).

    :param kmer_size: The k-mer size used for the enzyme's descriptor
    :param radius: The radius used in ECFP
    :param ecfp_dim: The dimension of the bit space used by the ECFP algorithm
    :param embedding_dim: Both the enzyme and the compound are embedded by
        the neural net into spaces of this same size
    :param lr: The neural net's base learning rate
    :param max_epochs: Maximum number of epochs to run
    :param device: The device on which computation will take place
    :param train_split: If None, no internal cross-validation is made; else a
        skorch CVSplit object
    :param optimizer: The torch optimizer class to use
    :param weight_decay: The optimizer's weight decay (L2 penalty)
    :param dropout: The dropout rate applied in the network
    :return: The list of steps for a sklearn.pipeline.Pipeline
    """
    kmers_counter = KmersCounter(kmer_size=kmer_size)
    num_kmers = NB_AMINO_ACID**kmer_size
    collate_fn = partial(collate_to_sparse_tensors,
                         protein_input_size=num_kmers,
                         compound_input_size=ecfp_dim,
                         device=torch.device(device))
    net = NeuralNetRegressor(module=Baseline,
                             module__num_kmers=num_kmers,
                             module__num_fingerprints=ecfp_dim,
                             module__embedding_dim=embedding_dim,
                             module__dropout=dropout,
                             max_epochs=max_epochs,
                             lr=lr,
                             optimizer=optimizer,
                             optimizer__weight_decay=weight_decay,
                             device=device,
                             iterator_train__collate_fn=collate_fn,
                             iterator_train__shuffle=True,
                             iterator_valid__collate_fn=collate_fn,
                             train_split=train_split)
    return [('encode_proteins', kmers_counter),
            ('encode_ecfp', ECFPEncoder(radius=radius, dim=ecfp_dim)),
            ('to_dict',
             DfToDict({
                 'protein_input': 'kmers_counts',
                 'compound_input': 'ecfp_encoding'
             })),
            ('baseline_net', net)]
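# Hypothetical usage of the baseline pipeline above. get_steps returns a step
# list, so it is wrapped in a Pipeline here; `baseline`, X_train, y_train and
# X_test are assumptions for illustration.
from sklearn.pipeline import Pipeline

pipe = Pipeline(baseline.get_steps(kmer_size=3, radius=2, ecfp_dim=2**10,
                                   max_epochs=5, device='cpu'))
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)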
def get_steps(self, kmer_size=3, radius=2, ecfp_dim=2**10, alpha=0,
              device=None):
    # Concatenate the sparse k-mer counts of the enzyme with the sparse ECFP
    # of the compound, then fit a ridge regression on the union.
    return [('sparse_encoding',
             FeatureUnion(n_jobs=-1,
                          transformer_list=[
                              ('encode_proteins',
                               KmersCounter(kmer_size=kmer_size,
                                            sparse_output=True)),
                              ('encode_ecfp',
                               ECFPEncoder(radius=radius,
                                           dim=ecfp_dim,
                                           sparse_output=True))
                          ])),
            ('linear_regression', Ridge(alpha=alpha))]
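# Hypothetical tuning of the ridge baseline above. The step name
# 'linear_regression' matches the list returned by get_steps;
# `ridge_baseline`, X_train and y_train are assumptions for illustration.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline(ridge_baseline.get_steps(kmer_size=3, radius=2))
search = GridSearchCV(pipe,
                      param_grid={'linear_regression__alpha': [0.1, 1.0, 10.0]},
                      cv=3)
search.fit(X_train, y_train)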
def get_steps(self, kmer_size=3, radius=2, ecfp_dim=2**20, hidden_size=10,
              mlp_sizes=(10, ), embedding_dim=10, max_epochs=10, lr=1,
              optimizer=SGD, device=None, train_split=None, weight_decay=0,
              lstm_dropout=0):
    """
    This pipeline encodes the enzyme as a padded k-mer sequence fed to a
    BiLSTM and the compound as an ECFP, combined in the
    SiameseBiLSTMFingerprints module.

    :param kmer_size: The k-mer size used for the enzyme's descriptor
    :param radius: The radius used in ECFP
    :param ecfp_dim: The dimension of the bit space used by the ECFP algorithm
    :param hidden_size: The size of the BiLSTM's hidden state
    :param mlp_sizes: The sizes of the fully connected layers applied after
        the BiLSTM
    :param embedding_dim: The dimension of the k-mer and fingerprint
        embeddings
    :param max_epochs: Maximum number of epochs to run
    :param lr: The neural net's base learning rate
    :param optimizer: The torch optimizer class to use
    :param device: The device on which computation will take place
    :param train_split: If None, no internal cross-validation is made; else a
        skorch CVSplit object
    :param weight_decay: The optimizer's weight decay (L2 penalty)
    :param lstm_dropout: The dropout rate applied in the BiLSTM
    :return: The list of steps for a sklearn.pipeline.Pipeline
    """
    collate_fn = partial(collate_bilstm_fingerprint,
                         device=torch.device(device),
                         ecfp_dim=ecfp_dim)
    kmers_encoder = KmerEncoder(kmer_size=kmer_size, pad=True)
    net = NeuralNetRegressor(module=SiameseBiLSTMFingerprints,
                             # +1 leaves room for the padding index
                             module__num_kmers=kmers_encoder.dim + 1,
                             module__num_fingerprints=ecfp_dim,
                             module__embedding_dim=embedding_dim,
                             module__hidden_size=hidden_size,
                             module__mlp_sizes=mlp_sizes,
                             module__lstm_dropout=lstm_dropout,
                             max_epochs=max_epochs,
                             lr=lr,
                             optimizer=optimizer,
                             optimizer__weight_decay=weight_decay,
                             device=device,
                             iterator_train__shuffle=True,
                             iterator_train__collate_fn=collate_fn,
                             iterator_valid__collate_fn=collate_fn,
                             train_split=train_split)
    return [('encode_proteins', kmers_encoder),
            ('encode_ecfp',
             ECFPEncoder(radius=radius, dim=ecfp_dim, sparse_output=False)),
            ('to_dict',
             DfToDict({
                 'protein_input': 'kmers_encoding',
                 'compound_input': 'ecfp_encoding',
                 'protein_lengths': 'encoding_len'
             })),
            ('bilstm_fingerprint', net)]
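# Hypothetical usage of the BiLSTM pipeline above with an internal validation
# split. The skorch CVSplit import path and the `bilstm`, X_train and y_train
# names are assumptions for illustration.
from sklearn.pipeline import Pipeline
from skorch.dataset import CVSplit

pipe = Pipeline(bilstm.get_steps(kmer_size=3, radius=2, ecfp_dim=2**20,
                                 device='cuda', train_split=CVSplit(5)))
pipe.fit(X_train, y_train)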