def get_convmol_features(protein_pdb_file, ligand_pdb_file, data_dir='./data/pdbbind/v2018'):
    """Compute graph-convolution inputs for a protein/ligand complex.

    Both file names are resolved relative to ``data_dir``.

    Returns
    -------
    tuple
        ``(node_feat, deg_slice, membership, deg_adj_list)`` taken from the
        single-molecule agglomerated ``ConvMol`` of the complex.

    Raises
    ------
    IOError
        If either resolved .pdb path does not exist.
    """
    protein_path = os.path.join(data_dir, protein_pdb_file)
    ligand_path = os.path.join(data_dir, ligand_pdb_file)
    # Protein is checked first, then ligand, so the first missing file is reported.
    for pdb_path in (protein_path, ligand_path):
        if not os.path.exists(pdb_path):
            raise IOError(".pdb file not found in " + pdb_path)

    _, _, compl = get_molecules_from_pdb(protein_path, ligand_path)

    # Same steps as the default generator of
    # deepchem.models.graph_models.GraphConv, applied to one molecule.
    nodes, adj_list = build_graph_from_molecule(compl)
    convmol = ConvMol(nodes, adj_list)
    merged = convmol.agglomerate_mols([convmol])

    node_feat = merged.get_atom_features()
    deg_slice = merged.deg_slice
    membership = np.array(merged.membership)
    deg_adj_list = merged.get_deg_adjacency_lists()
    return (node_feat, deg_slice, membership, deg_adj_list)
def test_agglomerate_molecules(self):
    """Check that ConvMol.agglomerate_mols merges several molecules correctly.

    Three artificial molecules (3, 4 and 5 atoms, 4 features per atom) are
    merged; the atom/molecule counts, selected atom-feature rows and the
    per-degree adjacency lists of the result are verified.
    """
    # Each entry: (artificial atom-feature array, adjacency list).
    examples = [
        (np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
         [[1], [0, 2], [1]]),
        (np.array([[20, 21, 22, 23], [24, 25, 26, 27], [28, 29, 30, 31],
                   [32, 33, 34, 35]]),
         [[1, 2], [0, 3], [0, 3], [1, 2]]),
        (np.array([[40, 41, 42, 43], [44, 45, 46, 47], [48, 49, 50, 51],
                   [52, 53, 54, 55], [56, 57, 58, 59]]),
         [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]),
    ]
    molecules = [ConvMol(features, adjacency) for features, adjacency in examples]

    concat_mol = ConvMol.agglomerate_mols(molecules)

    assert concat_mol.get_num_atoms() == 12
    assert concat_mol.get_num_molecules() == 3

    # Spot-check rows of the merged feature matrix.
    atom_features = concat_mol.get_atom_features()
    expected_rows = [
        (0, [1, 2, 3, 4]),
        (2, [56, 57, 58, 59]),
        (11, [52, 53, 54, 55]),
        (4, [20, 21, 22, 23]),
    ]
    for row, expected in expected_rows:
        assert np.array_equal(atom_features[row, :], expected)

    deg_adj_lists = concat_mol.get_deg_adjacency_lists()
    # No atoms of degree 0
    assert np.array_equal(deg_adj_lists[0], np.zeros([0, 0]))
    # 3 atoms of degree 1
    assert np.array_equal(deg_adj_lists[1], [[3], [3], [11]])
    # 8 atoms of degree 2
    assert np.array_equal(
        deg_adj_lists[2],
        [[0, 1], [5, 6], [4, 7], [4, 7], [5, 6], [9, 10], [8, 11], [8, 11]])
    # 1 atom of degree 3
    assert np.array_equal(deg_adj_lists[3], [[9, 10, 2]])
    # 0 atoms of degree 4
    assert np.array_equal(deg_adj_lists[4], np.zeros([0, 4]))
    # 0 atoms of degree 5
    assert np.array_equal(deg_adj_lists[5], np.zeros([0, 5]))
def test_get_adjacency_list(self):
    """Check that get_adjacency_list returns the canonicalized adjacency list."""
    features = np.array([[40, 41, 42, 43], [44, 45, 46, 47], [48, 49, 50, 51],
                         [52, 53, 54, 55], [56, 57, 58, 59]])
    neighbours = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
    mol = ConvMol(features, neighbours)

    # Atoms are ordered by degree, so in terms of the original indices the
    # order becomes 4, 0, 1, 2, 3 (old -> new: 4->0, 0->1, 1->2, 2->3, 3->4).
    # The returned adjacency list must be expressed in the new indices.
    expected = [[4], [2, 3], [1, 4], [1, 4], [2, 3, 0]]
    assert mol.get_adjacency_list() == expected
def batch_to_feed_dict(self, batch):
    """Turn a batch of ConvMol objects into a tensorflow feed_dict.

    The molecules are merged into one ConvMol whose graph data is assigned
    to this object's placeholder tensors.

    params
    ------
    batch : np.ndarray
      Array of ConvMol objects

    returns
    -------
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    merged = ConvMol.agglomerate_mols(batch)
    atoms = merged.get_atom_features()

    # Degree 0 has no adjacency placeholder; start at degree 1.
    per_degree_adj = []
    for deg in range(1, self.max_deg + 1):
        per_degree_adj.append(merged.deg_adj_lists[deg])
    deg_adj_dict = dict(zip(self.deg_adj_lists_placeholders, per_degree_adj))

    atoms_dict = {
        self.atom_features_placeholder: atoms,
        self.deg_slice_placeholder: merged.deg_slice,
        self.membership_placeholder: merged.membership,
    }
    return merge_dicts([atoms_dict, deg_adj_dict])
def _featurize(self, mol):
    """Encode ``mol`` as a ConvMol object.

    Node features are ``atom_features`` concatenated with
    ``self._get_atom_properties`` for every atom, ordered by atom index.
    When ``self.master_atom`` is set, one extra node holding the mean of
    all atom features is appended and added to every real atom's neighbour
    list (the extra node's own neighbour list stays empty).
    """
    # Sorting by atom index keeps the same order as RDKit.
    indexed_features = sorted(
        (atom.GetIdx(),
         np.concatenate((atom_features(atom, use_chirality=self.use_chirality),
                         self._get_atom_properties(atom))))
        for atom in mol.GetAtoms())
    _, feature_rows = zip(*indexed_features)
    nodes = np.vstack(feature_rows)

    if self.master_atom:
        mean_row = np.expand_dims(np.mean(nodes, axis=0), axis=0)
        nodes = np.concatenate([nodes, mean_row], axis=0)

    # Undirected adjacency: every bond contributes both directions.
    canon_adj_list = [[] for _ in range(len(nodes))]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    if self.master_atom:
        master_index = len(nodes) - 1
        for neighbours in canon_adj_list[:master_index]:
            neighbours.append(master_index)

    return ConvMol(nodes, canon_adj_list)
def default_generator(self, dataset, epochs=1, mode='fit', deterministic=True, pad_batches=True):
    """Yield ``(inputs, [labels], [weights])`` tuples for every batch.

    Parameters
    ----------
    dataset
        Object providing ``iterbatches(batch_size=..., deterministic=...,
        pad_batches=...)`` that yields ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    mode : str
        ``'predict'`` sets the dropout switch input to 0.0; any other value
        sets it to 1.0.
    deterministic, pad_batches
        Forwarded to ``dataset.iterbatches``.

    Yields
    ------
    tuple
        ``inputs`` holds atom features, degree slice, membership, sample
        count, dropout switch, then the adjacency lists for degree >= 1.
    """
    for epoch in range(epochs):
        for (X_b, y_b, w_b, ids_b) in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            if self.mode == 'classification':
                y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                    -1, self.n_tasks, self.n_classes)
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            n_samples = np.array(X_b.shape[0])
            dropout = np.array(0.0) if mode == 'predict' else np.array(1.0)
            inputs = [
                multiConvMol.get_atom_features(),
                multiConvMol.deg_slice,
                np.array(multiConvMol.membership),
                n_samples,
                dropout,
            ]
            # Fetch the adjacency lists once instead of on every iteration;
            # index 0 (degree 0) is not fed to the model.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            inputs.extend(deg_adj_lists[i] for i in range(1, len(deg_adj_lists)))
            yield (inputs, [y_b], [w_b])
def batch_to_feed_dict(self, batch):
    """Build a tensorflow feed_dict from a batch of mol_graphs.

    The ConvMol objects are merged and their graph information is mapped
    onto the placeholder tensors held by this object.

    params
    ------
    batch : np.ndarray
      Array of ConvMol objects

    returns
    -------
    feed_dict : dict
      Can be merged with other feed_dicts for input into tensorflow
    """
    agglomerated = ConvMol.agglomerate_mols(batch)
    atoms = agglomerated.get_atom_features()

    # Adjacency lists for degrees 1..max_deg, paired with their placeholders.
    degrees = range(1, self.max_deg + 1)
    adjacency = [agglomerated.deg_adj_lists[deg] for deg in degrees]
    deg_adj_dict = dict(zip(self.deg_adj_lists_placeholders, adjacency))

    atoms_dict = {}
    atoms_dict[self.atom_features_placeholder] = atoms
    atoms_dict[self.deg_slice_placeholder] = agglomerated.deg_slice
    atoms_dict[self.membership_placeholder] = agglomerated.membership

    return merge_dicts([atoms_dict, deg_adj_dict])
def default_generator(self, dataset, epochs=1, predict=False, deterministic=True, pad_batches=True):
    """Yield one feed dict per batch of ``dataset``.

    Parameters
    ----------
    dataset
        Object providing ``iterbatches(batch_size, pad_batches=...,
        deterministic=...)`` that yields ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    predict : bool
        When false, a "Starting epoch" line is printed for every epoch.
    deterministic : bool
        Forwarded to ``dataset.iterbatches``.
    pad_batches : bool
        Forwarded to ``dataset.iterbatches``. It was previously accepted
        but ignored (padding was always on); the default keeps that
        behaviour.

    Yields
    ------
    dict
        Maps the label, weight and graph placeholders of this model to the
        batch data.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                self.batch_size,
                pad_batches=pad_batches,
                deterministic=deterministic):
            d = {}
            for index, label in enumerate(self.my_labels):
                if self.mode == 'classification':
                    d[label] = to_one_hot(y_b[:, index])
                if self.mode == 'regression':
                    d[label] = np.expand_dims(y_b[:, index], -1)
            d[self.my_task_weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[self.atom_features] = multiConvMol.get_atom_features()
            d[self.degree_slice] = multiConvMol.deg_slice
            d[self.membership] = multiConvMol.membership
            # Adjacency list i (degree i >= 1) feeds placeholder i - 1.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            for i in range(1, len(deg_adj_lists)):
                d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def test_graph_gather(self):
    """Test that GraphGather can be invoked on an agglomerated ConvMol."""
    batch_size = 2
    n_features = 75
    # 'CCC' and 'C' together contain 4 atoms.
    raw_smiles = ['CCC', 'C']
    mols = [rdkit.Chem.MolFromSmiles(s) for s in raw_smiles]
    featurizer = ConvMolFeaturizer()
    mols = featurizer.featurize(mols)
    multi_mol = ConvMol.agglomerate_mols(mols)
    atom_features = multi_mol.get_atom_features()
    degree_slice = multi_mol.deg_slice
    membership = multi_mol.membership
    # The degree-0 adjacency list is not passed to the layer.
    deg_adjs = multi_mol.get_deg_adjacency_lists()[1:]

    with self.session() as sess:
        atom_features = tf.convert_to_tensor(atom_features, dtype=tf.float32)
        degree_slice = tf.convert_to_tensor(degree_slice, dtype=tf.int32)
        membership = tf.convert_to_tensor(membership, dtype=tf.int32)
        deg_adjs_tf = [
            tf.convert_to_tensor(deg_adj, dtype=tf.int32) for deg_adj in deg_adjs
        ]
        args = [atom_features, degree_slice, membership] + deg_adjs_tf
        out_tensor = GraphGather(batch_size)(*args)
        sess.run(tf.global_variables_initializer())
        out_tensor = out_tensor.eval()
        # TODO(rbharath): Why is it 2*n_features instead of n_features?
        assert out_tensor.shape == (batch_size, 2 * n_features)
def test_mol_ordering():
    """Check atom ordering in an agglomerated ConvMol.

    Each atom's degree is prepended as feature column 0 before merging.
    After merging, within every degree block the molecule membership must
    be non-decreasing, and every atom in block ``d`` must carry degree
    ``d`` in column 0.

    Rows of ``deg_slice`` are read as ``(start, size)``; this matches the
    values expected from ``get_deg_slice`` elsewhere in the test-suite
    (one atom per degree gives rows ``[k, 1]``).
    """
    mols = get_molecules()
    featurizer = ConvMolFeaturizer()
    featurized_mols = featurizer.featurize(mols)
    for mol in featurized_mols:
        degree_column = np.expand_dims(mol.degree_list, axis=1)
        mol.atom_features = np.concatenate(
            [degree_column, mol.atom_features], axis=1)

    conv_mol = ConvMol.agglomerate_mols(featurized_mols)
    deg_slices = conv_mol.deg_slice.tolist()

    for start, size in deg_slices:
        members = conv_mol.membership[start:start + size]
        sorted_members = np.array(sorted(members))
        assert np.all(sorted_members == np.array(members))

    conv_mol_atom_features = conv_mol.get_atom_features()
    # Blocks are enumerated from degree 0 upwards.
    for degree, (start, size) in enumerate(deg_slices):
        block = conv_mol_atom_features[start:start + size]
        assert np.all(block[:, 0] == degree)
def default_generator(self, dataset, epochs=1, predict=False, deterministic=True, pad_batches=True):
    """Yield one feed dict per batch of ``dataset``.

    Parameters
    ----------
    dataset
        Object providing ``iterbatches(batch_size, pad_batches=...,
        deterministic=...)`` that yields ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    predict : bool
        Not used by this generator.
    deterministic, pad_batches
        Forwarded to ``dataset.iterbatches``.

    Yields
    ------
    dict
        Keys are ``self.labels[0]``, ``self.task_weights[0]`` and the graph
        placeholders; in classification mode labels are one-hot encoded
        and reshaped to ``(-1, n_tasks, n_classes)``.
    """
    for epoch in range(epochs):
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                self.batch_size,
                pad_batches=pad_batches,
                deterministic=deterministic):
            d = {}
            if self.mode == 'classification':
                d[self.labels[0]] = to_one_hot(
                    y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)
            else:
                d[self.labels[0]] = y_b
            d[self.task_weights[0]] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[self.atom_features] = multiConvMol.get_atom_features()
            d[self.degree_slice] = multiConvMol.deg_slice
            d[self.membership] = multiConvMol.membership
            # Adjacency list i (degree i >= 1) feeds placeholder i - 1.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            for i in range(1, len(deg_adj_lists)):
                d[self.deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def test_construct_conv_mol(self):
    """Tests that ConvMols can be constructed without crash."""
    # Artificial feature array: 3 atoms with 4 features each.
    atom_features = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
    adj_list = [[1], [0, 2], [1]]
    # Construction succeeding is the whole test; the instance is not needed.
    ConvMol(atom_features, adj_list)
def default_generator(self, dataset, epochs=1, predict=False, deterministic=True, pad_batches=True):
    """Generate feed dicts, one per batch, for ``epochs`` passes over ``dataset``.

    ``deterministic`` and ``pad_batches`` are forwarded to
    ``dataset.iterbatches``; ``predict`` is accepted but not used here.

    Each yielded dict maps ``self.labels[0]`` to the labels (one-hot encoded
    and reshaped to ``(-1, n_tasks, n_classes)`` in classification mode, raw
    ``y_b`` otherwise), ``self.task_weights[0]`` to ``w_b``, and the graph
    placeholders to the data of the agglomerated ConvMol.
    """
    for epoch in range(epochs):
        batches = dataset.iterbatches(
            self.batch_size,
            pad_batches=pad_batches,
            deterministic=deterministic)
        for X_b, y_b, w_b, ids_b in batches:
            d = {}
            if self.mode == 'classification':
                one_hot = to_one_hot(y_b.flatten(), self.n_classes)
                d[self.labels[0]] = one_hot.reshape(
                    -1, self.n_tasks, self.n_classes)
            else:
                d[self.labels[0]] = y_b
            d[self.task_weights[0]] = w_b

            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[self.atom_features] = multiConvMol.get_atom_features()
            d[self.degree_slice] = multiConvMol.deg_slice
            d[self.membership] = multiConvMol.membership

            # Look the adjacency lists up once; entry 0 (degree 0) is skipped.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            for degree in range(1, len(deg_adj_lists)):
                d[self.deg_adjs[degree - 1]] = deg_adj_lists[degree]
            yield d
def test_graph_gather(self):
    """Test that GraphGather can be invoked and yields the expected shape."""
    batch_size = 2
    n_features = 75
    # In CCC and C, there are 4 atoms in total.
    raw_smiles = ['CCC', 'C']
    mols = [rdkit.Chem.MolFromSmiles(s) for s in raw_smiles]
    featurizer = ConvMolFeaturizer()
    mols = featurizer.featurize(mols)
    multi_mol = ConvMol.agglomerate_mols(mols)
    atom_features = multi_mol.get_atom_features()
    degree_slice = multi_mol.deg_slice
    membership = multi_mol.membership
    # Drop the degree-0 entry; only degrees >= 1 are passed to the layer.
    deg_adjs = multi_mol.get_deg_adjacency_lists()[1:]

    with self.session() as sess:
        args = [
            tf.convert_to_tensor(atom_features, dtype=tf.float32),
            tf.convert_to_tensor(degree_slice, dtype=tf.int32),
            tf.convert_to_tensor(membership, dtype=tf.int32),
        ]
        for deg_adj in deg_adjs:
            args.append(tf.convert_to_tensor(deg_adj, dtype=tf.int32))
        out_tensor = GraphGather(batch_size)(*args)
        sess.run(tf.global_variables_initializer())
        out_tensor = out_tensor.eval()
        # TODO(rbharath): Why is it 2*n_features instead of n_features?
        assert out_tensor.shape == (batch_size, 2 * n_features)
def test_conv_mol_deg_slice(self):
    """Check get_deg_slice for a molecule whose atoms all have degree 2."""
    features = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                         [28, 29, 30, 31], [32, 33, 34, 35]])
    neighbours = [[1, 2], [0, 3], [0, 3], [1, 2]]
    mol = ConvMol(features, neighbours)

    # One row per degree 0..6. All 4 atoms have degree 2, so only that row
    # is non-empty.
    expected = np.array([
        [0, 0],  # 0 atoms of degree 0
        [0, 0],  # 0 atoms of degree 1
        [0, 4],  # 4 atoms of degree 2
        [0, 0],  # 0 atoms of degree 3
        [0, 0],  # 0 atoms of degree 4
        [0, 0],  # 0 atoms of degree 5
        [0, 0],  # 0 atoms of degree 6
    ])
    assert np.array_equal(mol.get_deg_slice(), expected)
def test_get_atom_features(self):
    """Check that get_atom_features returns rows sorted by atom degree."""
    features = np.array([[40, 41, 42, 43], [44, 45, 46, 47], [48, 49, 50, 51],
                         [52, 53, 54, 55], [56, 57, 58, 59]])
    neighbours = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
    mol = ConvMol(features, neighbours)

    # Neighbour counts from the adjacency list above:
    #   atom 4 -> 1, atoms 0, 1, 2 -> 2, atom 3 -> 3.
    # The lowest-degree atom (4) therefore comes first and atom 3 last.
    expected = np.array([[56, 57, 58, 59], [40, 41, 42, 43], [44, 45, 46, 47],
                         [48, 49, 50, 51], [52, 53, 54, 55]])
    assert np.array_equal(mol.get_atom_features(), expected)
def test_get_atom_features(self):
    """Verify the degree-sorted ordering of ConvMol atom features."""
    raw_features = np.array([[40, 41, 42, 43], [44, 45, 46, 47],
                             [48, 49, 50, 51], [52, 53, 54, 55],
                             [56, 57, 58, 59]])
    adjacency = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
    mol = ConvMol(raw_features, adjacency)

    # Per the adjacency list: atom 4 has one neighbour, atoms 0-2 have two,
    # atom 3 has three. Rows are expected in order of increasing degree.
    degree_sorted = np.array([[56, 57, 58, 59], [40, 41, 42, 43],
                              [44, 45, 46, 47], [48, 49, 50, 51],
                              [52, 53, 54, 55]])
    assert np.array_equal(mol.get_atom_features(), degree_sorted)
def test_load_pretrained_subclassed_model(self):
    """Check that load_from_pretrained copies weights between GraphConvModels.

    A source model is fitted on three small molecules, a second model with
    the same architecture loads its weights (``include_top=False``), and
    all trainable variables except the last two are compared.
    """
    from rdkit import Chem
    bi_tasks = ['a', 'b']
    y = np.ones((3, 2))
    smiles = ['C', 'CC', 'CCC']
    mols = [Chem.MolFromSmiles(smile) for smile in smiles]
    featurizer = dc.feat.ConvMolFeaturizer()
    X = featurizer.featurize(mols)
    dataset = dc.data.NumpyDataset(X, y, ids=smiles)

    source_model = dc.models.GraphConvModel(
        n_tasks=len(bi_tasks),
        graph_conv_layers=[128, 128],
        dense_layer_size=512,
        dropout=0,
        mode='regression',
        learning_rate=0.001,
        batch_size=8,
        model_dir="model")
    source_model.fit(dataset)

    dest_model = dc.models.GraphConvModel(
        n_tasks=len(bi_tasks),
        graph_conv_layers=[128, 128],
        dense_layer_size=512,
        dropout=0,
        mode='regression',
        learning_rate=0.001,
        batch_size=8)

    # Build one batch of model inputs so the destination model can be built.
    X_b, y_b, w_b, ids_b = next(
        dataset.iterbatches(batch_size=8, deterministic=True, pad_batches=True))
    multiConvMol = ConvMol.agglomerate_mols(X_b)
    n_samples = np.array(X_b.shape[0])
    inputs = [
        multiConvMol.get_atom_features(),
        multiConvMol.deg_slice,
        np.array(multiConvMol.membership),
        n_samples,
    ]
    deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
    inputs.extend(deg_adj_lists[i] for i in range(1, len(deg_adj_lists)))

    dest_model.load_from_pretrained(
        source_model=source_model,
        assignment_map=None,
        value_map=None,
        include_top=False,
        inputs=inputs)

    # The final two trainable variables are excluded from the comparison.
    source_vars = source_model.model.trainable_variables[:-2]
    dest_vars = dest_model.model.trainable_variables[:-2]
    assert len(source_vars) == len(dest_vars)

    for source_var, dest_var in zip(source_vars, dest_vars):
        np.testing.assert_array_almost_equal(source_var.numpy(),
                                             dest_var.numpy())
def test_conv_mol_deg_slice(self):
    """Verify the deg_slice table of a four-atom molecule of degree-2 atoms."""
    atom_rows = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                          [28, 29, 30, 31], [32, 33, 34, 35]])
    adjacency = [[1, 2], [0, 3], [0, 3], [1, 2]]
    mol = ConvMol(atom_rows, adjacency)

    # Rows correspond to degrees 0 through 6; only degree 2 holds atoms (4).
    empty = [0, 0]
    expected = np.array([empty, empty, [0, 4], empty, empty, empty, empty])
    assert np.array_equal(mol.get_deg_slice(), expected)
def data_generator(self, dataset, batch_size:int, epochs=1):
    """Yield one feed dict per padded, deterministic batch of ``dataset``.

    Labels are the one-hot encoding of the first label column. Graph data
    comes from the agglomerated ConvMol of the batch; the adjacency list at
    index ``i`` (``i >= 1``) feeds ``self.deg_adj_list[i - 1]``.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            batch_size, pad_batches=True, deterministic=True)
        for X, y, w, idx in batches:
            feed_dict = {}
            feed_dict[self.label] = to_one_hot(y[:, 0])
            feed_dict[self.weight] = w

            # Graph data of the whole batch merged into one molecule.
            merged = ConvMol.agglomerate_mols(X)
            feed_dict[self.atom_features] = merged.get_atom_features()
            feed_dict[self.indexing] = merged.deg_slice
            feed_dict[self.membership] = merged.membership

            adjacency = merged.get_deg_adjacency_lists()
            degree = 1
            while degree < len(adjacency):
                feed_dict[self.deg_adj_list[degree - 1]] = adjacency[degree]
                degree += 1
            yield feed_dict
def test_agglomerate_molecules(self):
    """Test ConvMol.agglomerate_mols on three artificial molecules.

    Verifies atom and molecule counts, selected rows of the merged feature
    matrix, and the adjacency lists grouped by degree.
    """
    molecules = []

    # First example molecule: 3 atoms, 4 artificial features each.
    atom_features = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
    adj_list = [[1], [0, 2], [1]]
    molecules.append(ConvMol(atom_features, adj_list))

    # Second example molecule: 4 atoms.
    atom_features = np.array([[20, 21, 22, 23], [24, 25, 26, 27],
                              [28, 29, 30, 31], [32, 33, 34, 35]])
    adj_list = [[1, 2], [0, 3], [0, 3], [1, 2]]
    molecules.append(ConvMol(atom_features, adj_list))

    # Third example molecule: 5 atoms.
    atom_features = np.array([[40, 41, 42, 43], [44, 45, 46, 47],
                              [48, 49, 50, 51], [52, 53, 54, 55],
                              [56, 57, 58, 59]])
    adj_list = [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]
    molecules.append(ConvMol(atom_features, adj_list))

    concat_mol = ConvMol.agglomerate_mols(molecules)

    assert concat_mol.get_num_atoms() == 12
    assert concat_mol.get_num_molecules() == 3

    merged_features = concat_mol.get_atom_features()
    assert np.array_equal(merged_features[0, :], [1, 2, 3, 4])
    assert np.array_equal(merged_features[2, :], [56, 57, 58, 59])
    assert np.array_equal(merged_features[11, :], [52, 53, 54, 55])
    assert np.array_equal(merged_features[4, :], [20, 21, 22, 23])

    deg_adj_lists = concat_mol.get_deg_adjacency_lists()
    expected_by_degree = [
        (0, np.zeros([0, 0])),      # no atoms of degree 0
        (1, [[3], [3], [11]]),      # 3 atoms of degree 1
        (2, [[0, 1], [5, 6], [4, 7], [4, 7], [5, 6], [9, 10], [8, 11],
             [8, 11]]),             # 8 atoms of degree 2
        (3, [[9, 10, 2]]),          # 1 atom of degree 3
        (4, np.zeros([0, 4])),      # 0 atoms of degree 4
        (5, np.zeros([0, 5])),      # 0 atoms of degree 5
    ]
    for degree, expected in expected_by_degree:
        assert np.array_equal(deg_adj_lists[degree], expected)
def test_null_conv_mol(self):
    """Check the null molecule returned by ConvMol.get_null_mol.

    The expected values below cover degrees 0 through 10.
    """
    num_feat = 4
    null_mol = ConvMol.get_null_mol(num_feat)
    adjacency_by_degree = null_mol.get_deg_adjacency_lists()

    # Atoms are only connected to themselves.
    assert np.array_equal(adjacency_by_degree[10], [[10] * 10])
    assert np.array_equal(adjacency_by_degree[1], [[1]])

    # There is one atom of each degree: row k of the slice table is [k, 1].
    expected_slice = [[degree, 1] for degree in range(11)]
    assert np.array_equal(null_mol.get_deg_slice(), expected_slice)
def feed_dict_generator(dataset, batch_size, epochs=1):
    """Yield one feed dict per padded batch of ``dataset``.

    ``labels``, ``task_weights``, ``atom_features``, ``degree_slice``,
    ``membership`` and ``deg_adjs`` are taken from the enclosing scope.
    Label column ``i`` is one-hot encoded for ``labels[i]``; the adjacency
    list at index ``i`` (``i >= 1``) feeds ``deg_adjs[i - 1]``.
    """
    for epoch in range(epochs):
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                batch_size, pad_batches=True):
            d = {}
            for index, label in enumerate(labels):
                d[label] = to_one_hot(y_b[:, index])
            d[task_weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[atom_features] = multiConvMol.get_atom_features()
            d[degree_slice] = multiConvMol.deg_slice
            d[membership] = multiConvMol.membership
            # Fetch the adjacency lists once per batch.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            for i in range(1, len(deg_adj_lists)):
                d[deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
    """Build the feed dict for one batch.

    Parameters
    ----------
    X_b
        Batch of ConvMol objects; used only when ``self.features`` is set.
    y_b
        Label array, or None to skip labels. Column ``i`` is one-hot
        encoded for ``self.labels[i]``.
    w_b
        Weight array, or None to skip weights.
    ids_b
        Not used.

    Returns
    -------
    dict
        Maps ``out_tensor`` of the label/weight/feature layers to data.
        Feature layers 0-2 receive atom features, degree slice and
        membership; layer ``i + 3`` receives the adjacency list at index
        ``i + 1``.
    """
    feed_dict = {}
    if y_b is not None:
        for index, label in enumerate(self.labels):
            feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])
    if self.task_weights is not None and w_b is not None:
        feed_dict[self.task_weights[0].out_tensor] = w_b
    if self.features is not None:
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        feed_dict[self.features[0].out_tensor] = multiConvMol.get_atom_features()
        feed_dict[self.features[1].out_tensor] = multiConvMol.deg_slice
        feed_dict[self.features[2].out_tensor] = multiConvMol.membership
        # Fetch the adjacency lists once rather than once per degree.
        deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
        for i in range(self.max_degree):
            feed_dict[self.features[i + 3].out_tensor] = deg_adj_lists[i + 1]
    return feed_dict
def test_null_conv_mol(self):
    """Test the null molecule produced by ConvMol.get_null_mol.

    The assertions index degrees 0 through 10, i.e. they expect one
    self-connected atom for each of those degrees.
    """
    num_feat = 4
    null_mol = ConvMol.get_null_mol(num_feat)
    deg_adj_lists = null_mol.get_deg_adjacency_lists()

    # Check that atoms are only connected to themselves.
    assert np.array_equal(deg_adj_lists[10],
                          [[10, 10, 10, 10, 10, 10, 10, 10, 10, 10]])
    assert np.array_equal(deg_adj_lists[1], [[1]])

    # Check that there's one atom of each degree.
    assert np.array_equal(
        null_mol.get_deg_slice(),
        [[0, 1], [1, 1], [2, 1], [3, 1], [4, 1], [5, 1], [6, 1], [7, 1],
         [8, 1], [9, 1], [10, 1]])
def data_generator(dataset, n_epoch=1, predict=False):
    """Yield one feed dict per padded, deterministic batch of ``dataset``.

    ``n_batch``, ``label15``, ``atom_features``, ``degree_slice``,
    ``membership`` and ``deg_adjs`` come from the enclosing scope. A
    "Starting epoch" line is printed per epoch unless ``predict`` is true.
    Exactly ten adjacency lists (indices 1..10) are fed.
    """
    for ee in range(n_epoch):
        if not predict:
            print('Starting epoch %i' % ee)
        batches = dataset.iterbatches(
            n_batch, pad_batches=True, deterministic=True)
        for X_b, y_b, w_b, ids_b in batches:
            # One-hot labels, one placeholder per label column.
            fd = {}
            for ts, label_t in enumerate(label15):
                fd[label_t] = to_one_hot(y_b[:, ts])

            mol = ConvMol.agglomerate_mols(X_b)
            fd[atom_features] = mol.get_atom_features()
            fd[degree_slice] = mol.deg_slice
            fd[membership] = mol.membership

            deg_adj_list = mol.get_deg_adjacency_lists()
            # Placeholder k receives the adjacency list at index k + 1.
            for slot in range(10):
                fd[deg_adjs[slot]] = deg_adj_list[slot + 1]
            yield fd
def data_generator(dataset, epochs=1, predict=False, pad_batches=True):
    """Yield one feed dict per deterministic batch of ``dataset``.

    ``batch_size``, ``labels``, ``weights``, ``atom_features``,
    ``degree_slice``, ``membership`` and ``deg_adjs`` come from the
    enclosing scope. A "Starting epoch" line is printed per epoch unless
    ``predict`` is true; ``pad_batches`` is forwarded to
    ``dataset.iterbatches``.
    """
    for epoch in range(epochs):
        if not predict:
            print('Starting epoch %i' % epoch)
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                batch_size, pad_batches=pad_batches, deterministic=True):
            d = {}
            for index, label in enumerate(labels):
                d[label] = to_one_hot(y_b[:, index])
            d[weights] = w_b
            multiConvMol = ConvMol.agglomerate_mols(X_b)
            d[atom_features] = multiConvMol.get_atom_features()
            d[degree_slice] = multiConvMol.deg_slice
            d[membership] = multiConvMol.membership
            # Adjacency list i (i >= 1) feeds placeholder i - 1.
            deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
            for i in range(1, len(deg_adj_lists)):
                d[deg_adjs[i - 1]] = deg_adj_lists[i]
            yield d
def _featurize(self, mol):
    """Encode ``mol`` as a ConvMol object.

    Node features come from ``atom_features`` for every atom, ordered by
    atom index; the adjacency list contains both directions of each bond.
    """
    # Sorting by atom index keeps the same order as RDKit.
    indexed_features = sorted(
        (atom.GetIdx(), atom_features(atom)) for atom in mol.GetAtoms())
    _, feature_rows = zip(*indexed_features)
    nodes = np.vstack(feature_rows)

    canon_adj_list = [[] for _ in range(len(nodes))]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    return ConvMol(nodes, canon_adj_list)
def amino_acid_embedding(self, name=None):
    """Build the graph-convolution sub-network that embeds amino acids.

    Every SMILES string in ``AminoAcid_SMILES`` is featurized as a ConvMol,
    the molecules are merged (``max_deg=3``) and wrapped as constant
    tensors, then passed through two GraphConv/BatchNorm/GraphPool stages,
    a Dense layer, BatchNorm, GraphGather and AminoAcidPad.

    Parameters
    ----------
    name : str, optional
        Prefix for all layer names. When omitted, a prefix is generated
        from ``self.module_count``, which is then incremented.

    Returns
    -------
    The ``AminoAcidPad`` layer at the end of the sub-network.
    """
    if name is None:
        name = 'AAEmbedding_' + str(self.module_count) + '_'
        self.module_count += 1

    feat = ConvMolFeaturizer()
    featurized_AA = [
        feat._featurize(Chem.MolFromSmiles(smile)) for smile in AminoAcid_SMILES
    ]
    multiConvMol = ConvMol.agglomerate_mols(featurized_AA, max_deg=3)

    atom_features = TensorWrapper(
        tf.constant(multiConvMol.get_atom_features(), dtype=tf.float32),
        name=name + 'atom_features')
    degree_slice = TensorWrapper(
        tf.constant(multiConvMol.deg_slice, dtype=tf.int32),
        name=name + 'degree')
    membership = TensorWrapper(
        tf.constant(multiConvMol.membership, dtype=tf.int32),
        name=name + 'membership')

    # Adjacency lists for degrees 1..3 (index 0 is skipped).
    deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
    deg_adjs = [
        TensorWrapper(
            tf.constant(deg_adj_lists[i + 1], dtype=tf.int32),
            name=name + 'deg_' + str(i)) for i in range(3)
    ]

    gc1 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[atom_features, degree_slice, membership] + deg_adjs,
        name=name + 'gc1')
    batch_norm1 = BatchNorm(
        in_layers=[gc1, self.training_placeholder], name=name + 'bn1')
    gp1 = GraphPool(
        max_degree=3,
        in_layers=[batch_norm1, degree_slice, membership] + deg_adjs,
        name=name + 'gp1')

    gc2 = GraphConv(
        64,
        max_deg=3,
        activation_fn=tf.nn.relu,
        in_layers=[gp1, degree_slice, membership] + deg_adjs,
        name=name + 'gc2')
    batch_norm2 = BatchNorm(
        in_layers=[gc2, self.training_placeholder], name=name + 'bn2')
    gp2 = GraphPool(
        max_degree=3,
        in_layers=[batch_norm2, degree_slice, membership] + deg_adjs,
        name=name + 'gp2')

    # Floor division keeps the channel count an integer on Python 3.
    dense = Dense(
        out_channels=self.embedding_length // 2,
        activation_fn=tf.nn.relu,
        in_layers=[gp2],
        name=name + 'dense1')
    batch_norm3 = BatchNorm(
        in_layers=[dense, self.training_placeholder], name=name + 'bn3')

    # NOTE(review): batch_size=21 presumably equals len(AminoAcid_SMILES) -
    # confirm against its definition.
    readout = GraphGather(
        batch_size=21,
        activation_fn=tf.nn.tanh,
        in_layers=[batch_norm3, degree_slice, membership] + deg_adjs,
        name=name + 'gg')
    padding = AminoAcidPad(
        embedding_length=self.embedding_length,
        in_layers=[readout],
        name=name + 'pad')
    return padding
def data_generator(dataset, predict=False, pad_batches=True):
    """Yield ``(inputs, labels, weights)`` for every deterministic batch.

    ``batch_size`` and ``n_tasks`` come from the enclosing scope;
    ``predict`` is accepted but not used.

    Yields
    ------
    tuple
        ``inputs``: atom features, degree slice, membership array, then
        the adjacency lists for degree >= 1.
        ``labels``: one-element list with the labels one-hot encoded over
        2 classes and reshaped to ``(-1, n_tasks, 2)``.
        ``weights``: one-element list holding ``w_b``.
    """
    # iterbatches iterates over minibatches from the dataset.
    for X_b, y_b, w_b, ids_b in dataset.iterbatches(
            batch_size, pad_batches=pad_batches, deterministic=True):
        # Merge the batch's ConvMol objects into one molecule whose arrays
        # can be fed to the model.
        multiConvMol = ConvMol.agglomerate_mols(X_b)
        inputs = [
            multiConvMol.get_atom_features(),
            multiConvMol.deg_slice,
            np.array(multiConvMol.membership),
        ]
        # Adjacency lists grouped by atom degree; fetched once, index 0 skipped.
        deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
        inputs.extend(deg_adj_lists[i] for i in range(1, len(deg_adj_lists)))
        labels = [to_one_hot(y_b.flatten(), 2).reshape(-1, n_tasks, 2)]
        weights = [w_b]
        yield (inputs, labels, weights)
def _featurize(self, mol):
    """Encode ``mol`` as a ConvMol object.

    When ``self.per_atom_fragmentation`` is set, a list of ConvMol objects
    is returned instead: one per fragment obtained by deleting a single
    node from the molecule graph.
    """

    def per_atom(n, a):
        """Enumerate all single-node-deleted fragments of a graph.

        Parameters
        ----------
        n: np.array of nodes (number_of_nodes X number_of_features)
        a: list of per-node neighbour index lists

        Yields
        ------
        (nodes, adjacency) with node ``i`` removed, for every ``i``; all
        references to the removed node are dropped and indices above it
        are shifted down by one.
        """
        for removed in range(n.shape[0]):
            kept_nodes = np.delete(n, removed, axis=0)
            kept_adj = [
                [v if v < removed else v - 1
                 for v in neighbours if v != removed]
                for j, neighbours in enumerate(a)
                if j != removed
            ]
            yield kept_nodes, kept_adj

    # Sorting by atom index keeps the same order as RDKit.
    indexed_features = sorted(
        (atom.GetIdx(),
         np.concatenate((atom_features(atom, use_chirality=self.use_chirality),
                         self._get_atom_properties(atom))))
        for atom in mol.GetAtoms())
    _, feature_rows = zip(*indexed_features)
    nodes = np.vstack(feature_rows)

    if self.master_atom:
        # Extra node whose features are the mean over all atoms.
        mean_row = np.expand_dims(np.mean(nodes, axis=0), axis=0)
        nodes = np.concatenate([nodes, mean_row], axis=0)

    # Undirected adjacency: every bond contributes both directions.
    canon_adj_list = [[] for _ in range(len(nodes))]
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        canon_adj_list[begin].append(end)
        canon_adj_list[end].append(begin)

    if self.master_atom:
        # Every real atom lists the extra node as a neighbour.
        master_index = len(nodes) - 1
        for neighbours in canon_adj_list[:master_index]:
            neighbours.append(master_index)

    if self.per_atom_fragmentation:
        return [
            ConvMol(frag_nodes, frag_adj)
            for frag_nodes, frag_adj in per_atom(nodes, canon_adj_list)
        ]
    return ConvMol(nodes, canon_adj_list)
# data_generator: yield one feed dict per batch for training and evaluation.
#
# For every epoch and every batch of dataset.iterbatches(self.batch_size,
# pad_batches=pad_batches, deterministic=True) the yielded dict maps:
#   * each entry of self.labels  -> to_one_hot of the matching y_b column
#   * self.circular_feat         -> X_b[:, 1:]
#   * self.atom_features, self.degree_slice, self.membership, self.deg_adjs
#                                -> data of ConvMol.agglomerate_mols(X_b[:, 0])
#   * self.weights               -> w_b
#
# When prior_label is truthy, a "prior" vector of length 2 * self.num_task is
# built for each of the self.batch_size samples. For a sampled task t whose
# weight is non-zero, slot 2*t+1 is set when y_b == 1 and slot 2*t otherwise,
# and that task's weight is multiplied by 0.001. With task=None the tasks are
# a random subset chosen with probability 0.5 (otherwise none); with a task
# given, num_prior tasks are sampled from all tasks except `task`,
# w_b[e, task] is set to 1.0 and the two slots of `task` are zeroed. Each
# atom's feature row is then concatenated with the prior vector of the
# molecule it belongs to (via multiConvMol.membership).
#
# NOTE(review): statement nesting cannot be recovered with certainty from this
# collapsed text. In particular it is unclear whether the w_b re-weighting
# that follows "#if epochs < 12:" is live code or part of that comment, and
# exactly where the 0.001 down-weighting sits. Confirm against the original
# file before reformatting.
# NOTE(review): the prior branches assign into w_b element-wise; presumably
# callers do not reuse the batch weights afterwards - verify.
def data_generator(self, dataset, prior_label , task = None, num_prior = 0\ , epochs=1, pad_batches=True): """Data generator for training and evaluation""" for epoch in range(epochs): for ind, (X_b, y_b, w_b, ids_b) in enumerate( dataset.iterbatches( self.batch_size, pad_batches=pad_batches, deterministic=True)): d = {} for index, label in enumerate(self.labels): d[label] = to_one_hot(y_b[:, index]) #if epochs < 12: w_b = w_b*(0.1) + (w_b*y_b*self.scaled_w)/10.0 multiConvMol = ConvMol.agglomerate_mols(X_b[:,0]) circular_feat = X_b[:,1:] d[self.circular_feat] = circular_feat """Encode labels into the atom_features""" if prior_label: prior = [] if task is None: for e in range(self.batch_size): arr = np.zeros(self.num_task * 2) if random.random() < 0.5: index = random.sample(range(self.num_task), \ random.randint(0, self.num_task -1)) else: index = [] if len(index) != 0: for sth in index: if w_b[e,sth] != 0: if y_b[e,sth] == 1: arr[2*sth+1] = 1 else: arr[2*sth] = 1 w_b[e,sth] = w_b[e,sth]*0.001 prior.append(arr) else: for e in range(self.batch_size): arr = np.zeros(self.num_task * 2) list_t = list(range(self.num_task)) list_t.pop(task) index = random.sample(list_t, num_prior) if len(index) != 0: for sth in index: if w_b[e,sth] != 0: if y_b[e,sth] == 1: arr[2*sth+1] = 1 else: arr[2*sth] = 1 w_b[e,sth] = w_b[e,sth]*0.001 prior.append(arr) w_b[e,task] = 1.0 arr[2*task] = 0. arr[2*task+1] = 0. 
prior = np.array(prior) atom_feat = multiConvMol.get_atom_features() member = multiConvMol.membership new_atom_feats = [] for i in range(atom_feat.shape[0]): new_atom_feat = np.concatenate((atom_feat[i],\ prior[member[i]])) new_atom_feats.append(new_atom_feat) new_atom_feats = np.array(new_atom_feats) d[self.atom_features] = new_atom_feats else: d[self.atom_features] = multiConvMol.get_atom_features() d[self.weights] = w_b d[self.degree_slice] = multiConvMol.deg_slice d[self.membership] = multiConvMol.membership for i in range(1, len(multiConvMol.get_deg_adjacency_lists())): d[self.deg_adjs[i - 1]] = multiConvMol.get_deg_adjacency_lists()[i] yield d