def batch_to_feed_dict(self, batch):
    """Build the tensorflow feed_dict for one batch of molecule graphs.

    The ConvMol objects in ``batch`` are agglomerated into a single ConvMol
    and its graph arrays are paired with this object's placeholder tensors.

    params
    ------
    batch : np.ndarray
        Array of ConvMol objects

    returns
    -------
    feed_dict : dict
        Placeholder -> array mapping. Can be merged with other feed_dicts
        for input into tensorflow
    """
    # Collapse the individual molecules into one combined graph object.
    merged = ConvMol.agglomerate_mols(batch)

    # Atom-level inputs: features, degree slices and molecule membership.
    atoms_dict = {
        self.atom_features_placeholder: merged.get_atom_features(),
        self.deg_slice_placeholder: merged.deg_slice,
        self.membership_placeholder: merged.membership,
    }

    # Adjacency lists for degrees 1..max_deg, paired positionally with the
    # adjacency placeholders.
    per_degree = [
        merged.deg_adj_lists[deg] for deg in range(1, self.max_deg + 1)
    ]
    deg_adj_dict = dict(zip(self.deg_adj_lists_placeholders, per_degree))

    return merge_dicts([atoms_dict, deg_adj_dict])
def test_graph_gather(self):
    """Check that a GraphGather layer can be built and evaluated."""
    batch_size = 2
    n_features = 75
    n_atoms = 4  # In CCC and C, there are 4 atoms
    smiles_strings = ['CCC', 'C']
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smiles) for smiles in smiles_strings]
    conv_mols = ConvMolFeaturizer().featurize(rdkit_mols)
    combined = ConvMol.agglomerate_mols(conv_mols)

    atom_array = combined.get_atom_features()
    slice_array = combined.deg_slice
    membership_array = combined.membership
    # Entry 0 of the adjacency lists is dropped; only entries 1.. are fed.
    adjacency_arrays = combined.get_deg_adjacency_lists()[1:]

    with self.session() as sess:
        layer_inputs = [
            tf.convert_to_tensor(atom_array, dtype=tf.float32),
            tf.convert_to_tensor(slice_array, dtype=tf.int32),
            tf.convert_to_tensor(membership_array, dtype=tf.int32),
        ]
        layer_inputs.extend(
            tf.convert_to_tensor(adjacency, dtype=tf.int32)
            for adjacency in adjacency_arrays)

        gathered = GraphGather(batch_size)(*layer_inputs)
        sess.run(tf.global_variables_initializer())
        result = gathered.eval()
        # TODO(rbharath): Why is it 2*n_features instead of n_features?
        assert result.shape == (batch_size, 2 * n_features)
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      deterministic=True,
                      pad_batches=True):
    """Yield one feed dict per batch of ``dataset``, for ``epochs`` passes.

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    predict : bool
        Accepted for interface compatibility; not read by this method.
    deterministic : bool
        Forwarded to ``dataset.iterbatches``.
    pad_batches : bool
        Forwarded to ``dataset.iterbatches``.

    Yields
    ------
    dict
        Keyed by ``self.labels[0]``, ``self.task_weights[0]``,
        ``self.atom_features``, ``self.degree_slice``, ``self.membership``
        and the entries of ``self.deg_adjs``.
    """
    for _ in range(epochs):
        batches = dataset.iterbatches(
            self.batch_size,
            pad_batches=pad_batches,
            deterministic=deterministic)
        for X_b, y_b, w_b, ids_b in batches:
            feed = {}
            if self.mode == 'classification':
                # Labels are flattened, one-hot encoded, then reshaped to
                # (-1, n_tasks, n_classes).
                feed[self.labels[0]] = to_one_hot(
                    y_b.flatten(), self.n_classes).reshape(
                        -1, self.n_tasks, self.n_classes)
            else:
                feed[self.labels[0]] = y_b
            feed[self.task_weights[0]] = w_b

            merged = ConvMol.agglomerate_mols(X_b)
            feed[self.atom_features] = merged.get_atom_features()
            feed[self.degree_slice] = merged.deg_slice
            feed[self.membership] = merged.membership

            # Fetch the adjacency lists once per batch instead of twice per
            # loop iteration. Entry 0 is skipped, so list entry ``k`` feeds
            # ``self.deg_adjs[k - 1]``.
            deg_adj_lists = merged.get_deg_adjacency_lists()
            for degree in range(1, len(deg_adj_lists)):
                feed[self.deg_adjs[degree - 1]] = deg_adj_lists[degree]
            yield feed
def test_agglomerate_molecules(self):
    """Test ConvMol.agglomerate_mols on three small hand-built molecules."""
    n_feat = 4  # Each artificial atom feature row below has 4 entries.

    # (artificial atom feature array, adjacency list) per example molecule.
    molecule_specs = [
        # First example molecule
        (np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]),
         [[1], [0, 2], [1]]),
        # Second example molecule
        (np.array([[20, 21, 22, 23], [24, 25, 26, 27], [28, 29, 30, 31],
                   [32, 33, 34, 35]]),
         [[1, 2], [0, 3], [0, 3], [1, 2]]),
        # Third example molecule
        (np.array([[40, 41, 42, 43], [44, 45, 46, 47], [48, 49, 50, 51],
                   [52, 53, 54, 55], [56, 57, 58, 59]]),
         [[1, 2], [0, 3], [0, 3], [1, 2, 4], [3]]),
    ]
    molecules = [
        ConvMol(features, adjacency) for features, adjacency in molecule_specs
    ]

    # Test agglomerate molecule method
    merged = ConvMol.agglomerate_mols(molecules)

    assert merged.get_num_atoms() == 12
    assert merged.get_num_molecules() == 3

    # Spot-check rows of the merged feature matrix at known positions.
    merged_features = merged.get_atom_features()
    expected_rows = [
        (0, [1, 2, 3, 4]),
        (2, [56, 57, 58, 59]),
        (11, [52, 53, 54, 55]),
        (4, [20, 21, 22, 23]),
    ]
    for row, expected in expected_rows:
        assert np.array_equal(merged_features[row, :], expected)

    adjacency_by_degree = merged.get_deg_adjacency_lists()

    # No atoms of degree 0
    assert np.array_equal(adjacency_by_degree[0], np.zeros([0, 0]))

    # 3 atoms of degree 1
    assert np.array_equal(adjacency_by_degree[1], [[3], [3], [11]])

    # 8 atoms of degree 2
    assert np.array_equal(
        adjacency_by_degree[2],
        [[0, 1], [5, 6], [4, 7], [4, 7], [5, 6], [9, 10], [8, 11], [8, 11]])

    # 1 atom of degree 3
    assert np.array_equal(adjacency_by_degree[3], [[9, 10, 2]])

    # 0 atoms of degree 4
    assert np.array_equal(adjacency_by_degree[4], np.zeros([0, 4]))

    # 0 atoms of degree 5
    assert np.array_equal(adjacency_by_degree[5], np.zeros([0, 5]))
def feed_dict_generator(dataset, batch_size, epochs=1):
    """Yield one feed dict per padded batch of ``dataset``.

    The names ``labels``, ``task_weights``, ``atom_features``,
    ``degree_slice``, ``membership`` and ``deg_adjs`` are not defined in
    this function; they are resolved from the enclosing scope.

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    batch_size : int
        Forwarded to ``dataset.iterbatches`` (always with
        ``pad_batches=True``).
    epochs : int
        Number of passes over ``dataset``.

    Yields
    ------
    dict
        One entry per element of ``labels`` (column ``index`` of ``y_b``
        passed through ``to_one_hot``), plus the weights and graph arrays.
    """
    for _ in range(epochs):
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                batch_size, pad_batches=True):
            feed = {}
            for index, label in enumerate(labels):
                feed[label] = to_one_hot(y_b[:, index])
            feed[task_weights] = w_b

            merged = ConvMol.agglomerate_mols(X_b)
            feed[atom_features] = merged.get_atom_features()
            feed[degree_slice] = merged.deg_slice
            feed[membership] = merged.membership

            # Fetch the adjacency lists once per batch rather than on every
            # loop iteration; entry 0 is skipped.
            deg_adj_lists = merged.get_deg_adjacency_lists()
            for degree in range(1, len(deg_adj_lists)):
                feed[deg_adjs[degree - 1]] = deg_adj_lists[degree]
            yield feed
def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
    """Map one batch onto the ``out_tensor`` of this model's input layers.

    Parameters
    ----------
    X_b
        Batch of molecules handed to ``ConvMol.agglomerate_mols``.
    y_b
        Label array indexed as ``y_b[:, index]``, or None to skip labels.
    w_b
        Weights for the batch, or None to skip weights.
    ids_b
        Not read by this method.

    Returns
    -------
    dict
        ``out_tensor`` -> value for every input that could be filled in.
    """
    feed_dict = {}

    if y_b is not None:
        for index, label in enumerate(self.labels):
            feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])

    if self.task_weights is not None and w_b is not None:
        feed_dict[self.task_weights[0].out_tensor] = w_b

    if self.features is not None:
        merged = ConvMol.agglomerate_mols(X_b)
        # Feature layers 0..2 take atom features, degree slice, membership.
        feed_dict[self.features[0].out_tensor] = merged.get_atom_features()
        feed_dict[self.features[1].out_tensor] = merged.deg_slice
        feed_dict[self.features[2].out_tensor] = merged.membership

        # Fetch the adjacency lists once instead of once per degree.
        # List entry ``degree`` feeds feature layer ``degree + 2``.
        deg_adj_lists = merged.get_deg_adjacency_lists()
        for degree in range(1, self.max_degree + 1):
            feed_dict[self.features[degree + 2].out_tensor] = (
                deg_adj_lists[degree])

    return feed_dict
def data_generator(dataset, epochs=1, predict=False, pad_batches=True):
    """Yield one feed dict per batch of ``dataset``, iterated in order.

    The names ``batch_size``, ``labels``, ``weights``, ``atom_features``,
    ``degree_slice``, ``membership`` and ``deg_adjs`` are not defined in
    this function; they are resolved from the enclosing scope.

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    predict : bool
        Accepted for interface compatibility; not read by this function.
    pad_batches : bool
        Forwarded to ``dataset.iterbatches`` (``deterministic`` is always
        True).

    Yields
    ------
    dict
        One entry per element of ``labels``, plus the weights and graph
        arrays.
    """
    for _ in range(epochs):
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                batch_size, pad_batches=pad_batches, deterministic=True):
            feed = {}
            for index, label in enumerate(labels):
                feed[label] = to_one_hot(y_b[:, index])
            feed[weights] = w_b

            merged = ConvMol.agglomerate_mols(X_b)
            feed[atom_features] = merged.get_atom_features()
            feed[degree_slice] = merged.deg_slice
            feed[membership] = merged.membership

            # Fetch the adjacency lists once per batch rather than on every
            # loop iteration; entry 0 is skipped.
            deg_adj_lists = merged.get_deg_adjacency_lists()
            for degree in range(1, len(deg_adj_lists)):
                feed[deg_adjs[degree - 1]] = deg_adj_lists[degree]
            yield feed
def _construct_feed_dict(self, X_b, y_b, w_b, ids_b):
    """Map one batch onto the ``out_tensor`` of this model's input layers.

    Parameters
    ----------
    X_b
        Batch of molecules handed to ``ConvMol.agglomerate_mols``.
    y_b
        Label array indexed as ``y_b[:, index]``, or None to skip labels.
    w_b
        Weights for the batch, or None to skip weights.
    ids_b
        Not read by this method.

    Returns
    -------
    dict
        ``out_tensor`` -> value for every input that could be filled in.
    """
    feed_dict = {}

    if y_b is not None:
        for index, label in enumerate(self.labels):
            feed_dict[label.out_tensor] = to_one_hot(y_b[:, index])

    if self.task_weights is not None and w_b is not None:
        feed_dict[self.task_weights[0].out_tensor] = w_b

    if self.features is not None:
        merged = ConvMol.agglomerate_mols(X_b)
        # Feature layers 0..2 take atom features, degree slice, membership.
        feed_dict[self.features[0].out_tensor] = merged.get_atom_features()
        feed_dict[self.features[1].out_tensor] = merged.deg_slice
        feed_dict[self.features[2].out_tensor] = merged.membership

        # Fetch the adjacency lists once instead of once per degree.
        # List entry ``degree`` feeds feature layer ``degree + 2``.
        deg_adj_lists = merged.get_deg_adjacency_lists()
        for degree in range(1, self.max_degree + 1):
            feed_dict[self.features[degree + 2].out_tensor] = (
                deg_adj_lists[degree])

    return feed_dict
def data_generator(dataset, n_epoch=1, predict=False):
    """Yield one feed dict per padded, in-order batch of ``dataset``.

    The names ``n_batch``, ``label15``, ``atom_features``,
    ``degree_slice``, ``membership`` and ``deg_adjs`` are not defined in
    this function; they are resolved from the enclosing scope. The batch
    weights ``w_b`` are not fed.

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    n_epoch : int
        Number of passes over ``dataset``.
    predict : bool
        When false, a progress line is printed at the start of each epoch.

    Yields
    ------
    dict
        One entry per element of ``label15`` plus the graph arrays.
    """
    for epoch_index in range(n_epoch):
        if not predict:
            print('Starting epoch %i' % epoch_index)

        batches = dataset.iterbatches(
            n_batch, pad_batches=True, deterministic=True)
        for X_b, y_b, w_b, ids_b in batches:
            # One one-hot label array per placeholder in ``label15``.
            feed = {
                placeholder: to_one_hot(y_b[:, task])
                for task, placeholder in enumerate(label15)
            }

            merged = ConvMol.agglomerate_mols(X_b)
            feed[atom_features] = merged.get_atom_features()
            feed[degree_slice] = merged.deg_slice
            feed[membership] = merged.membership

            # Exactly ten adjacency entries are fed: list entries 1..10 go
            # to ``deg_adjs[0..9]``.
            adjacency = merged.get_deg_adjacency_lists()
            for slot in range(10):
                feed[deg_adjs[slot]] = adjacency[slot + 1]
            yield feed
def data_generator(dataset, predict=False, pad_batches=True):
    """Yield ``(inputs, labels, weights)`` for each batch of ``dataset``.

    ``batch_size`` and ``n_tasks`` are not defined in this function; they
    are resolved from the enclosing scope. Labels are encoded with two
    classes (the literal 2 below).

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    predict : bool
        Accepted for interface compatibility; not read by this function.
    pad_batches : bool
        Forwarded to ``dataset.iterbatches`` (``deterministic`` is always
        True).

    Yields
    ------
    tuple
        ``(inputs, labels, weights)``: ``inputs`` is a list of the atom
        features, degree slice, membership array and the adjacency lists
        from entry 1 onward; ``labels`` and ``weights`` are one-element
        lists.
    """
    # iterbatches: get an object that iterates over minibatches from the
    # dataset.
    for X_b, y_b, w_b, ids_b in dataset.iterbatches(
            batch_size, pad_batches=pad_batches, deterministic=True):
        # Concatenate the list of ConvMol objects into one mol object that
        # can be used to feed into tensorflow placeholders.
        merged = ConvMol.agglomerate_mols(X_b)

        inputs = [
            merged.get_atom_features(),
            merged.deg_slice,
            np.array(merged.membership),
        ]

        # Adjacency lists grouped by atom degree. Fetched once per batch
        # instead of twice per loop iteration; entry 0 is skipped.
        deg_adj_lists = merged.get_deg_adjacency_lists()
        inputs.extend(deg_adj_lists[1:])

        batch_labels = [to_one_hot(y_b.flatten(), 2).reshape(-1, n_tasks, 2)]
        batch_weights = [w_b]
        yield (inputs, batch_labels, batch_weights)
def amino_acid_embedding(self, name=None):
    """Build a graph-convolution embedding over the amino-acid molecules.

    Every SMILES string in ``AminoAcid_SMILES`` is featurized, the
    resulting ConvMols are agglomerated, and the combined graph is passed
    as constants through GraphConv -> BatchNorm -> GraphPool (twice),
    a Dense layer, BatchNorm, GraphGather and finally AminoAcidPad.

    Parameters
    ----------
    name : str, optional
        Prefix for the names of all layers created here. When None, a
        prefix is generated from ``self.module_count``, which is then
        incremented.

    Returns
    -------
    The ``AminoAcidPad`` layer that terminates the stack.
    """
    if name is None:
        name = 'AAEmbedding_' + str(self.module_count) + '_'
        self.module_count += 1

    # Maximum degree used consistently for agglomeration and every graph
    # layer below.
    max_deg = 3

    feat = ConvMolFeaturizer()
    featurized_AA = [
        feat._featurize(Chem.MolFromSmiles(smile))
        for smile in AminoAcid_SMILES
    ]
    multiConvMol = ConvMol.agglomerate_mols(featurized_AA, max_deg=max_deg)

    # The graph is fixed, so its arrays are baked in as constants.
    atom_features = TensorWrapper(
        tf.constant(multiConvMol.get_atom_features(), dtype=tf.float32),
        name=name + 'atom_features')
    degree_slice = TensorWrapper(
        tf.constant(multiConvMol.deg_slice, dtype=tf.int32),
        name=name + 'degree')
    membership = TensorWrapper(
        tf.constant(multiConvMol.membership, dtype=tf.int32),
        name=name + 'membership')

    # Fetch the adjacency lists once; entry 0 is skipped, so wrapper ``i``
    # holds list entry ``i + 1``.
    deg_adj_lists = multiConvMol.get_deg_adjacency_lists()
    deg_adjs = [
        TensorWrapper(
            tf.constant(deg_adj_lists[i + 1], dtype=tf.int32),
            name=name + 'deg_' + str(i))
        for i in range(max_deg)
    ]

    # Inputs shared by every graph layer after its primary input.
    graph_topology = [degree_slice, membership] + deg_adjs

    gc1 = GraphConv(
        64,
        max_deg=max_deg,
        activation_fn=tf.nn.relu,
        in_layers=[atom_features] + graph_topology,
        name=name + 'gc1')
    batch_norm1 = BatchNorm(
        in_layers=[gc1, self.training_placeholder], name=name + 'bn1')
    gp1 = GraphPool(
        max_degree=max_deg,
        in_layers=[batch_norm1] + graph_topology,
        name=name + 'gp1')

    gc2 = GraphConv(
        64,
        max_deg=max_deg,
        activation_fn=tf.nn.relu,
        in_layers=[gp1] + graph_topology,
        name=name + 'gc2')
    batch_norm2 = BatchNorm(
        in_layers=[gc2, self.training_placeholder], name=name + 'bn2')
    gp2 = GraphPool(
        max_degree=max_deg,
        in_layers=[batch_norm2] + graph_topology,
        name=name + 'gp2')

    # NOTE(review): `/` is true division on Python 3 and yields a float
    # here; confirm Dense accepts that or switch to `//`.
    dense = Dense(
        out_channels=self.embedding_length/2,
        activation_fn=tf.nn.relu,
        in_layers=[gp2],
        name=name + 'dense1')
    batch_norm3 = BatchNorm(
        in_layers=[dense, self.training_placeholder], name=name + 'bn3')

    # NOTE(review): batch_size=21 presumably equals len(AminoAcid_SMILES)
    # -- confirm against that constant.
    readout = GraphGather(
        batch_size=21,
        activation_fn=tf.nn.tanh,
        in_layers=[batch_norm3] + graph_topology,
        name=name + 'gg')
    padding = AminoAcidPad(
        embedding_length=self.embedding_length,
        in_layers=[readout],
        name=name + 'pad')
    return padding
def default_generator(self,
                      dataset,
                      epochs=1,
                      mode='fit',
                      deterministic=True,
                      pad_batches=True):
    """Yield ``(inputs, labels, weights)`` for each batch of ``dataset``.

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    mode : str
        Not read by this method; the classification check below uses
        ``self.mode``, not this argument.
    deterministic : bool
        Forwarded to ``dataset.iterbatches``.
    pad_batches : bool
        Forwarded to ``dataset.iterbatches``.

    Yields
    ------
    tuple
        ``(inputs, [y_b], [w_b])`` where ``inputs`` lists the atom
        features, degree slice, membership array, the batch's sample count
        and the adjacency lists from entry 1 onward.
    """
    for _ in range(epochs):
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                batch_size=self.batch_size,
                deterministic=deterministic,
                pad_batches=pad_batches):
            if self.mode == 'classification':
                # Labels are flattened, one-hot encoded, then reshaped to
                # (-1, n_tasks, n_classes).
                y_b = to_one_hot(y_b.flatten(), self.n_classes).reshape(
                    -1, self.n_tasks, self.n_classes)

            merged = ConvMol.agglomerate_mols(X_b)
            n_samples = np.array(X_b.shape[0])
            inputs = [
                merged.get_atom_features(),
                merged.deg_slice,
                np.array(merged.membership),
                n_samples,
            ]

            # Fetch the adjacency lists once per batch instead of twice per
            # loop iteration; entry 0 is skipped.
            deg_adj_lists = merged.get_deg_adjacency_lists()
            inputs.extend(deg_adj_lists[1:])
            yield (inputs, [y_b], [w_b])
def default_generator(self,
                      dataset,
                      epochs=1,
                      predict=False,
                      pad_batches=True):
    """Yield one feed dict per padded, in-order batch of ``dataset``.

    Parameters
    ----------
    dataset
        Object exposing ``iterbatches``; every batch unpacks to
        ``(X_b, y_b, w_b, ids_b)``.
    epochs : int
        Number of passes over ``dataset``.
    predict : bool
        Accepted for interface compatibility; not read by this method.
    pad_batches : bool
        NOTE: not forwarded. ``iterbatches`` is always called with
        ``pad_batches=True`` and ``deterministic=True``.

    Yields
    ------
    dict
        Keyed by the entries of ``self.my_labels``,
        ``self.my_task_weights``, ``self.atom_features``,
        ``self.degree_slice``, ``self.membership`` and the entries of
        ``self.deg_adjs``.
    """
    for _ in range(epochs):
        for X_b, y_b, w_b, ids_b in dataset.iterbatches(
                self.batch_size, pad_batches=True, deterministic=True):
            feed = {}
            for index, label in enumerate(self.my_labels):
                # The two modes are mutually exclusive; any other mode
                # leaves the label unset.
                if self.mode == 'classification':
                    feed[label] = to_one_hot(y_b[:, index])
                elif self.mode == 'regression':
                    feed[label] = np.expand_dims(y_b[:, index], -1)
            feed[self.my_task_weights] = w_b

            merged = ConvMol.agglomerate_mols(X_b)
            feed[self.atom_features] = merged.get_atom_features()
            feed[self.degree_slice] = merged.deg_slice
            feed[self.membership] = merged.membership

            # Fetch the adjacency lists once per batch instead of twice per
            # loop iteration; entry 0 is skipped.
            deg_adj_lists = merged.get_deg_adjacency_lists()
            for degree in range(1, len(deg_adj_lists)):
                feed[self.deg_adjs[degree - 1]] = deg_adj_lists[degree]
            yield feed
# data_generator: yields one feed dict per batch of `dataset`, for `epochs`
# passes (iterbatches is called with self.batch_size, the caller's
# pad_batches, and deterministic=True).
#
# Per batch, as far as the statements below show:
#   * each entry of self.labels is fed to_one_hot(y_b[:, index]);
#   * column 0 of X_b is agglomerated into one ConvMol, and the remaining
#     columns X_b[:, 1:] are fed to self.circular_feat;
#   * when `prior_label` is truthy, a vector of length self.num_task * 2 is
#     built per sample: for each selected task `t` with non-zero weight,
#     slot 2*t+1 is set when y_b == 1, otherwise slot 2*t, and that sample's
#     weight for `t` is multiplied by 0.001. The vector of the molecule an
#     atom belongs to (via `membership`) is concatenated onto that atom's
#     feature row;
#       - task is None: with probability 0.5 a random subset of tasks is
#         selected (size drawn by random.randint(0, num_task - 1)), otherwise
#         none;
#       - task given: `num_prior` tasks are sampled from all tasks except
#         `task`; w_b[e, task] is set to 1.0 and both slots of `task` are
#         zeroed;
#   * otherwise the unmodified atom features are fed;
#   * self.weights, self.degree_slice, self.membership and self.deg_adjs
#     (adjacency entries 1..) are fed.
#
# w_b is modified in place by the indexed assignments. Results depend on the
# state of the `random` module and on the order of its calls.
#
# NOTE(review): the source line has lost its line breaks and indentation, so
# the exact nesting of the weight/prior updates cannot be confirmed from
# here. In particular it is unclear whether `w_b = w_b*(0.1) + ...` is live
# code or part of the `#if epochs < 12:` comment -- check against the
# original file before changing anything.
# NOTE(review): the prior loops run over range(self.batch_size) -- presumably
# this relies on batches being padded to full size; confirm for
# pad_batches=False.
def data_generator(self, dataset, prior_label , task = None, num_prior = 0\ , epochs=1, pad_batches=True): """Data generator for training and evaluation""" for epoch in range(epochs): for ind, (X_b, y_b, w_b, ids_b) in enumerate( dataset.iterbatches( self.batch_size, pad_batches=pad_batches, deterministic=True)): d = {} for index, label in enumerate(self.labels): d[label] = to_one_hot(y_b[:, index]) #if epochs < 12: w_b = w_b*(0.1) + (w_b*y_b*self.scaled_w)/10.0 multiConvMol = ConvMol.agglomerate_mols(X_b[:,0]) circular_feat = X_b[:,1:] d[self.circular_feat] = circular_feat """Encode labels into the atom_features""" if prior_label: prior = [] if task is None: for e in range(self.batch_size): arr = np.zeros(self.num_task * 2) if random.random() < 0.5: index = random.sample(range(self.num_task), \ random.randint(0, self.num_task -1)) else: index = [] if len(index) != 0: for sth in index: if w_b[e,sth] != 0: if y_b[e,sth] == 1: arr[2*sth+1] = 1 else: arr[2*sth] = 1 w_b[e,sth] = w_b[e,sth]*0.001 prior.append(arr) else: for e in range(self.batch_size): arr = np.zeros(self.num_task * 2) list_t = list(range(self.num_task)) list_t.pop(task) index = random.sample(list_t, num_prior) if len(index) != 0: for sth in index: if w_b[e,sth] != 0: if y_b[e,sth] == 1: arr[2*sth+1] = 1 else: arr[2*sth] = 1 w_b[e,sth] = w_b[e,sth]*0.001 prior.append(arr) w_b[e,task] = 1.0 arr[2*task] = 0. arr[2*task+1] = 0. 
prior = np.array(prior) atom_feat = multiConvMol.get_atom_features() member = multiConvMol.membership new_atom_feats = [] for i in range(atom_feat.shape[0]): new_atom_feat = np.concatenate((atom_feat[i],\ prior[member[i]])) new_atom_feats.append(new_atom_feat) new_atom_feats = np.array(new_atom_feats) d[self.atom_features] = new_atom_feats else: d[self.atom_features] = multiConvMol.get_atom_features() d[self.weights] = w_b d[self.degree_slice] = multiConvMol.deg_slice d[self.membership] = multiConvMol.membership for i in range(1, len(multiConvMol.get_deg_adjacency_lists())): d[self.deg_adjs[i - 1]] = multiConvMol.get_deg_adjacency_lists()[i] yield d