def test_single_carbon(self):
    """Check the degree adjacency lists produced for a lone carbon atom."""
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in ['C']]
    conv_mol = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # The molecule consists of exactly one atom.
    assert conv_mol.get_num_atoms() == 1

    adjacency = conv_mol.get_deg_adjacency_lists()

    # The single atom has no bonds, so it sits in the degree-0 bucket:
    # one row, zero neighbour columns.
    assert np.array_equal(adjacency[0], np.zeros([1, 0], dtype=np.int32))

    # Every higher-degree bucket is empty: zero rows, `degree` columns.
    for degree in range(1, 7):
        assert np.array_equal(adjacency[degree],
                              np.zeros([0, degree], dtype=np.int32))
def test_alkane(self):
    """Check the degree adjacency lists produced for propane ('CCC')."""
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in ['CCC']]
    propane = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # Three carbons in the alkane.
    assert propane.get_num_atoms() == 3

    expected_by_degree = [
        # No atom is isolated.
        np.zeros([0, 0], dtype=np.int32),
        # The two outer carbons each list the central carbon (index 2).
        np.array([[2], [2]], dtype=np.int32),
        # The central carbon lists both outer carbons.
        np.array([[0, 1]], dtype=np.int32),
    ]
    # Degrees 3 through 6 have no atoms at all.
    expected_by_degree += [
        np.zeros([0, degree], dtype=np.int32) for degree in range(3, 7)
    ]

    adjacency = propane.get_deg_adjacency_lists()
    for degree, wanted in enumerate(expected_by_degree):
        assert np.array_equal(adjacency[degree], wanted)
def test_graph_gather(self):
    """Smoke-test that GraphGather can be invoked on a two-molecule batch."""
    batch_size = 2
    n_features = 75
    # 'CCC' and 'C' together contribute 4 atoms to the merged graph.
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in ['CCC', 'C']]
    conv_mols = ConvMolFeaturizer().featurize(rdkit_mols)
    merged = ConvMol.agglomerate_mols(conv_mols)

    raw_features = merged.get_atom_features()
    raw_deg_slice = merged.deg_slice
    raw_membership = merged.membership
    # The degree-0 adjacency list is dropped before being fed to the layer.
    raw_deg_adjs = merged.get_deg_adjacency_lists()[1:]

    # NOTE(review): newer TensorFlow releases deprecate `test_session` in
    # favour of `session`/`cached_session` -- confirm against the pinned
    # TensorFlow version before switching.
    with self.test_session() as sess:
        layer_inputs = [
            tf.convert_to_tensor(raw_features, dtype=tf.float32),
            tf.convert_to_tensor(raw_deg_slice, dtype=tf.int32),
            tf.convert_to_tensor(raw_membership, dtype=tf.int32),
        ]
        layer_inputs += [
            tf.convert_to_tensor(adj, dtype=tf.int32) for adj in raw_deg_adjs
        ]

        gathered = GraphGather(batch_size)(*layer_inputs)
        sess.run(tf.global_variables_initializer())
        result = gathered.eval()

        # TODO(rbharath): Why is it 2*n_features instead of n_features?
        assert result.shape == (batch_size, 2 * n_features)
def test_carbon_nitrogen(self):
    """Check the degree adjacency lists for 'C[N+](C)(C)C'."""
    # The SMILES describes a central charged nitrogen of degree 4 bonded to
    # four carbons, each of degree 1 (bonded only to the nitrogen).
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in ['C[N+](C)(C)C']]
    conv_mol = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # 5 heavy atoms in the compound.
    assert conv_mol.get_num_atoms() == 5

    # Adjacency lists grouped by atom degree.  The expected indices imply
    # the four degree-1 atoms are numbered 0-3 and the degree-4 atom is 4.
    adjacency = conv_mol.get_deg_adjacency_lists()

    populated = {
        # Each outer atom lists only the central atom.
        1: np.array([[4], [4], [4], [4]], dtype=np.int32),
        # The central atom lists all four outer atoms.
        4: np.array([[0, 1, 2, 3]], dtype=np.int32),
    }
    for degree in range(7):
        wanted = populated.get(degree)
        if wanted is None:
            # Empty bucket; note the degree-0 bucket has shape (0, 0).
            wanted = np.zeros([0, degree], dtype=np.int32)
        assert np.array_equal(adjacency[degree], wanted)
def test_carbon_nitrogen(self):
    """Verify per-degree adjacency lists for 'C[N+](C)(C)C'."""
    # In this SMILES the hub atom is the charged nitrogen (degree 4); the
    # four carbons around it each have degree 1.
    smiles_batch = ['C[N+](C)(C)C']
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles_batch]
    featurized = ConvMolFeaturizer().featurize(rdkit_mols)
    conv_mol = featurized[0]

    # Five atoms in total.
    assert conv_mol.get_num_atoms() == 5

    by_degree = conv_mol.get_deg_adjacency_lists()

    def empty_bucket(width):
        # A degree bucket with no atoms: zero rows, `width` columns.
        return np.zeros([0, width], dtype=np.int32)

    # The expected values imply the degree-1 atoms occupy indices 0-3 and
    # the degree-4 hub occupies index 4.
    outer_rows = np.array([[4], [4], [4], [4]], dtype=np.int32)
    hub_row = np.array([[0, 1, 2, 3]], dtype=np.int32)

    assert np.array_equal(by_degree[0], empty_bucket(0))
    # Each of the four outer atoms points at the hub.
    assert np.array_equal(by_degree[1], outer_rows)
    assert np.array_equal(by_degree[2], empty_bucket(2))
    assert np.array_equal(by_degree[3], empty_bucket(3))
    # The hub points at every other atom.
    assert np.array_equal(by_degree[4], hub_row)
    assert np.array_equal(by_degree[5], empty_bucket(5))
    assert np.array_equal(by_degree[6], empty_bucket(6))
def test_alkane(self):
    """Verify per-degree adjacency lists for the simple alkane 'CCC'."""
    import rdkit.Chem

    smiles_batch = ['CCC']
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles_batch]
    conv_mol = ConvMolFeaturizer().featurize(rdkit_mols)[0]

    # Three carbons in the chain.
    assert conv_mol.get_num_atoms() == 3

    by_degree = conv_mol.get_deg_adjacency_lists()

    # Nothing is isolated, so the degree-0 bucket is empty.
    assert np.array_equal(by_degree[0], np.zeros([0, 0], dtype=np.int32))

    # Both terminal carbons are bonded to the central carbon (index 2).
    terminal_rows = np.array([[2], [2]], dtype=np.int32)
    assert np.array_equal(by_degree[1], terminal_rows)

    # The central carbon is bonded to both terminal carbons.
    central_row = np.array([[0, 1]], dtype=np.int32)
    assert np.array_equal(by_degree[2], central_row)

    # No atom has degree 3 or higher.
    for degree in (3, 4, 5, 6):
        assert np.array_equal(by_degree[degree],
                              np.zeros([0, degree], dtype=np.int32))
def test_single_carbon(self):
    """Test that a single carbon atom is featurized properly."""
    # Import the submodule explicitly: a bare `import rdkit` only guarantees
    # that the top-level package is loaded, so `rdkit.Chem` would resolve
    # only if some other module had already imported it.
    import rdkit.Chem

    raw_smiles = ['C']
    mols = [rdkit.Chem.MolFromSmiles(s) for s in raw_smiles]
    featurizer = ConvMolFeaturizer()
    mol = featurizer.featurize(mols)[0]

    # Only one carbon.
    assert mol.get_num_atoms() == 1

    deg_adj_lists = mol.get_deg_adjacency_lists()

    # The lone atom has no bonds: the degree-0 bucket has one row and no
    # neighbour columns.
    assert np.array_equal(deg_adj_lists[0], np.zeros([1, 0], dtype=np.int32))

    # All buckets for degrees 1..6 are empty (zero rows, `degree` columns).
    for degree in range(1, 7):
        assert np.array_equal(deg_adj_lists[degree],
                              np.zeros([0, degree], dtype=np.int32))
def test_graph_gather(self):
    """Smoke-test that GraphGather can be invoked on a two-molecule batch."""
    batch_size = 2
    n_features = 75
    # 'CCC' and 'C' together contribute 4 atoms to the merged graph.
    smiles_batch = ['CCC', 'C']
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles_batch]
    conv_mols = ConvMolFeaturizer().featurize(rdkit_mols)
    merged = ConvMol.agglomerate_mols(conv_mols)

    raw_features = merged.get_atom_features()
    raw_deg_slice = merged.deg_slice
    raw_membership = merged.membership
    # The degree-0 adjacency list is not passed to the layer.
    raw_deg_adjs = merged.get_deg_adjacency_lists()[1:]

    with self.session() as sess:
        features_t = tf.convert_to_tensor(raw_features, dtype=tf.float32)
        deg_slice_t = tf.convert_to_tensor(raw_deg_slice, dtype=tf.int32)
        membership_t = tf.convert_to_tensor(raw_membership, dtype=tf.int32)
        deg_adjs_t = [
            tf.convert_to_tensor(adj, dtype=tf.int32) for adj in raw_deg_adjs
        ]

        gather_layer = GraphGather(batch_size)
        gathered = gather_layer(features_t, deg_slice_t, membership_t,
                                *deg_adjs_t)
        sess.run(tf.global_variables_initializer())
        result = gathered.eval()

        # TODO(rbharath): Why is it 2*n_features instead of n_features?
        assert result.shape == (batch_size, 2 * n_features)
def test_per_atom_fragmentation(self):
    """Check the fragment count when per_atom_fragmentation=True.

    For every input molecule, the featurizer output must contain as many
    entries as the molecule has heavy atoms.
    """
    import rdkit.Chem

    smiles_batch = ['CC(CO)Cc1ccccc1', 'CC']
    rdkit_mols = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles_batch]

    frag_featurizer = ConvMolFeaturizer(per_atom_fragmentation=True)
    per_mol_fragments = frag_featurizer.featurize(rdkit_mols)

    for fragments, parent_mol in zip(per_mol_fragments, rdkit_mols):
        assert len(fragments) == parent_mol.GetNumHeavyAtoms()
def predict_on_smiles(self, smiles, transformers=None, untransform=False):
    """Generate predictions for a sequence of SMILES strings, batch by batch.

    Parameters
    ----------
    smiles: sequence of str
        SMILES strings to predict on (a list or numpy array; it must support
        `len` and slicing).
    transformers: list, optional
        Forwarded to `predict_on_smiles_batch` and, when `untransform` is
        true, to `undo_transforms`.  Defaults to an empty list.
    untransform: bool, optional
        If true, pass the predictions through `undo_transforms` before
        returning them.

    Returns
    -------
    y_: numpy ndarray of shape (n_samples, n_tasks)
        An empty (0, n_tasks) array when `smiles` is empty.
    """
    # Avoid a shared mutable default argument.
    if transformers is None:
        transformers = []

    n_samples = len(smiles)
    n_tasks = len(self.outputs)

    # With no inputs there are no batches, and np.concatenate would raise on
    # an empty list.  `len` is used rather than truthiness so that numpy
    # arrays are handled too.
    if n_samples == 0:
        return np.zeros((0, n_tasks))

    featurizer = ConvMolFeaturizer()
    # Slicing clamps at the end of the sequence, so the last batch is simply
    # shorter when n_samples is not a multiple of batch_size.
    batch_predictions = [
        self.predict_on_smiles_batch(smiles[start:start + self.batch_size],
                                     featurizer, transformers)
        for start in range(0, n_samples, self.batch_size)
    ]

    # Trim to n_samples in case a batch returned more rows than requested.
    y_ = np.concatenate(batch_predictions, axis=0)[:n_samples]
    y_ = y_.reshape(-1, n_tasks)

    if untransform:
        y_ = undo_transforms(y_, transformers)
    return y_
"""
import warnings
# Install a blanket "ignore" filter for all warnings.
# NOTE(review): presumably placed before the deepchem import on purpose, to
# hide import-time warnings -- confirm before reordering these lines.
warnings.filterwarnings('ignore')
import deepchem as dc
#from deepchem.models.tensorgraph.models.graph_models import MPNNTensorGraph
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel
#from deepchem.feat import WeaveFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.graph_features import WeaveFeaturizer
from deepchem.data.data_loader import CSVLoader
import pandas as pd
import numpy as np

# Featurizer shared by both loaders below.
featurizer = ConvMolFeaturizer()
#featurizer = WeaveFeaturizer(graph_distance=True, explicit_H=False)

# Two identically configured loaders: the task column is 'LogD7.4' and the
# SMILES strings are read from the 'smiles' column.
train_loader = CSVLoader(tasks=['LogD7.4'], smiles_field='smiles', featurizer=featurizer)
test_loader = CSVLoader(tasks=['LogD7.4'], smiles_field='smiles', featurizer=featurizer)

# Featurize the training and testing CSV files (paths are relative to the
# current working directory).
X_train = train_loader.featurize('../demo_data/reg/training_set.csv')
X_test = test_loader.featurize('../demo_data/reg/testing_set.csv')

# Fit a single-task regression graph-convolution model and print its
# predictions for the test set.
model = GraphConvModel(n_tasks=1, mode='regression')
model.fit(X_train)
print(model.predict(X_test))