def test_graph_conv_atom_features(self):
  """Train on custom per-atom properties and verify a save/load round trip.

  Attaches a random value to every atom as an RDKit property, labels each
  molecule with the sum of those values, then checks that a reloaded model
  reproduces the trained model's predictions.
  """
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'Raw', num_tasks=1)
  atom_feature_name = 'feature'
  y = []
  for mol in dataset.X:
    atom_features = []
    for atom in mol.GetAtoms():
      val = np.random.normal()
      mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                  str(val))
      # Bug fix: append the value actually stored on the atom, not a second
      # independent random draw — otherwise the label is uncorrelated with
      # the atom features the model sees.
      atom_features.append(val)
    y.append(np.sum(atom_features))
  featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
  X = featurizer.featurize(dataset.X)
  dataset = deepchem.data.NumpyDataset(X, np.array(y))
  batch_size = 50
  model = GraphConvModel(
      len(tasks),
      number_atom_features=featurizer.feature_length(),
      batch_size=batch_size,
      mode='regression')
  model.fit(dataset, nb_epoch=1)
  y_pred1 = model.predict(dataset)
  model.save()
  model2 = TensorGraph.load_from_dir(model.model_dir)
  y_pred2 = model2.predict(dataset)
  # Compare floats numerically rather than bit-for-bit (matches the sibling
  # test that uses np.allclose).
  self.assertTrue(np.allclose(y_pred1, y_pred2))
def test_graph_conv_atom_features():
  """Train a regression model on custom per-atom features.

  Each molecule's label is the sum of the random per-atom property values
  stored on its atoms; the model is fit for one epoch and used to predict.
  """
  tasks, dataset, transformers, metric = get_dataset(
      'regression', 'Raw', num_tasks=1)
  atom_feature_name = 'feature'
  y = []
  for mol in dataset.X:
    atom_features = []
    for atom in mol.GetAtoms():
      val = np.random.normal()
      mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                  str(val))
      # Bug fix: sum the values actually written to the atom properties, not
      # a fresh random draw — the label must reflect the stored features.
      atom_features.append(val)
    y.append([np.sum(atom_features)])
  featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
  X = featurizer.featurize(dataset.X)
  dataset = dc.data.NumpyDataset(X, np.array(y))
  batch_size = 50
  model = GraphConvModel(
      len(tasks),
      number_atom_features=featurizer.feature_length(),
      batch_size=batch_size,
      mode='regression')
  model.fit(dataset, nb_epoch=1)
  y_pred1 = model.predict(dataset)
def test_graph_conv_atom_features(self):
  """Custom per-atom features: train, save, reload, and compare predictions."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'Raw', num_tasks=1)
  atom_feature_name = 'feature'
  y = []
  for mol in dataset.X:
    atom_features = []
    for atom in mol.GetAtoms():
      val = np.random.normal()
      mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                  str(val))
      # Bug fix: append the stored property value rather than a second
      # independent random number, so labels match the atom features.
      atom_features.append(val)
    y.append([np.sum(atom_features)])
  featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
  X = featurizer.featurize(dataset.X)
  dataset = dc.data.NumpyDataset(X, np.array(y))
  batch_size = 50
  model = GraphConvModel(
      len(tasks),
      number_atom_features=featurizer.feature_length(),
      batch_size=batch_size,
      mode='regression')
  model.fit(dataset, nb_epoch=1)
  y_pred1 = model.predict(dataset)
  model.save()
  model2 = TensorGraph.load_from_dir(model.model_dir)
  y_pred2 = model2.predict(dataset)
  self.assertTrue(np.allclose(y_pred1, y_pred2))
def generate_graph_conv_model():
  """Train a graph-conv classifier on dude_ace.csv and return the model.

  Loads the dataset from CSV, fits a single-task classification model, and
  prints a Matthews-correlation sanity check.
  """
  batch_size = 128
  model = GraphConvModel(
      1,
      batch_size=batch_size,
      mode="classification",
      model_dir="/tmp/mk01/model_dir")
  dataset_file = "dude_ace.csv"
  tasks = ["is_active"]
  featurizer = dc.feat.ConvMolFeaturizer()
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="SMILES", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  metrics = [
      dc.metrics.Metric(
          dc.metrics.matthews_corrcoef, np.mean, mode="classification")
  ]
  # Removed unused locals (splitter, score lists, transformers) that were
  # created but never read.
  model.fit(dataset)
  # NOTE: evaluation is on the training set, so this is only a sanity check,
  # not a measure of generalization.
  print(model.evaluate(dataset, metrics))
  return model
def test_graph_conv_regression_model(self):
  """Fit a graph-conv regressor and require low mean absolute error."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv')
  model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
  model.fit(dataset, nb_epoch=100)
  scores = model.evaluate(dataset, [metric], transformers)
  assert all(error < 0.1 for error in scores['mean_absolute_error'])
def test_graph_conv_model(self):
  """Train a graph-conv classifier and require ROC AUC of at least 0.9."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'classification', 'GraphConv')
  model = GraphConvModel(len(tasks), batch_size=50, mode='classification')
  model.fit(dataset, nb_epoch=10)
  scores = model.evaluate(dataset, [metric], transformers)
  assert scores['mean-roc_auc_score'] >= 0.9
def test_graph_conv_regression_model():
  """Regression smoke test with batch normalization disabled."""
  tasks, dataset, transformers, metric = get_dataset('regression', 'GraphConv')
  model = GraphConvModel(
      len(tasks), batch_size=10, batch_normalize=False, mode='regression')
  model.fit(dataset, nb_epoch=100)
  scores = model.evaluate(dataset, [metric], transformers)
  assert scores['mean_absolute_error'] < 0.1
def test_graph_conv_error_bars(self):
  """Bayesian prediction should return per-sample, per-task mean and std."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv', num_tasks=1)
  model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
  model.fit(dataset, nb_epoch=1)
  mu, sigma = model.bayesian_predict(
      dataset, transformers, untransform=True, n_passes=24)
  expected = (len(dataset), len(tasks))
  assert mu.shape == expected
  assert sigma.shape == expected
def test_graph_conv_regression_model(self):
  """A saved-and-reloaded regression model should reproduce its scores."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv')
  batch_size = 50
  model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')
  model.fit(dataset, nb_epoch=1)
  scores = model.evaluate(dataset, [metric], transformers)
  model.save()
  model = TensorGraph.load_from_dir(model.model_dir)
  scores2 = model.evaluate(dataset, [metric], transformers)
  # Bug fix: scores2 was previously computed but never checked, so the test
  # could not fail on a broken reload. Assert the metric is preserved, as the
  # other save/load tests in this file do.
  assert np.allclose(scores['mean_absolute_error'],
                     scores2['mean_absolute_error'])
def test_neural_fingerprint_retrieval():
  """Embeddings should have width 2 * dense_layer_size per sample."""
  tasks, dataset, transformers, metric = get_dataset('classification',
                                                     'GraphConv')
  fp_size = 3
  model = GraphConvModel(
      len(tasks),
      batch_size=50,
      dense_layer_size=fp_size,
      mode='classification')
  model.fit(dataset, nb_epoch=1)
  fingerprints = np.array(model.predict_embedding(dataset))[:len(dataset)]
  assert (len(dataset), fp_size * 2) == fingerprints.shape
def test_graph_conv_regression_model(self):
  """MAE should be low and preserved across a save/load round trip."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv')
  model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
  model.fit(dataset, nb_epoch=100)
  scores = model.evaluate(dataset, [metric], transformers)
  assert all(err < 0.1 for err in scores['mean_absolute_error'])
  model.save()
  reloaded = TensorGraph.load_from_dir(model.model_dir)
  scores2 = reloaded.evaluate(dataset, [metric], transformers)
  assert np.allclose(scores['mean_absolute_error'],
                     scores2['mean_absolute_error'])
def test_change_loss_function(self):
  """A reloaded model should train a submodel built on a new L2 loss."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv', num_tasks=1)
  model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
  model.fit(dataset, nb_epoch=1)
  model.save()
  # Load without restoring weights so the graph can be extended first.
  model2 = TensorGraph.load_from_dir(model.model_dir, restore=False)
  label_layer = model2.labels[-1]
  output_layer = model2.outputs[-1]
  loss = ReduceSum(L2Loss(in_layers=[label_layer, output_layer]))
  submodel = model2.create_submodel(loss=loss)
  model2.restore()
  model2.fit(dataset, nb_epoch=1, submodel=submodel)
def test_graph_conv_model(self):
  """ROC AUC should exceed 0.9 and survive a save/load round trip."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'classification', 'GraphConv')
  model = GraphConvModel(len(tasks), batch_size=50, mode='classification')
  model.fit(dataset, nb_epoch=10)
  scores = model.evaluate(dataset, [metric], transformers)
  assert scores['mean-roc_auc_score'] >= 0.9
  model.save()
  reloaded = TensorGraph.load_from_dir(model.model_dir)
  scores2 = reloaded.evaluate(dataset, [metric], transformers)
  assert np.allclose(scores['mean-roc_auc_score'],
                     scores2['mean-roc_auc_score'])
def test_graph_conv_model_no_task():
  """A trained model should predict on a dataset featurized with tasks=[]."""
  tasks, dataset, _, __ = get_dataset('classification', 'GraphConv')
  model = GraphConvModel(
      len(tasks), batch_size=10, batch_normalize=False, mode='classification')
  model.fit(dataset, nb_epoch=20)
  # Build an unlabeled dataset (no y, ensured by tasks=[]) and predict on it.
  bace_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv"
  dc.utils.data_utils.download_url(url=bace_url, name="bace_tmp.csv")
  loader = dc.data.CSVLoader(
      tasks=[], smiles_field='mol', featurizer=dc.feat.ConvMolFeaturizer())
  unlabeled = loader.featurize(
      os.path.join(dc.utils.data_utils.get_data_dir(), "bace_tmp.csv"))
  model.predict(unlabeled)
def test_neural_fingerprint_retrieval(self):
  """Neural fingerprints should be (n_samples, 2 * dense_layer_size)."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'classification', 'GraphConv')
  fp_size = 3
  model = GraphConvModel(
      len(tasks),
      batch_size=50,
      dense_layer_size=fp_size,
      mode='classification')
  model.fit(dataset, nb_epoch=1)
  fingerprints = model.predict(dataset, outputs=model.neural_fingerprint)
  fingerprints = np.array(fingerprints)[:len(dataset)]
  self.assertEqual((len(dataset), fp_size * 2), fingerprints.shape)
def test_neural_fingerprint_retrieval(self):
  """Fingerprint output width should equal twice the dense layer size."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'classification', 'GraphConv')
  fp_size = 3
  model = GraphConvModel(
      len(tasks),
      batch_size=50,
      dense_layer_size=fp_size,
      mode='classification')
  model.fit(dataset, nb_epoch=1)
  embeddings = np.array(
      model.predict(dataset, outputs=model.neural_fingerprint))[:len(dataset)]
  self.assertEqual((len(dataset), fp_size * 2), embeddings.shape)
def test_graph_conv_regression_model(self):
  """Low MAE after training; scores preserved (rtol=1e-4) across reload."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv')
  model = GraphConvModel(len(tasks), batch_size=50, mode='regression')
  model.fit(dataset, nb_epoch=100)
  scores = model.evaluate(dataset, [metric], transformers)
  assert all(err < 0.1 for err in scores['mean_absolute_error'])
  model.save()
  reloaded = TensorGraph.load_from_dir(model.model_dir)
  scores2 = reloaded.evaluate(dataset, [metric], transformers)
  assert np.allclose(
      scores['mean_absolute_error'],
      scores2['mean_absolute_error'],
      rtol=1e-4)
def test_graph_conv_model():
  """Train on the covid CSV, plot validation predictions, print metrics.

  Evaluates Matthews correlation and ROC AUC on the train and test splits.
  """
  model = GraphConvModel(
      1,
      batch_size=2000,
      mode="classification",
      model_dir="/tmp/covid/model_dir")
  dataset_file = "covid_mpro_combined_data_sources.csv"
  tasks = ["isHit"]
  featurizer = dc.feat.ConvMolFeaturizer()
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="SMILES", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  metrics = [
      dc.metrics.Metric(
          dc.metrics.matthews_corrcoef, np.mean, mode="classification")
  ]
  splitter = dc.splits.RandomSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)
  model.fit(train_dataset)
  # Flatten the per-sample class-probability pairs for plotting.
  pred = [x.flatten() for x in model.predict(valid_dataset)]
  pred_df = pd.DataFrame(pred, columns=["neg", "pos"])
  pred_df["active"] = [int(x) for x in valid_dataset.y]
  pred_df["SMILES"] = valid_dataset.ids
  sns.boxplot(pred_df.active, pred_df.pos)
  print(model.evaluate(train_dataset, metrics))
  print(model.evaluate(test_dataset, metrics))
  metrics = [
      dc.metrics.Metric(
          dc.metrics.roc_auc_score, np.mean, mode="classification")
  ]
  print(model.evaluate(train_dataset, metrics))
  print(model.evaluate(test_dataset, metrics))
def test_graph_conv_regression_uncertainty(self):
  """Uncertainty estimates should be positive and bounded by signal scale."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv')
  model = GraphConvModel(
      len(tasks),
      batch_size=50,
      mode='regression',
      dropout=0.1,
      uncertainty=True)
  model.fit(dataset, nb_epoch=100)
  # Predict the output and uncertainty.
  pred, std = model.predict_uncertainty(dataset)
  mean_error = np.mean(np.abs(dataset.y - pred))
  mean_value = np.mean(np.abs(dataset.y))
  mean_std = np.mean(std)
  assert mean_error < 0.5 * mean_value
  assert mean_std > 0.5 * mean_error
  assert mean_std < mean_value
def test_graph_conv_regression_uncertainty(self):
  """Dropout-based uncertainty should track the model's error scale."""
  tasks, dataset, transformers, metric = self.get_dataset(
      'regression', 'GraphConv')
  model = GraphConvModel(
      len(tasks),
      batch_size=50,
      mode='regression',
      dropout=0.1,
      uncertainty=True)
  model.fit(dataset, nb_epoch=100)
  # Predict the output and uncertainty.
  pred, std = model.predict_uncertainty(dataset)
  abs_error = np.mean(np.abs(dataset.y - pred))
  abs_signal = np.mean(np.abs(dataset.y))
  avg_std = np.mean(std)
  assert abs_error < 0.5 * abs_signal
  assert avg_std > 0.5 * abs_error
  assert avg_std < abs_signal
from deepchem.molnet import load_delaney

# Load the Delaney (ESOL) solubility dataset with graph-conv featurization.
delaney_tasks, delaney_datasets, transformers = load_delaney(
    featurizer='GraphConv', split='index')
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Metric: mean Pearson R^2 across tasks.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

# Do setup required for tf/keras models
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 128
model = GraphConvModel(
    len(delaney_tasks), batch_size=batch_size, mode='regression')

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Train scores")
print(train_scores)
print("Validation scores")
print(valid_scores)
#train the model
batch_size = 2000
model = GraphConvModel(
    1,
    batch_size=batch_size,
    mode="classification",
    model_dir="/tmp/covid/model_dir")
dataset_file = "covid_mpro_combined_data_sources.csv"
tasks = ["isHit"]
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=tasks, smiles_field="SMILES", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
model.fit(dataset)
# Alternative: reload a previously trained model instead of fitting.
#model = GraphConvModel(1, batch_size=128,mode="classification",model_dir="/tmp/mk01/model_dir")
#model.restore()

#make predictions
featurizer = dc.feat.ConvMolFeaturizer()
df = pd.read_csv("emol_10M.csv", sep=",")
#print('num rows in file',df.size)
#df.columns=["SMILES","Name"]
rows, cols = df.shape
# CSVLoader requires a label column; fill one with zeros as a placeholder.
df["Val"] = [0] * rows  #just add add a dummy column to keep the featurizer happy
infile_name = "emol_10M_withVal.csv"
df.to_csv(infile_name, index=False)
# Number of features
n_feat = 75
# Batch size of models
batch_size = 128
# Regression graph-conv model with dropout regularization.
model = GraphConvModel(
    len(delaney_tasks), batch_size=batch_size, mode='regression', dropout=0.2)

# In[5]:

# Fit trained model
model.fit(train, nb_epoch=100)
print("Evaluating model")
train_scores = model.evaluate(train, [metric], transformers)
valid_scores = model.evaluate(valid, [metric], transformers)
print("Train scores")
print(train_scores)
print("Validation scores")
print(valid_scores)

# In[6]:

#Predictability test for trained model
model_dir=model_dir, random_seed=0) metric = dc.metrics.Metric(dc.metrics.r2_score, mode='regression') ckpt = tf.train.Checkpoint(step=tf.Variable(1)) manager = tf.train.CheckpointManager(ckpt, model_dir, max_to_keep=20) start_time = time.time() num_epochs = 100 losses_train = [] score_valid = [] score_train = [] for i in range(num_epochs): loss_train = model.fit(train_dataset, nb_epoch=1, deterministic=True) ckpt.step.assign_add(1) save_path = manager.save() print("Saved checkpoint for step {}: {} ".format(int(ckpt.step), save_path)) model.save_checkpoint(max_checkpoints_to_keep=20, model_dir=save_path) #model.restore() R2_train = model.evaluate(train_dataset, [metric])['r2_score'] R2_valid = model.evaluate(valid_dataset, [metric])['r2_score'] print("Epoch %d loss_train: %f R2_train %f R2_valid: %f " % (i, loss_train, R2_train, R2_valid)) losses_train.append(loss_train) score_valid.append(R2_valid) score_train.append(R2_train)