def test_graph_conv_atom_features(self):
    """Check that a GraphConvModel trained on custom atom features survives a
    save/load round trip.

    Each atom gets one random scalar stored as a molecule-level property with
    the "atom %08d %s" key format that ConvMolFeaturizer's ``atom_properties``
    mechanism reads back. The regression label for a molecule is the sum of
    those scalars. After a short fit, predictions from the saved-and-reloaded
    model must match the original model's predictions.
    """
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp(
                "atom %08d %s" % (atom.GetIdx(), atom_feature_name), str(val))
            # Bug fix: append the same value that was stored on the molecule so
            # the label really is the sum of the attached atom features
            # (previously an unrelated fresh random draw was appended).
            atom_features.append(val)
        y.append(np.sum(atom_features))
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = deepchem.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()
    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    # Bug fix: exact float equality (np.all(a == b)) is brittle across a
    # serialization round trip; compare with a tolerance instead, matching
    # the sibling variant of this test.
    self.assertTrue(np.allclose(y_pred1, y_pred2))
def test_graph_conv_atom_features():
    """Train a regression GraphConvModel on custom per-atom features.

    A random scalar is attached to every atom as a molecule property using the
    "atom %08d %s" key format that ConvMolFeaturizer's ``atom_properties``
    option reads; the label is the sum of those scalars per molecule.
    """
    tasks, dataset, transformers, metric = get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp(
                "atom %08d %s" % (atom.GetIdx(), atom_feature_name), str(val))
            # Bug fix: append the value actually stored on the molecule so the
            # label matches the features (was an unrelated fresh random draw).
            atom_features.append(val)
        y.append([np.sum(atom_features)])
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
def test_graph_conv_atom_features(self):
    """Check a GraphConvModel trained on custom atom features round-trips
    through save/load with (numerically) identical predictions.

    A random scalar per atom is stored as a molecule property using the
    "atom %08d %s" key format that ConvMolFeaturizer's ``atom_properties``
    mechanism consumes; each molecule's label is the sum of its scalars.
    """
    tasks, dataset, transformers, metric = self.get_dataset(
        'regression', 'Raw', num_tasks=1)
    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp(
                "atom %08d %s" % (atom.GetIdx(), atom_feature_name), str(val))
            # Bug fix: append the same value stored on the molecule so the
            # label really is the sum of the attached atom features
            # (previously an unrelated fresh random draw was appended).
            atom_features.append(val)
        y.append([np.sum(atom_features)])
    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')
    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()
    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    self.assertTrue(np.allclose(y_pred1, y_pred2))
def test_graph_conv_model_no_task():
    """Fit a classification GraphConvModel, then predict on a dataset that was
    loaded with an empty task list (i.e. no y columns at all)."""
    tasks, dataset, _, __ = get_dataset('classification', 'GraphConv')
    model = GraphConvModel(
        len(tasks), batch_size=10, batch_normalize=False, mode='classification')
    model.fit(dataset, nb_epoch=20)
    # Build a label-free dataset: tasks=[] guarantees the loader attaches no y.
    bace_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv"
    dc.utils.data_utils.download_url(url=bace_url, name="bace_tmp.csv")
    csv_path = os.path.join(dc.utils.data_utils.get_data_dir(), "bace_tmp.csv")
    loader = dc.data.CSVLoader(
        tasks=[], smiles_field='mol', featurizer=dc.feat.ConvMolFeaturizer())
    unlabeled = loader.featurize(csv_path)
    model.predict(unlabeled)
def test_neural_fingerprint_retrieval(self):
    """Verify the neural fingerprint output can be retrieved from a trained
    GraphConvModel and has shape (n_samples, 2 * dense_layer_size)."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    fp_size = 3
    model = GraphConvModel(
        len(tasks),
        batch_size=50,
        dense_layer_size=fp_size,
        mode='classification')
    model.fit(dataset, nb_epoch=1)
    # Ask predict() for the fingerprint tensor instead of the task outputs.
    raw_fps = model.predict(dataset, outputs=model.neural_fingerprint)
    fingerprints = np.array(raw_fps)[:len(dataset)]
    self.assertEqual((len(dataset), fp_size * 2), fingerprints.shape)
def test_neural_fingerprint_retrieval(self):
    """Retrieve neural fingerprints from a trained GraphConvModel and check
    their shape: one row per sample, 2 * dense_layer_size columns."""
    tasks, dataset, transformers, metric = self.get_dataset(
        'classification', 'GraphConv')
    fp_size = 3
    batch_size = 50
    n_samples = len(dataset)
    expected_shape = (n_samples, fp_size * 2)
    model = GraphConvModel(
        len(tasks),
        batch_size=batch_size,
        dense_layer_size=3,
        mode='classification')
    model.fit(dataset, nb_epoch=1)
    # predict() can emit an internal layer; trim any batch padding rows.
    fps = model.predict(dataset, outputs=model.neural_fingerprint)
    fps = np.array(fps)[:n_samples]
    self.assertEqual(expected_shape, fps.shape)
def test_graph_conv_model():
    """End-to-end GraphConv classification on the covid Mpro CSV: featurize,
    random-split, fit, box-plot validation scores, and print Matthews
    correlation and ROC-AUC for the train and test splits."""
    model = GraphConvModel(
        1,
        batch_size=2000,
        mode="classification",
        model_dir="/tmp/covid/model_dir")
    loader = dc.data.CSVLoader(
        tasks=["isHit"],
        smiles_field="SMILES",
        featurizer=dc.feat.ConvMolFeaturizer())
    dataset = loader.featurize(
        "covid_mpro_combined_data_sources.csv", shard_size=8192)
    splitter = dc.splits.RandomSplitter()
    train_dataset, valid_dataset, test_dataset = (
        splitter.train_valid_test_split(dataset))
    model.fit(train_dataset)
    # Flatten each per-sample [neg, pos] prediction into a dataframe row.
    flat_preds = [row.flatten() for row in model.predict(valid_dataset)]
    pred_df = pd.DataFrame(flat_preds, columns=["neg", "pos"])
    pred_df["active"] = [int(label) for label in valid_dataset.y]
    pred_df["SMILES"] = valid_dataset.ids
    sns.boxplot(pred_df.active, pred_df.pos)
    mcc_metrics = [
        dc.metrics.Metric(
            dc.metrics.matthews_corrcoef, np.mean, mode="classification")
    ]
    print(model.evaluate(train_dataset, mcc_metrics))
    print(model.evaluate(test_dataset, mcc_metrics))
    auc_metrics = [
        dc.metrics.Metric(
            dc.metrics.roc_auc_score, np.mean, mode="classification")
    ]
    print(model.evaluate(train_dataset, auc_metrics))
    print(model.evaluate(test_dataset, auc_metrics))
#print(len(losses)) plt.figure(1) fig, ax = plt.subplots(2, sharex='col', sharey='row') x = range(num_epochs) y_loss = losses_train ax[0].plot(x, y_loss, c='b', alpha=0.6, label='loss_train') ax[0].set(xlabel='epoch', ylabel='loss') y_score = score_train ax[1].plot(x, y_score, c='r', alpha=0.6, label='score_valid') ax[1].set(xlabel='epoch', ylabel='R2 score') ###Real value and predicted value for target train_y = train_dataset.y train_pred = model.predict(train_dataset) test1_y = test1_dataset.y test1_pred = model.predict(test1_dataset) test2_y = test2_dataset.y test2_pred = model.predict(test2_dataset) ##evaluation model dcmodel = "DeepChem" # Scores of Train Data train_mae = mean_absolute_error( dc.trans.undo_transforms(train_y, [transformers_train]), dc.trans.undo_transforms(train_pred, [transformers_train])) train_rmse = mean_squared_error(
model.fit(dataset) #model = GraphConvModel(1, batch_size=128,mode="classification",model_dir="/tmp/mk01/model_dir") #model.restore() #make predictions featurizer = dc.feat.ConvMolFeaturizer() df = pd.read_csv("emol_10M.csv", sep=",") #print('num rows in file',df.size) #df.columns=["SMILES","Name"] rows, cols = df.shape df["Val"] = [ 0 ] * rows #just add add a dummy column to keep the featurizer happy infile_name = "emol_10M_withVal.csv" df.to_csv(infile_name, index=False) loader = dc.data.CSVLoader(tasks=['Val'], smiles_field="isosmiles", featurizer=featurizer) dataset = loader.featurize(infile_name, shard_size=8192) pred = model.predict(dataset) pred_df = pd.DataFrame([x.flatten() for x in pred], columns=["Neg", "Pos"]) sns.distplot(pred_df.Pos, rug=True) combo_df = df.join(pred_df, how="outer") combo_df.sort_values("Pos", inplace=True, ascending=False) #PandasTools.AddMoleculeColumnToFrame(combo_df,"isosmiles","Mol") combo_df = combo_df.loc[combo_df['Pos'] >= 0.8] combo_df.to_csv('emol_10M_output_hits.csv', sep=',')