Exemplo n.º 1
0
    def test_graph_conv_atom_features(self):
        """Fit a GraphConvModel on a per-atom property and check save/reload.

        Every atom of every molecule in the raw dataset gets a random value,
        written to the molecule as a property named
        ``"atom <8-digit atom index> feature"``. ``ConvMolFeaturizer`` is
        constructed with ``atom_properties=['feature']`` and the featurized
        molecules are paired with a regression label equal to the sum of the
        values stored on the molecule's atoms. The model is trained for one
        epoch, saved, reloaded with ``TensorGraph.load_from_dir`` and the two
        sets of predictions are compared.
        """
        tasks, dataset, _, _ = self.get_dataset('regression',
                                                'Raw',
                                                num_tasks=1)

        atom_feature_name = 'feature'
        y = []
        for mol in dataset.X:
            atom_features = []
            for atom in mol.GetAtoms():
                val = np.random.normal()
                mol.SetProp(
                    "atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                    str(val))
                # Keep the value that was actually written to the atom so the
                # label is a function of the feature (a second random draw
                # here would make the target unrelated to the input).
                atom_features.append(val)
            y.append(np.sum(atom_features))

        featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
        X = featurizer.featurize(dataset.X)
        dataset = deepchem.data.NumpyDataset(X, np.array(y))
        batch_size = 50
        model = GraphConvModel(
            len(tasks),
            number_atom_features=featurizer.feature_length(),
            batch_size=batch_size,
            mode='regression')

        model.fit(dataset, nb_epoch=1)
        y_pred1 = model.predict(dataset)
        model.save()

        model2 = TensorGraph.load_from_dir(model.model_dir)
        y_pred2 = model2.predict(dataset)
        # Compare with a tolerance: bit-for-bit equality of floating point
        # predictions from two model instances is not a stable guarantee.
        self.assertTrue(np.allclose(y_pred1, y_pred2))
Exemplo n.º 2
0
def test_graph_conv_atom_features():
    """Smoke-test GraphConvModel with a user-supplied per-atom property.

    Every atom of every molecule in the raw dataset gets a random value,
    written to the molecule as a property named
    ``"atom <8-digit atom index> feature"``. ``ConvMolFeaturizer`` is
    constructed with ``atom_properties=['feature']``; the regression label of
    a molecule is the sum of the values stored on its atoms. The test passes
    if one epoch of training and a prediction run complete without raising.
    """
    tasks, dataset, _, _ = get_dataset('regression', 'Raw', num_tasks=1)

    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
        atom_features = []
        for atom in mol.GetAtoms():
            val = np.random.normal()
            mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                        str(val))
            # Keep the value that was actually written to the atom so the
            # label is a function of the feature (a second random draw here
            # would make the target unrelated to the input).
            atom_features.append(val)
        # One-element list per molecule -> label array with one column.
        y.append([np.sum(atom_features)])

    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(len(tasks),
                           number_atom_features=featurizer.feature_length(),
                           batch_size=batch_size,
                           mode='regression')

    model.fit(dataset, nb_epoch=1)
    # Only checks that prediction runs; the result is not inspected.
    model.predict(dataset)
Exemplo n.º 3
0
  def test_graph_conv_atom_features(self):
    """Fit a GraphConvModel on a per-atom property and check save/reload.

    Every atom of every molecule in the raw dataset gets a random value,
    written to the molecule as a property named
    ``"atom <8-digit atom index> feature"``. ``ConvMolFeaturizer`` is
    constructed with ``atom_properties=['feature']``; the regression label of
    a molecule is the sum of the values stored on its atoms. The model is
    trained for one epoch, saved, reloaded with ``TensorGraph.load_from_dir``
    and both instances must give numerically close predictions.
    """
    tasks, dataset, _, _ = self.get_dataset(
        'regression', 'Raw', num_tasks=1)

    atom_feature_name = 'feature'
    y = []
    for mol in dataset.X:
      atom_features = []
      for atom in mol.GetAtoms():
        val = np.random.normal()
        mol.SetProp("atom %08d %s" % (atom.GetIdx(), atom_feature_name),
                    str(val))
        # Keep the value that was actually written to the atom so the label
        # is a function of the feature (a second random draw here would make
        # the target unrelated to the input).
        atom_features.append(val)
      # One-element list per molecule -> label array with one column.
      y.append([np.sum(atom_features)])

    featurizer = ConvMolFeaturizer(atom_properties=[atom_feature_name])
    X = featurizer.featurize(dataset.X)
    dataset = dc.data.NumpyDataset(X, np.array(y))
    batch_size = 50
    model = GraphConvModel(
        len(tasks),
        number_atom_features=featurizer.feature_length(),
        batch_size=batch_size,
        mode='regression')

    model.fit(dataset, nb_epoch=1)
    y_pred1 = model.predict(dataset)
    model.save()

    model2 = TensorGraph.load_from_dir(model.model_dir)
    y_pred2 = model2.predict(dataset)
    self.assertTrue(np.allclose(y_pred1, y_pred2))
Exemplo n.º 4
0
def test_graph_conv_model_no_task():
    """Check that a trained GraphConvModel can predict on a task-less dataset.

    A classification model is fitted for 20 epochs on the helper dataset,
    then asked to predict on the BACE CSV featurized with an empty task list.
    The test passes if ``predict`` completes without raising.
    """
    tasks, dataset, _, __ = get_dataset('classification', 'GraphConv')
    model = GraphConvModel(len(tasks),
                           batch_size=10,
                           batch_normalize=False,
                           mode='classification')
    model.fit(dataset, nb_epoch=20)

    # Build a dataset with no y: loading with tasks=[] ensures there are no
    # label columns to read.
    csv_name = "bace_tmp.csv"
    dc.utils.data_utils.download_url(
        url="https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv",
        name=csv_name)
    csv_path = os.path.join(dc.utils.data_utils.get_data_dir(), csv_name)
    unlabeled_loader = dc.data.CSVLoader(
        tasks=[],
        smiles_field='mol',
        featurizer=dc.feat.ConvMolFeaturizer())
    unlabeled = unlabeled_loader.featurize(csv_path)

    model.predict(unlabeled)
Exemplo n.º 5
0
    def test_neural_fingerprint_retrieval(self):
        """Check the shape of the neural fingerprint output of GraphConvModel.

        A classification model with ``dense_layer_size=3`` is trained for one
        epoch, then ``predict`` is called with
        ``outputs=model.neural_fingerprint``. The result, truncated to the
        number of samples, must have shape ``(len(dataset), 2 * 3)``.
        """
        tasks, dataset, _, _ = self.get_dataset('classification', 'GraphConv')
        fp_size = 3

        model = GraphConvModel(len(tasks),
                               batch_size=50,
                               dense_layer_size=fp_size,
                               mode='classification')
        model.fit(dataset, nb_epoch=1)

        n_samples = len(dataset)
        raw_output = model.predict(dataset, outputs=model.neural_fingerprint)
        # Keep only the first n_samples rows
        # (presumably the output can be padded past the dataset -- confirm).
        fingerprints = np.array(raw_output)[:n_samples]

        expected_shape = (n_samples, fp_size * 2)
        self.assertEqual(expected_shape, fingerprints.shape)
Exemplo n.º 6
0
  def test_neural_fingerprint_retrieval(self):
    """Verify that neural fingerprints come back with the expected shape.

    Trains a classification GraphConvModel (``dense_layer_size=3``) for one
    epoch, requests ``model.neural_fingerprint`` as the prediction output and
    asserts that, after truncation to the dataset length, the array has shape
    ``(len(dataset), 6)``.
    """
    tasks, dataset, _, _ = self.get_dataset('classification', 'GraphConv')
    fp_size = 3

    model = GraphConvModel(
        len(tasks),
        batch_size=50,
        dense_layer_size=fp_size,
        mode='classification')
    model.fit(dataset, nb_epoch=1)

    n_samples = len(dataset)
    predicted = model.predict(dataset, outputs=model.neural_fingerprint)
    # Rows beyond n_samples are discarded before the shape check.
    fingerprints = np.array(predicted)[:n_samples]

    self.assertEqual((n_samples, fp_size * 2), fingerprints.shape)
Exemplo n.º 7
0
def test_graph_conv_model():
    """Train a GraphConv classifier on the COVID Mpro CSV and report scores.

    Steps:
      1. Featurize ``covid_mpro_combined_data_sources.csv`` (relative path)
         with ``ConvMolFeaturizer``, reading SMILES from the ``SMILES``
         column and the label from the ``isHit`` column.
      2. Randomly split into train/valid/test and fit the model on train.
      3. Predict on the validation split and draw a seaborn box plot of the
         ``pos`` prediction column grouped by the true label.
      4. Print the Matthews correlation coefficient and then the ROC AUC,
         each on the train and test splits.

    The model directory is fixed to ``/tmp/covid/model_dir``.
    """
    tasks = ["isHit"]
    batch_size = 2000
    model = GraphConvModel(len(tasks),
                           batch_size=batch_size,
                           mode="classification",
                           model_dir="/tmp/covid/model_dir")

    dataset_file = "covid_mpro_combined_data_sources.csv"
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=tasks,
                               smiles_field="SMILES",
                               featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)

    splitter = dc.splits.RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset)

    model.fit(train_dataset)

    # One flattened prediction row per validation sample; the two columns
    # are named "neg" and "pos".
    pred = [x.flatten() for x in model.predict(valid_dataset)]
    pred_df = pd.DataFrame(pred, columns=["neg", "pos"])
    # Flatten before casting: calling int() on each row of a 2-D label array
    # depends on the deprecated size-1-array-to-scalar conversion.
    pred_df["active"] = np.ravel(valid_dataset.y).astype(int)
    pred_df["SMILES"] = valid_dataset.ids

    # x/y are passed by keyword; recent seaborn releases no longer accept
    # them positionally.
    sns.boxplot(x=pred_df.active, y=pred_df.pos)

    # Same reporting for both metrics, in the original order (MCC, ROC AUC).
    for metric_fn in (dc.metrics.matthews_corrcoef, dc.metrics.roc_auc_score):
        metrics = [
            dc.metrics.Metric(metric_fn, np.mean, mode="classification")
        ]
        print(model.evaluate(train_dataset, metrics))
        print(model.evaluate(test_dataset, metrics))
Exemplo n.º 8
0
# Plot the per-epoch training history on two stacked subplots:
# ax[0] shows the loss, ax[1] shows the score.
#print(len(losses))
# NOTE(review): plt.subplots() below opens its own figure, so figure 1
# created here looks unused -- confirm before removing.
plt.figure(1)
fig, ax = plt.subplots(2, sharex='col', sharey='row')
x = range(num_epochs)
y_loss = losses_train
ax[0].plot(x, y_loss, c='b', alpha=0.6, label='loss_train')
ax[0].set(xlabel='epoch', ylabel='loss')

# NOTE(review): the plotted series is `score_train` but the legend label
# reads 'score_valid' -- confirm which one is intended.
y_score = score_train
ax[1].plot(x, y_score, c='r', alpha=0.6, label='score_valid')
ax[1].set(xlabel='epoch', ylabel='R2 score')

# Real target values and model predictions for the training set and the two
# test sets.
train_y = train_dataset.y
train_pred = model.predict(train_dataset)

test1_y = test1_dataset.y
test1_pred = model.predict(test1_dataset)

test2_y = test2_dataset.y
test2_pred = model.predict(test2_dataset)

# Model evaluation.

dcmodel = "DeepChem"
# Scores of Train Data
# Both the true values and the predictions are passed through
# dc.trans.undo_transforms before the error is computed.
# `transformers_train` is wrapped in a one-element list, so it is presumably
# a single transformer object -- confirm.
train_mae = mean_absolute_error(
    dc.trans.undo_transforms(train_y, [transformers_train]),
    dc.trans.undo_transforms(train_pred, [transformers_train]))
train_rmse = mean_squared_error(
Exemplo n.º 9
0
# Train the model (defined earlier in the script) on the current dataset.
model.fit(dataset)

#model = GraphConvModel(1, batch_size=128,mode="classification",model_dir="/tmp/mk01/model_dir")
#model.restore()
# Make predictions for the molecules listed in emol_10M.csv.
featurizer = dc.feat.ConvMolFeaturizer()
df = pd.read_csv("emol_10M.csv", sep=",")
#print('num rows in file',df.size)
#df.columns=["SMILES","Name"]

rows, cols = df.shape
df["Val"] = [
    0
] * rows  # dummy all-zero column so CSVLoader has a task column ('Val') to read
# Write the augmented table back out; the loader below reads this file.
infile_name = "emol_10M_withVal.csv"
df.to_csv(infile_name, index=False)
loader = dc.data.CSVLoader(tasks=['Val'],
                           smiles_field="isosmiles",
                           featurizer=featurizer)
dataset = loader.featurize(infile_name, shard_size=8192)
pred = model.predict(dataset)
# One flattened prediction row per sample; the two columns are named
# "Neg" and "Pos".
pred_df = pd.DataFrame([x.flatten() for x in pred], columns=["Neg", "Pos"])
sns.distplot(pred_df.Pos, rug=True)
# NOTE(review): the join aligns df and pred_df on their row index, which
# matches molecule to prediction only if featurization kept every input row
# in order -- confirm that no rows are dropped by the loader.
combo_df = df.join(pred_df, how="outer")
combo_df.sort_values("Pos", inplace=True, ascending=False)
#PandasTools.AddMoleculeColumnToFrame(combo_df,"isosmiles","Mol")

# Keep only rows whose "Pos" value is at least 0.8 and save them as hits.
combo_df = combo_df.loc[combo_df['Pos'] >= 0.8]
combo_df.to_csv('emol_10M_output_hits.csv', sep=',')