Example #1
import deepchem as dc
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor


# `load_dataset` and `BATCH_SIZE` are defined elsewhere in the source module.
def experiment(dataset_file, method='GraphConv', split='scaffold'):
    featurizer = 'ECFP'
    if method == 'GraphConv':
        featurizer = 'GraphConv'
    tasks, datasets, transformers = load_dataset(dataset_file,
                                                 featurizer=featurizer,
                                                 split=split)
    train, val, test = datasets

    model = None
    if method == 'GraphConv':
        model = GraphConvModel(len(tasks),
                               batch_size=BATCH_SIZE,
                               mode="regression")
    elif method == 'RF':

        def model_builder_rf(model_dir):
            sklearn_model = RandomForestRegressor(n_estimators=100)
            return dc.models.SklearnModel(sklearn_model, model_dir)

        model = dc.models.SingletaskToMultitask(tasks, model_builder_rf)
    elif method == 'SVR':

        def model_builder_svr(model_dir):
            sklearn_model = svm.SVR(kernel='linear')
            return dc.models.SklearnModel(sklearn_model, model_dir)

        model = dc.models.SingletaskToMultitask(tasks, model_builder_svr)

    return model, train, val, test, transformers
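A typical call, where the dataset path and method are illustrative assumptions rather than part of the original snippet:

model, train, val, test, transformers = experiment('delaney.csv',
                                                   method='RF',
                                                   split='scaffold')
model.fit(train)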
Example #2
def gc_model_builder(model_params, model_dir):
    # Pass the supplied model_dir through instead of hard-coding it.
    gc_model = GraphConvModel(**model_params, model_dir=model_dir)
    return gc_model
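A builder like this is usually handed to a hyperparameter search or called directly; a minimal sketch in which the parameter values are illustrative assumptions:

model_params = {'n_tasks': 1, 'mode': 'regression', 'batch_size': 50}
model = gc_model_builder(model_params, model_dir='./models')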
Example #3
import json
import os
from typing import List

import numpy as np
import tensorflow as tf

np.random.seed(123)
tf.set_random_seed(123)

import deepchem as dc
from deepchem.data.datasets import NumpyDataset
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

model_dir = os.path.join(os.path.dirname(__file__), "..", "model")

# Create the featurizer and transformer
with open(os.path.join(model_dir, '..', 'tasks.json'), 'r') as fp:
    tasks = json.load(fp)

# Batch size of models
model = GraphConvModel(12,
                       mode='classification',
                       model_dir=model_dir,
                       batch_size=128)
model.restore()


# Make the inference functions
def invoke_model(feats: np.ndarray, smiles: List[str]) -> List[dict]:
    """Invoke the model.

    Args:
        feats (np.ndarray): Features for the model
        smiles (List[str]): SMILES strings matching the feature rows
    Returns:
        List[dict]: One prediction record per molecule
    """
    # Turn the features into a Numpy dataset
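    # The excerpt is cut off here; a plausible completion (the layout of
    # the returned records is an assumption, not the original code):
    dataset = NumpyDataset(feats)
    probs = model.predict(dataset)
    return [{'smiles': s, 'prediction': p.tolist()}
            for s, p in zip(smiles, probs)]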
Example #4

import warnings
warnings.filterwarnings('ignore')

import deepchem as dc
#from deepchem.models.tensorgraph.models.graph_models import MPNNTensorGraph
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel
#from deepchem.feat import WeaveFeaturizer
from deepchem.feat.graph_features import ConvMolFeaturizer
from deepchem.feat.graph_features import WeaveFeaturizer
from deepchem.data.data_loader import CSVLoader

import pandas as pd
import numpy as np

featurizer = ConvMolFeaturizer()
#featurizer = WeaveFeaturizer(graph_distance=True, explicit_H=False)
train_loader = CSVLoader(tasks=['LogD7.4'],
                         smiles_field='smiles',
                         featurizer=featurizer)
test_loader = CSVLoader(tasks=['LogD7.4'],
                        smiles_field='smiles',
                        featurizer=featurizer)

X_train = train_loader.featurize('../demo_data/reg/training_set.csv')
X_test = test_loader.featurize('../demo_data/reg/testing_set.csv')

model = GraphConvModel(n_tasks=1, mode='regression')
model.fit(X_train)
print(model.predict(X_test))
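To quantify the fit, the model can be scored with a DeepChem metric; a minimal sketch using the objects defined above:

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print('Train R2:', model.evaluate(X_train, [metric]))
print('Test R2:', model.evaluate(X_test, [metric]))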
Example #5
import numpy as np

import deepchem as dc
from deepchem.molnet import load_tox21
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

# Load Tox21 dataset
tox21_tasks, tox21_datasets, transformers = load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = tox21_datasets
print(train_dataset.data_dir)
print(valid_dataset.data_dir)

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                           np.mean,
                           mode="classification")

# Batch size of models
batch_size = 50

model = GraphConvModel(len(tox21_tasks),
                       batch_size=batch_size,
                       mode='classification')

model.fit(train_dataset, nb_epoch=10)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)

print("Train scores")
print(train_scores)

print("Validation scores")
print(valid_scores)
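The excerpt stops at the validation split; scoring the held-out test split follows the same pattern:

test_scores = model.evaluate(test_dataset, [metric], transformers)
print("Test scores")
print(test_scores)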
Example #6
# n_pos, n_neg, n_trials and metric are defined earlier in the source script.
support_generator = dc.data.SupportGenerator(test_dataset, n_pos, n_neg,
                                             n_trials)

# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}

for trial_num, (task, support) in enumerate(support_generator):
    print("Starting trial %d" % trial_num)

    # Number of features on conv-mols
    n_feat = 75
    # Batch size of models
    batch_size = 50
    #graph_model = dc.nn.SequentialGraph(n_feat)
    model = GraphConvModel(1,
                           graph_conv_layers=[64, 128, 64],
                           batch_size=batch_size)
    # Fit model on the support set
    model.fit(support, nb_epoch=10)

    # Test model on the task's remaining examples
    task_dataset = dc.data.get_task_dataset_minus_support(
        test_dataset, support, task)
    y_pred = model.predict(task_dataset)
    score = metric.compute_metric(task_dataset.y, y_pred, task_dataset.w)
    print("Score on task %s is %s" % (str(task), str(score)))
    task_scores[task].append(score)

# Join information for all tasks.
mean_task_scores = {}
std_task_scores = {}
for task in range(len(test_dataset.get_task_names())):
    mean_task_scores[task] = np.mean(np.array(task_scores[task]))
    std_task_scores[task] = np.std(np.array(task_scores[task]))
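The per-task statistics can then be reported; a short sketch using the dictionaries built above:

for task in range(len(test_dataset.get_task_names())):
    print("Task %d: %.3f +/- %.3f" %
          (task, mean_task_scores[task], std_task_scores[task]))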
Example #8
def graph_conv_training():
    # Tkinter callback: the t_* variables, canvas, frame1 and module-level
    # imports (dc, np, pd, random, ImageTk) are defined elsewhere in the app.
    graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()

    loader = dc.data.data_loader.CSVLoader(tasks=[t_task.get()],
                                           smiles_field=t_smiles.get(),
                                           id_field=t_id.get(),
                                           featurizer=graph_featurizer)
    dataset = loader.featurize(t_csv.get())

    splitter = dc.splits.splitters.RandomSplitter()
    trainset, testset = splitter.train_test_split(dataset)

    hp = dc.molnet.preset_hyper_parameters
    param = hp.hps['graphconvreg']
    print(param)

    batch_size = 48

    model = dc.models.GraphConvTensorGraph(1,
                                           batch_size=batch_size,
                                           learning_rate=1e-3,
                                           use_queue=False,
                                           mode='regression',
                                           model_dir=t_savename.get())

    np.random.seed(1)
    random.seed(1)

    # Fit on the training split so the test score below is meaningful.
    model.fit(trainset, nb_epoch=max(1, int(t_epochs.get())))
    #model.fit(dataset, nb_epoch=max(1, int(t_epochs.get())))

    metric = dc.metrics.Metric(dc.metrics.r2_score)

    print('epoch: ', t_epochs.get())
    print("Evaluating model")
    train_score = model.evaluate(trainset, [metric])
    test_score = model.evaluate(testset, [metric])

    model.save()

    pred_train = model.predict(trainset)
    pred_test = model.predict(testset)

    y_train = np.array(trainset.y, dtype=np.float32)
    y_test = np.array(testset.y, dtype=np.float32)

    import matplotlib.pyplot as plt

    plt.figure(figsize=(5, 5))

    plt.scatter(y_train, pred_train, label='Train', c='blue')
    plt.title('Graph Convolution')
    plt.xlabel('Measured value')
    plt.ylabel('Predicted value')
    plt.scatter(y_test, pred_test, c='lightgreen', label='Test', alpha=0.8)
    plt.legend(loc=4)
    #plt.show()
    plt.savefig('score-tmp.png')

    from PIL import Image
    img = Image.open('score-tmp.png')

    img_resize = img.resize((400, 400), Image.LANCZOS)
    img_resize.save('score-tmp.png')

    global image_score
    image_score_open = Image.open('score-tmp.png')
    image_score = ImageTk.PhotoImage(image_score_open, master=frame1)

    canvas.create_image(200, 200, image=image_score)

    #Calculate R2 score
    print("Train score")
    print(train_score)
    t_train_r2.set(train_score)

    print("Test scores")
    print(test_score)
    t_test_r2.set(test_score)

    #Calculate RMSE
    train_rmse = 1
    test_rmse = 1
    '''
    print("Train RMSE")
    print(train_rmse)
    t_train_rmse.set(train_rmse)

    print("Test RMSE")
    print(test_rmse)
    t_test_rmse.set(test_rmse)
    '''

    df_save = pd.DataFrame({
        'pred_train': pred_train.ravel(),
        'meas_train': y_train.ravel()
    })

    df_save.to_csv('pred_and_meas_train.csv')

    print('finish!')
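The RMSE placeholders above (train_rmse = test_rmse = 1) could be replaced, inside the function, with values computed from the arrays already in scope; a minimal sketch:

train_rmse = np.sqrt(np.mean((pred_train.ravel() - y_train.ravel()) ** 2))
test_rmse = np.sqrt(np.mean((pred_test.ravel() - y_test.ravel()) ** 2))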
Example #9
import json

import numpy as np
import tensorflow as tf

tf.set_random_seed(123)
import deepchem as dc
from deepchem.molnet import load_tox21
from deepchem.models.tensorgraph.models.graph_models import GraphConvModel

model_dir = "model"

# Load Tox21 dataset
tox21_tasks, tox21_datasets, transformers = load_tox21(featurizer='GraphConv')
with open('tasks.json', 'w') as fp:
    json.dump(tox21_tasks, fp)
train_dataset, valid_dataset, test_dataset = tox21_datasets

# Fit models
metric = dc.metrics.Metric(dc.metrics.roc_auc_score,
                           np.mean,
                           mode="classification")

# Batch size of models
batch_size = 50

model = GraphConvModel(len(tox21_tasks),
                       batch_size=batch_size,
                       mode='classification',
                       model_dir=model_dir)

model.fit(train_dataset, nb_epoch=50)

model.save()
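Example #3 above shows the matching inference side; a minimal sketch reloading this trained model in a fresh process:

model = GraphConvModel(len(tox21_tasks),
                       batch_size=batch_size,
                       mode='classification',
                       model_dir=model_dir)
model.restore()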
Example #10
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Validation ROC-AUC Score: %f" % valid_scores["mean-roc_auc_score"])

'''

###########################################################################################
# Load HIV dataset
hiv_tasks, hiv_datasets, transformers = dc.molnet.load_hiv(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = hiv_datasets
model = GraphConvModel(
    len(hiv_tasks), batch_size=70, mode='classification')
# Set nb_epoch=10 for better results.
model.fit(train_dataset, nb_epoch=1)
metric = dc.metrics.Metric(
    dc.metrics.roc_auc_score, np.mean, mode="classification")

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Validation ROC-AUC Score: %f" % valid_scores["mean-roc_auc_score"])

'''
############################################################################################
# Load SAMPL(FreeSolv) dataset
SAMPL_tasks, SAMPL_datasets, transformers = dc.molnet.load_sampl(
Example #11
valid_dataset = loader.featurize('../data/dw_acidic_unique_valid.csv')
test_dataset = loader.featurize('../data/dw_acidic_unique_test.csv')

transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]

# Apply the transformers, keeping the transformed datasets.
datasets = []
for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    datasets.append(dataset)
train_dataset, valid_dataset, test_dataset = datasets

# print('shape of the dataset')
# print(train_dataset.X.shape)
# print(train_dataset.X[0].get_atom_features())
# print(train_dataset.X[0].get_atom_features().shape)

model = GraphConvModel.load_from_dir('models')
model.restore()

train_scores = model.evaluate(
    train_dataset,
    [dc.metrics.Metric(dc.metrics.rms_score),
     dc.metrics.Metric(dc.metrics.r2_score),
     dc.metrics.Metric(dc.metrics.mae_score)])
print('train scores')
print(train_scores)

valid_scores = model.evaluate(
    valid_dataset,
    [dc.metrics.Metric(dc.metrics.rms_score),
     dc.metrics.Metric(dc.metrics.r2_score),
     dc.metrics.Metric(dc.metrics.mae_score)])
print('valid scores')
print(valid_scores)
Example #12
test_dataset = loader.featurize('../data/dw_acidic_unique_test.csv')

# splitter = dc.splits.RandomSplitter()
# train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset, seed=42)

transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]

# Apply the transformers, keeping the transformed datasets.
datasets = []
for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    datasets.append(dataset)
train_dataset, valid_dataset, test_dataset = datasets

model = GraphConvModel(n_tasks=1,
                       mode='regression',
                       tensorboard=True,
                       model_dir='models/',
                       dropout=0.5,
                       graph_conv_layers=[64, 64])

# Hackish loop to track the validation loss while fitting with DeepChem;
# this is how overfitting is monitored here.
valid_loss = 10000000
while valid_loss > 50:
    # Fit the model for one epoch on the training data
    model.fit(train_dataset, nb_epoch=1)
    # checkpoint_interval=0 keeps this pass from saving a checkpoint;
    # fit() returns the average loss, used here as the validation loss.
    valid_loss = model.fit(valid_dataset, checkpoint_interval=0)
    print("valid loss: ", valid_loss)
    # This will restore the model to the fit from the train dataset
    model.restore()
    model.save()
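A less hackish alternative is an explicit early-stopping loop that only ever fits on the training split; a minimal sketch, where the patience of 5 epochs and the rms_score metric are illustrative assumptions:

metric = dc.metrics.Metric(dc.metrics.rms_score)
best_rms, patience, stale = float('inf'), 5, 0
while stale < patience:
    model.fit(train_dataset, nb_epoch=1)
    valid_rms = model.evaluate(valid_dataset, [metric])['rms_score']
    print("valid RMS:", valid_rms)
    if valid_rms < best_rms:
        best_rms, stale = valid_rms, 0
        model.save()
    else:
        stale += 1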
Example #13
    # `ros` (an over-sampler with a fit_resample API, e.g. from imblearn),
    # `X_embeddings` and `labels` are defined earlier in the source.
    X_oversampled, y_oversampled = ros.fit_resample(
        np.atleast_2d(X_embeddings[0]).T, labels)

    # Store the oversampled data as a NumpyDataset and split it
    from deepchem.data.datasets import NumpyDataset

    dataset = NumpyDataset(np.squeeze(X_oversampled), y_oversampled)
    splitter = dc.splits.splitters.RandomSplitter()
    trainset, testset = splitter.train_test_split(dataset)
    test_classifier = GraphConvModel(1,
                                     graph_conv_layers=[64, 64],
                                     dense_layer_size=128,
                                     dropout=0.5,
                                     model_dir='models',
                                     mode='classification',
                                     number_atom_features=75,
                                     n_classes=2,
                                     uncertainty=False,
                                     use_queue=False,
                                     tensorboard=True)
    test_classifier.fit(trainset, nb_epoch=10)
    dnn_preds = test_classifier.predict(testset)
    break
#    hp = dc.molnet.preset_hyper_parameters
#    param = hp.hps[ 'graphconvreg' ]
#    print(param['batch_size'])
#    g = tf.Graph()
#    graph_model = dc.nn.SequentialGraph( 75 )
#    graph_model.add( dc.nn.GraphConv( int(param['n_filters']), 75, activation='relu' ))
#    graph_model.add( dc.nn.BatchNormalization( epsilon=1e-5, mode=1 ))
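For a classification model, predict() returns per-class probabilities; a minimal sketch turning the predictions above into hard labels and an accuracy (the output shape noted in the comment is an assumption about this DeepChem version):

# dnn_preds is assumed to have shape (n_samples, n_tasks, n_classes);
# take the argmax over the class axis for the single task.
pred_labels = np.argmax(dnn_preds, axis=-1)[:, 0]
accuracy = np.mean(pred_labels == testset.y.ravel())
print("test accuracy: %.3f" % accuracy)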
Example #14
graph_featurizer = dc.feat.graph_features.ConvMolFeaturizer()
loader_train = dc.data.data_loader.CSVLoader(tasks=['ACTIVITY'],
                                             smiles_field="smiles",
                                             featurizer=graph_featurizer)
dataset_train = loader_train.featurize('./train.csv')

# In[3]:

loader_test = dc.data.data_loader.CSVLoader(tasks=['ACTIVITY'],
                                            smiles_field="smiles",
                                            featurizer=graph_featurizer)
dataset_test = loader_test.featurize('./test.csv')

# In[9]:

model = GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)

model.fit(dataset_train, nb_epoch=1000)

# In[10]:

metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

print(model.evaluate(dataset_train, [metric]))

print(model.evaluate(dataset_test, [metric]))

# In[11]:

test_preds = model.predict(dataset_test)
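A parity plot in the style of Example #8 is a natural final cell; a minimal sketch using the objects above:

import matplotlib.pyplot as plt

plt.figure(figsize=(5, 5))
plt.scatter(dataset_test.y, test_preds, c='lightgreen', alpha=0.8)
plt.xlabel('Measured value')
plt.ylabel('Predicted value')
plt.title('Graph Convolution')
plt.savefig('test_parity.png')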