Example #1
    def test_set_optimizer(self):
        n_data_points = 20
        n_features = 2
        X = np.random.rand(n_data_points, n_features)
        y = [[0, 1] for x in range(n_data_points)]
        dataset = NumpyDataset(X, y)
        features = Feature(shape=(None, n_features))
        dense = Dense(out_channels=2, in_layers=[features])
        output = SoftMax(in_layers=[dense])
        label = Label(shape=(None, 2))
        smce = SoftMaxCrossEntropy(in_layers=[label, dense])
        loss = ReduceMean(in_layers=[smce])
        tg = dc.models.TensorGraph(learning_rate=0.01, use_queue=False)
        tg.add_output(output)
        tg.set_loss(loss)
        global_step = tg.get_global_step()
        learning_rate = ExponentialDecay(initial_rate=0.1,
                                         decay_rate=0.96,
                                         decay_steps=100000)
        tg.set_optimizer(GradientDescent(learning_rate=learning_rate))
        tg.fit(dataset, nb_epoch=1000)
        prediction = np.squeeze(tg.predict_on_batch(X))
        tg.save()

        tg1 = TensorGraph.load_from_dir(tg.model_dir)
        prediction2 = np.squeeze(tg1.predict_on_batch(X))
        assert_true(np.all(np.isclose(prediction, prediction2, atol=0.01)))
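For reference, a minimal sketch (plain Python, not DeepChem code) of the schedule that ExponentialDecay(initial_rate=0.1, decay_rate=0.96, decay_steps=100000) configures above, assuming the standard exponential-decay formula that the DeepChem optimizer wraps:

def decayed_lr(step, initial_rate=0.1, decay_rate=0.96, decay_steps=100000):
    # assumed formula: lr = initial_rate * decay_rate ** (step / decay_steps)
    return initial_rate * decay_rate ** (step / decay_steps)

for step in (0, 50000, 100000, 200000):
    print(step, decayed_lr(step))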
Example #2
    def test_ANI_multitask_regression_overfit(self):
        """Test ANI-1 regression overfits tiny data."""
        input_file = os.path.join(self.current_dir, "example_DTNN.mat")
        np.random.seed(123)
        tf.set_random_seed(123)
        dataset = scipy.io.loadmat(input_file)
        X = np.concatenate([np.expand_dims(dataset['Z'], 2), dataset['R']],
                           axis=2)
        X = X[:, :13, :]
        y = dataset['T']
        w = np.ones_like(y)
        dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)
        regression_metric = dc.metrics.Metric(dc.metrics.pearson_r2_score,
                                              mode="regression")
        n_tasks = y.shape[1]
        batch_size = 10

        transformers = [
            dc.trans.NormalizationTransformer(transform_y=True,
                                              dataset=dataset),
        ]

        for transformer in transformers:
            dataset = transformer.transform(dataset)

        model = dc.models.ANIRegression(n_tasks,
                                        13,
                                        atom_number_cases=[1, 6, 7, 8],
                                        batch_size=batch_size,
                                        learning_rate=ExponentialDecay(
                                            0.01, 0.7, 100),
                                        mode="regression")

        # Fit the model
        model.fit(dataset, nb_epoch=500)

        # Eval model on train
        scores = model.evaluate(dataset, [regression_metric],
                                transformers[0:1])

        assert scores[regression_metric.name] > .7
Example #3
train_smiles = train_dataset.ids

tokens = set()
for s in train_smiles:
  tokens = tokens.union(set(s))
tokens = sorted(list(tokens))
max_length = max(len(s) for s in train_smiles)

# training
from deepchem.models.tensorgraph.optimizers import Adam, ExponentialDecay
from deepchem.models.tensorgraph.models.seqtoseq import AspuruGuzikAutoEncoder
# the encoder is a CNN and the decoder is a GRU
model = AspuruGuzikAutoEncoder(tokens, max_length, model_dir='vae')

batches_per_epoch = len(train_smiles)/model.batch_size
learning_rate = ExponentialDecay(0.001, 0.95, batches_per_epoch)
model.set_optimizer(Adam(learning_rate=learning_rate))

def generate_sequences(epochs): 
  for i in range(epochs):
    for s in train_smiles: 
      yield (s, s)
model.summary()
model.fit_sequences(generate_sequences(1))

# check that the molecules are valid
import numpy as np
from rdkit import Chem
predictions = model.predict_from_embeddings(np.random.normal(size=(1000, 196)))
molecules = []
for p in predictions:
  # the excerpt is truncated here; a likely completion keeps only decoded
  # strings that RDKit can parse as valid molecules
  smiles = ''.join(p)
  if Chem.MolFromSmiles(smiles) is not None:
    molecules.append(smiles)
Example #4
tokens = sorted(list(tokens))

print(tokens[0:5])

max_length = max(len(s) for s in train_smiles)
model = dc.models.SeqToSeq(tokens,
                           tokens,
                           max_length,
                           encoder_layers=2,
                           decoder_layers=2,
                           embedding_dimension=256,
                           model_dir='fingerprint')
batches_per_epoch = len(train_smiles) / model.batch_size

model.set_optimizer(
    Adam(learning_rate=ExponentialDecay(0.004, 0.9, batches_per_epoch)))


def generate_sequences(epochs):
    for i in range(epochs):
        for s in train_smiles:
            yield (s, s)


model.fit_sequences(generate_sequences(40))

predicted = model.predict_from_sequences(valid_smiles[:500])
count = 0
for s, p in zip(valid_smiles[:500], predicted):
    if ''.join(p) == s:
        count += 1
Example #5
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.utils.evaluate import Evaluator
from deepchem.models import MultiTaskClassifier
from deepchem.models.tensorgraph.optimizers import ExponentialDecay

np.random.seed(123)

pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

metric = Metric(metrics.roc_auc_score, np.mean, mode="classification")

n_features = train_dataset.get_data_shape()[0]
rate = ExponentialDecay(0.001, 0.8, 1000)
model = MultiTaskClassifier(
    len(pcba_tasks),
    n_features,
    dropouts=[.25],
    learning_rate=rate,
    weight_init_stddevs=[.1],
    batch_size=64)

# Fit the model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance([metric])
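A likely continuation of this excerpt (a sketch, not shown in the original): evaluating the validation split with the same Evaluator API used above; valid_dataset and transformers come from the load_pcba() split earlier in the snippet.

valid_evaluator = Evaluator(model, valid_dataset, transformers)
valid_scores = valid_evaluator.compute_model_performance([metric])
print("Validation scores:", valid_scores)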
Example #6
# Fit models
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]

# Batch size of models
batch_size = 50
n_embedding = 30
n_distance = 51
distance_min = -1.
distance_max = 9.2
n_hidden = 15

rate = ExponentialDecay(0.0001, 0.97, 5000)
model = dc.models.DTNNModel(len(tasks),
                            n_embedding=n_embedding,
                            n_hidden=n_hidden,
                            n_distance=n_distance,
                            distance_min=distance_min,
                            distance_max=distance_max,
                            output_activation=False,
                            batch_size=batch_size,
                            learning_rate=rate,
                            use_queue=False,
                            mode="regression")
#model.restore()

# Fit the model
model.fit(train_dataset, nb_epoch=3000)
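The metric list defined at the top of this excerpt is never used in it; a hedged sketch of the evaluation step that typically follows, assuming test_dataset and a transformers list exist in the surrounding script (only train_dataset appears in the excerpt):

# sketch of the usual evaluation step; test_dataset and transformers are
# assumed to be defined by the surrounding script
train_scores = model.evaluate(train_dataset, metric, transformers)
test_scores = model.evaluate(test_dataset, metric, transformers)
print("Train scores:", train_scores)
print("Test scores:", test_scores)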
Example #7
tokens = set()
for s in train_smiles:
  tokens = tokens.union(set(c for c in s))
tokens = sorted(list(tokens))

print(tokens[0:5])

max_length = max(len(s) for s in train_smiles)
model = dc.models.SeqToSeq(tokens,
                           tokens,
                           max_length,
                           encoder_layers=2,
                           decoder_layers=2,
                           embedding_dimension=256,
                           model_dir='fingerprint')
batches_per_epoch = len(train_smiles)/model.batch_size

model.set_optimizer(Adam(learning_rate=ExponentialDecay(0.004, 0.9, batches_per_epoch)))

def generate_sequences(epochs):
  for i in range(epochs):
    for s in train_smiles:
      yield (s, s)

model.fit_sequences(generate_sequences(40))


predicted = model.predict_from_sequences(valid_smiles[:500])
count = 0
for s,p in zip(valid_smiles[:500], predicted):
  if ''.join(p) == s:
    count += 1
print('reproduced', count, 'of 500 validation SMILES strings')
Example #8
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument("--img_spec", default="std", help="Image specification to load")
    parser.add_argument("--early_stopping_epoch", default=10, type=int, help="Number of epochs to check early stopping after for")
    parser.add_argument("--use_augment", action='store_true', help="Whether to perform real-time augmentation")
    parser.add_argument("--base_filters", default=16, type=int, help="Number of base filters to use")
    parser.add_argument("--inception_per_block", default=3, type=int, help="Number of inception layers per block")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size used for training")
    parser.add_argument("--learning_rate", default=1e-4, type=float, help="Learning rate used.")
    parser.add_argument("--dataset", default="tox21", help="Dataset to train on.")

    args = parser.parse_args()

    layers_per_block = args.inception_per_block
    inception_blocks = {"A": layers_per_block, "B": layers_per_block, "C": layers_per_block}

    mode = get_task_mode(args.dataset)

    DIRNAME = os.path.join(os.environ.get("SCRATCH", "./"), "deepchem-data")
    load_fn = loaders[args.dataset]
    tasks, dataset, transformers = load_fn(featurizer="smiles2img", data_dir=DIRNAME, save_dir=DIRNAME, img_spec=args.img_spec, split="stratified")

    metric_type = metric_types[args.dataset]

    task_averager = np.mean
    if len(tasks) == 1:
        task_averager = None

    metric = dc.metrics.Metric(metric_type, task_averager=task_averager, mode=mode, verbose=False)
    train, valid, test = dataset

    # Setup directory for experiment
    exp_name = dt.now().strftime("%d-%m-%Y--%H-%M-%S")
    hparams_dir = "filters_{}_blocklayers_{}_imgspec_{}".format(args.base_filters, layers_per_block, args.img_spec)
    model_dir_1 = os.path.join(DIRNAME, args.dataset, "chemception", hparams_dir, exp_name)

    # Optimizer and logging
    optimizer = RMSProp(learning_rate=args.learning_rate)

    logger.info("Dataset used: {}".format(args.dataset))
    logger.info("Args used: {}".format(args))
    logger.info("Num_tasks: {}".format(len(tasks)))

    ###### TRAINING FIRST PART WITH CONSTANT LEARNING RATE ###############
    model = ChemCeption(n_tasks=len(tasks), img_spec=args.img_spec,
                        inception_blocks=inception_blocks,
                        base_filters=args.base_filters, augment=args.use_augment,
                        model_dir=model_dir_1, mode=mode,
                        n_classes=2, batch_size=args.batch_size,
                        optimizer=optimizer, tensorboard=True,
                        tensorboard_log_frequency=100)
    model._ensure_built()

    train, valid, test = dataset

    logger.info("Created model dir at {}".format(model_dir_1))
    best_models_dir_1 = os.path.join(DIRNAME, args.dataset, "chemception", hparams_dir, "best-models", exp_name)
    logger.info("Saving best model so far")
    model.save_checkpoint(model_dir=best_models_dir_1)

    loss_old = compute_loss_on_valid(valid, model, tasks, mode=mode)

    train_scores = model.evaluate(train, [metric], [])
    valid_scores = model.evaluate(valid, [metric], [])
    test_scores = model.evaluate(test, [metric], [])

    logger.info("Train-{}: {}".format(metric.name, train_scores[metric.name]))
    logger.info("Valid-{}: {}".format(metric.name, valid_scores[metric.name]))
    logger.info("Test-{}: {}".format(metric.name, test_scores[metric.name]))

    for rep_num in range(2):
        logger.info("Training model for {} epochs.".format(args.early_stopping_epoch))
        model.fit(train, nb_epoch=args.early_stopping_epoch, checkpoint_interval=0)
        loss_new = compute_loss_on_valid(valid, model, tasks, mode=mode, verbose=False)

        train_scores = model.evaluate(train, [metric], [])
        valid_scores = model.evaluate(valid, [metric], [])
        test_scores = model.evaluate(test, [metric], [])

        logger.info("Train-{}: {}".format(metric.name, train_scores[metric.name]))
        logger.info("Valid-{}: {}".format(metric.name, valid_scores[metric.name]))
        logger.info("Test-{}: {}".format(metric.name, test_scores[metric.name]))

        logger.info("Computed loss on validation set after {} epochs: {}".format(args.early_stopping_epoch, loss_new))
        if loss_new > loss_old:
            logger.info("No improvement in validation loss. Enforcing early stopping.")
            break

        logger.info("Saving best model so far")
        model.save_checkpoint(model_dir=best_models_dir_1)
        loss_old = loss_new

    ###### TRAINING SECOND PART WITH DECAYING LEARNING RATE ###############

    # Optimizer and logging
    decay_steps = args.early_stopping_epoch * train.y.shape[0] // args.batch_size
    logger.info("Decay steps: {}".format(decay_steps))

    lr = ExponentialDecay(initial_rate=args.learning_rate, decay_rate=0.92, decay_steps=decay_steps, staircase=True)
    optimizer = RMSProp(learning_rate=lr)

    # Setup directory for experiment
    exp_name = dt.now().strftime("%d-%m-%Y--%H-%M-%S")
    hparams_dir = "filters_{}_blocklayers_{}_imgspec_{}".format(args.base_filters, layers_per_block, args.img_spec)

    new_model = ChemCeption(n_tasks=len(tasks), img_spec=args.img_spec,
                            inception_blocks=inception_blocks,
                            base_filters=args.base_filters, augment=args.use_augment,
                            model_dir=model_dir_1, mode=mode,
                            n_classes=2, batch_size=args.batch_size,
                            optimizer=optimizer, tensorboard=True,
                            tensorboard_log_frequency=100)
    new_model.restore(model_dir=best_models_dir_1)

    best_models_dir_2 = os.path.join(DIRNAME, args.dataset, "chemception", hparams_dir, "best-models", exp_name)
    logger.info("Created best model dir for second stage at {}".format(best_models_dir_2))

    loss_old = compute_loss_on_valid(valid, new_model, tasks, mode=mode)

    for rep_num in range(2):
        logger.info("Training model for {} epochs.".format(args.early_stopping_epoch))
        new_model.fit(train, nb_epoch=args.early_stopping_epoch, checkpoint_interval=0)
        loss_new = compute_loss_on_valid(valid, new_model, tasks, mode=mode, verbose=False)

        train_scores = new_model.evaluate(train, [metric], [])
        valid_scores = new_model.evaluate(valid, [metric], [])
        test_scores = new_model.evaluate(test, [metric], [])

        logger.info("Train-{}: {}".format(metric.name, train_scores[metric.name]))
        logger.info("Valid-{}: {}".format(metric.name, valid_scores[metric.name]))
        logger.info("Test-{}: {}".format(metric.name, test_scores[metric.name]))

        logger.info("Computed loss on validation set after {} epochs: {}".format(args.early_stopping_epoch, loss_new))
        if loss_new > loss_old:
            logger.info("No improvement in validation loss. Enforcing early stopping.")
            break

        logger.info("Saving best model so far")
        new_model.save_checkpoint(model_dir=best_models_dir_2)
        loss_old = loss_new
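For reference, a minimal sketch (an assumption about the schedule, not DeepChem internals) of what staircase=True changes in this example: the exponent is floored, so the learning rate stays constant within each decay_steps window and drops by decay_rate at the window boundaries.

import math

def staircase_lr(step, initial_rate, decay_rate, decay_steps):
    # assumed formula, matching tf.train.exponential_decay with staircase=True
    return initial_rate * decay_rate ** math.floor(step / decay_steps)

for step in (0, 500, 1000, 1500):
    # illustrative values only; decay_steps is set to 500 just for this demo
    print(step, staircase_lr(step, 1e-4, 0.92, 500))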