예제 #1
0
  def test_tf_classification_overfit(self):
    """Test that tensorflow models can overfit simple classification datasets."""
    n_samples = 10
    n_features = 3
    n_tasks = 1
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    tensorflow_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.0003, weight_init_stddevs=[.1],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(dataset, nb_epoch=100)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
예제 #2
0
  def test_tf_regression_overfit(self):
    """Test that TensorFlow models can overfit simple regression datasets."""
    n_samples = 10
    n_features = 3
    n_tasks = 1
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity)
    # TODO(rbharath): This breaks with optimizer="momentum". Why?
    tensorflow_model = TensorflowMultiTaskRegressor(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(dataset, nb_epoch=100)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] < .1
예제 #3
0
  def test_tf_multitask_regression_overfit(self):
    """Test tf multitask overfits tiny data."""
    n_tasks = 10
    n_samples = 10
    n_features = 3
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
  
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity,
                               task_averager=np.mean, mode="regression")
    tensorflow_model = TensorflowMultiTaskRegressor(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.0003, weight_init_stddevs=[.1],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(dataset, nb_epoch=50)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] < .1
예제 #4
0
    def test_tf_skewed_classification_overfit(self):
        """Test tensorflow models can overfit 0/1 datasets with few actives."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        #n_samples = 100
        n_samples = 100
        n_features = 3
        n_tasks = len(tasks)
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        p = .05
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.binomial(1, p, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1500],
            "dropouts": [.0],
            "learning_rate": 0.003,
            "momentum": .9,
            "batch_size": n_samples,
            "num_classification_tasks": 1,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [1.],
            "bias_init_consts": [1.],
            "nb_epoch": 200,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .8
예제 #5
0
    def test_tf_multitask_classification_overfit(self):
        """Test tf multitask overfits tiny data."""
        n_tasks = 10
        tasks = ["task%d" % task for task in range(n_tasks)]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        #y = np.random.randint(n_classes, size=(n_samples, n_tasks))
        y = np.zeros((n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1000],
            "dropouts": [.0],
            "learning_rate": 0.0003,
            "momentum": .9,
            "batch_size": n_samples,
            "num_classification_tasks": n_tasks,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [.1],
            "bias_init_consts": [1.],
            "nb_epoch": 100,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.accuracy_score,
                                       verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
예제 #6
0
    def test_tf_reload(self):
        """Test that tensorflow models can overfit simple classification datasets."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(n_classes, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1000],
            "dropouts": [0.0],
            "learning_rate": 0.003,
            "momentum": 0.9,
            "batch_size": n_samples,
            "num_classification_tasks": 1,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [1.0],
            "bias_init_consts": [1.0],
            "nb_epoch": 100,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape(),
        }

        verbosity = "high"
        classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
        model = TensorflowModel(
            tasks, task_types, model_params, self.model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity=verbosity
        )

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Load trained model
        reloaded_model = TensorflowModel(
            tasks, task_types, model_params, self.model_dir, tf_class=TensorflowMultiTaskClassifier, verbosity=verbosity
        )
        reloaded_model.reload()
        assert reloaded_model.eval_model._restored_model

        # Eval model on train
        transformers = []
        evaluator = Evaluator(reloaded_model, dataset, transformers, verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > 0.9
예제 #7
0
    def test_tf_regression_overfit(self):
        """Test that TensorFlow models can overfit simple regression datasets."""
        tasks = ["task0"]
        task_types = {task: "regression" for task in tasks}
        n_samples = 10
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.zeros((n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1000],
            "dropouts": [.0],
            "learning_rate": 0.003,
            "momentum": .9,
            "batch_size": n_samples,
            "num_regression_tasks": 1,
            "num_features": n_features,
            "weight_init_stddevs": [np.sqrt(6) / np.sqrt(1000)],
            "bias_init_consts": [1.],
            "nb_epoch": 100,
            "penalty": 0.0,
            "optimizer": "momentum",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.mean_squared_error,
                                   verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskRegressor,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] < .1
예제 #8
0
  def test_tf_skewed_classification_overfit(self):
    """Test tensorflow models can overfit 0/1 datasets with few actives."""
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    #n_samples = 100
    n_samples = 100
    n_features = 3
    n_tasks = len(tasks)
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "layer_sizes": [1500],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 200,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier,
        verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .8
예제 #9
0
  def test_tf_multitask_regression_overfit(self):
    """Test tf multitask overfits tiny data."""
    n_tasks = 10
    tasks = ["task%d" % task for task in range(n_tasks)]
    task_types = {task: "regression" for task in tasks}
    n_samples = 10
    n_features = 3
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    #y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "layer_sizes": [1000],
      "dropouts": [.0],
      "learning_rate": 0.0003,
      "momentum": .9,
      "batch_size": n_samples,
      "num_regression_tasks": n_tasks,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [.1],
      "bias_init_consts": [1.],
      "nb_epoch": 100,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskRegressor,
        verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .9
예제 #10
0
    def test_tf_reload(self):
        """Test that tensorflow models can overfit simple classification datasets."""
        n_samples = 10
        n_features = 3
        n_tasks = 1
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(n_classes, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = NumpyDataset(X, y, w, ids)

        verbosity = "high"
        classification_metric = Metric(metrics.accuracy_score,
                                       verbosity=verbosity)

        tensorflow_model = TensorflowMultiTaskClassifier(n_tasks,
                                                         n_features,
                                                         self.model_dir,
                                                         dropouts=[0.],
                                                         verbosity=verbosity)
        model = TensorflowModel(tensorflow_model, self.model_dir)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Load trained model
        reloaded_tensorflow_model = TensorflowMultiTaskClassifier(
            n_tasks,
            n_features,
            self.model_dir,
            dropouts=[0.],
            verbosity=verbosity)
        reloaded_model = TensorflowModel(reloaded_tensorflow_model,
                                         self.model_dir)
        reloaded_model.reload()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(reloaded_model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .6
예제 #11
0
  def test_tf_skewed_missing_classification_overfit(self):
    """TF, skewed data, few actives

    Test tensorflow models overfit 0/1 datasets with missing data and few
    actives. This is intended to be as close to singletask MUV datasets as
    possible.
    """
    n_samples = 5120
    n_features = 6
    n_tasks = 1
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    p = .002
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    y_flat, w_flat = np.squeeze(y), np.squeeze(w)
    y_nonzero = y_flat[w_flat != 0]
    num_nonzero = np.count_nonzero(y_nonzero)
    weight_nonzero = len(y_nonzero)/num_nonzero
    w_flat[y_flat != 0] = weight_nonzero
    w = np.reshape(w_flat, (n_samples, n_tasks))
  
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    tensorflow_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[1.],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(dataset, nb_epoch=50)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .8
예제 #12
0
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)

    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    tensorflow_model = TensorflowMultiTaskClassifier(
        len(tasks), n_features, self.model_dir)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
예제 #13
0
    def test_singletask_tf_mlp_ECFP_classification_API(self):
        """Straightforward test of Tensorflow singletask deepchem classification API."""
        splittype = "scaffold"
        output_transformers = []
        input_transformers = []
        task_type = "classification"

        featurizer = CircularFingerprint(size=1024)

        tasks = ["outcome"]
        task_type = "classification"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir,
                                  "example_classification.csv")

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers

        for dataset in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        model_params = {
            "batch_size": 2,
            "num_classification_tasks": 1,
            "num_features": 1024,
            "layer_sizes": [1024],
            "weight_init_stddevs": [1.],
            "bias_init_consts": [0.],
            "dropouts": [.5],
            "num_classes": 2,
            "nb_epoch": 1,
            "penalty": 0.0,
            "optimizer": "adam",
            "learning_rate": .001,
            "data_shape": train_dataset.get_data_shape()
        }
        classification_metrics = [
            Metric(metrics.roc_auc_score),
            Metric(metrics.matthews_corrcoef),
            Metric(metrics.recall_score),
            Metric(metrics.accuracy_score)
        ]

        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
예제 #14
0
train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score, verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[.1],
    batch_size=64, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)
예제 #15
0
    def test_tf_skewed_missing_classification_overfit(self):
        """TF, skewed data, few actives

    Test tensorflow models overfit 0/1 datasets with missing data and few
    actives. This is intended to be as close to singletask MUV datasets as
    possible.
    """

        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 5120
        n_features = 6
        n_tasks = len(tasks)
        n_classes = 2

        # Generate dummy dataset
        np.random.seed(123)
        p = .002
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.binomial(1, p, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))
        print("np.count_nonzero(y)")
        print(np.count_nonzero(y))
        ##### DEBUG
        y_flat, w_flat = np.squeeze(y), np.squeeze(w)
        y_nonzero = y_flat[w_flat != 0]
        num_nonzero = np.count_nonzero(y_nonzero)
        weight_nonzero = len(y_nonzero) / num_nonzero
        print("weight_nonzero")
        print(weight_nonzero)
        w_flat[y_flat != 0] = weight_nonzero
        w = np.reshape(w_flat, (n_samples, n_tasks))
        print("np.amin(w), np.amax(w)")
        print(np.amin(w), np.amax(w))
        ##### DEBUG

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "layer_sizes": [1200],
            "dropouts": [.0],
            "learning_rate": 0.003,
            "momentum": .9,
            "batch_size": 75,
            "num_classification_tasks": 1,
            "num_classes": n_classes,
            "num_features": n_features,
            "weight_init_stddevs": [1.],
            "bias_init_consts": [1.],
            "nb_epoch": 250,
            "penalty": 0.0,
            "optimizer": "adam",
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier,
                                verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .8
예제 #16
0
# Do train/valid split.
train_dataset, valid_dataset = tox21_datasets

# Fit models
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")

tensorflow_model = TensorflowMultiTaskClassifier(
    len(tox21_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[1.],
    batch_size=32, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers,
                            verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
예제 #17
0
  def test_tf_skewed_missing_classification_overfit(self):
    """TF, skewed data, few actives

    Test tensorflow models overfit 0/1 datasets with missing data and few
    actives. This is intended to be as close to singletask MUV datasets as
    possible.
    """
    
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples = 5120
    n_features = 6
    n_tasks = len(tasks)
    n_classes = 2
    
    # Generate dummy dataset
    np.random.seed(123)
    p = .002
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    y_flat, w_flat = np.squeeze(y), np.squeeze(w)
    y_nonzero = y_flat[w_flat != 0]
    num_nonzero = np.count_nonzero(y_nonzero)
    weight_nonzero = len(y_nonzero)/num_nonzero
    w_flat[y_flat != 0] = weight_nonzero
    w = np.reshape(w_flat, (n_samples, n_tasks))
  
    dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

    model_params = {
      "layer_sizes": [1200],
      "dropouts": [.0],
      "learning_rate": 0.003,
      "momentum": .9,
      "batch_size": 75,
      "num_classification_tasks": 1,
      "num_classes": n_classes,
      "num_features": n_features,
      "weight_init_stddevs": [1.],
      "bias_init_consts": [1.],
      "nb_epoch": 250,
      "penalty": 0.0,
      "optimizer": "adam",
      "data_shape": dataset.get_data_shape()
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier,
        verbosity=verbosity)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .8
예제 #18
0
train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score, verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[.1],
    batch_size=64, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)
예제 #19
0
    "num_classes": 2,
    "penalty": .0,
    "optimizer": "momentum",
    "learning_rate": .001,
    "momentum": .9,
}   


if os.path.exists(model_dir):
  shutil.rmtree(model_dir)
os.makedirs(model_dir)
model = TensorflowModel(pcba_tasks, pcba_task_types, params_dict, model_dir,
                        tf_class=TensorflowMultiTaskClassifier,
                        verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)
예제 #20
0
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"

    featurizer = CircularFingerprint(size=1024)

    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params = {
      "batch_size": 2,
      "num_classification_tasks": 1,
      "num_features": 1024,
      "layer_sizes": [1024],
      "weight_init_stddevs": [1.],
      "bias_init_consts": [0.],
      "dropouts": [.5],
      "num_classes": 2,
      "nb_epoch": 1,
      "penalty": 0.0,
      "optimizer": "adam",
      "learning_rate": .001,
      "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)