Example #1
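These snippets come from the test suite of an early, pre-1.0 DeepChem release (Keras 1 on TensorFlow 1) and omit their imports. A minimal preamble they all assume might look like the sketch below; module paths shifted across early releases, so treat every path as an assumption except MultiTaskDNN's, which Example #7 confirms.

import os
import shutil

import numpy as np
import tensorflow as tf
from keras import backend as K

from deepchem import metrics                               # assumed path
from deepchem.metrics import Metric                        # assumed path
from deepchem.datasets import Dataset                      # assumed path
from deepchem.featurizers.fingerprints import CircularFingerprint  # assumed
from deepchem.featurizers.featurize import DataLoader      # assumed path
from deepchem.splits import ScaffoldSplitter               # assumed path
from deepchem.utils.evaluate import Evaluator              # assumed path
from deepchem.models.keras_models.fcnet import MultiTaskDNN  # per Example #7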
    def test_keras_skewed_classification_overfit(self):
        """Test keras models can overfit 0/1 datasets with few actives."""
        tasks = ["task0"]
        task_types = {task: "classification" for task in tasks}
        n_samples = 100
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        np.random.seed(123)
        p = .05
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.binomial(1, p, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "nb_hidden": 1000,
            "activation": "relu",
            "dropout": .0,
            "learning_rate": .15,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": n_samples,
            "nb_epoch": 200,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        model = MultiTaskDNN(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
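The assertion above is only meaningful because positives are rare. A standalone sanity check of that skew (not part of the source test, just a sketch):

# Verify the label skew the test depends on: with p = .05 roughly five of
# the 100 labels come out positive, so the classes are heavily imbalanced.
import numpy as np

np.random.seed(123)
y = np.random.binomial(1, .05, size=(100, 1))
print(int(y.sum()), "actives out of 100")  # roc_auc_score needs >= 1 active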
Example #2
    def test_keras_reload(self):
        """Test that trained keras models can be reloaded correctly."""
        g = tf.Graph()
        sess = tf.Session(graph=g)
        K.set_session(sess)
        with g.as_default():
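            # Build in a private graph/session so state from other tests
            # cannot leak in (TF1-era Keras idiom).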
            tasks = ["task0"]
            task_types = {task: "classification" for task in tasks}
            n_samples = 10
            n_features = 3
            n_tasks = len(tasks)

            # Generate dummy dataset
            np.random.seed(123)
            ids = np.arange(n_samples)
            X = np.random.rand(n_samples, n_features)
            y = np.random.randint(2, size=(n_samples, n_tasks))
            w = np.ones((n_samples, n_tasks))

            dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

            model_params = {
                "nb_hidden": 1000,
                "activation": "relu",
                "dropout": 0.0,
                "learning_rate": 0.15,
                "momentum": 0.9,
                "nesterov": False,
                "decay": 1e-4,
                "batch_size": n_samples,
                "nb_epoch": 200,
                "init": "glorot_uniform",
                "nb_layers": 1,
                "batchnorm": False,
                "data_shape": dataset.get_data_shape(),
            }

            verbosity = "high"
            classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
            model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir, verbosity=verbosity)

            # Fit trained model
            model.fit(dataset)
            model.save()

            # Load trained model
            reloaded_model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir, verbosity=verbosity)
            reloaded_model.reload()
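            # reload() restores the weights written by model.save(); the new
            # model must be constructed with the same model_params.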

            # Eval model on train
            transformers = []
            evaluator = Evaluator(reloaded_model, dataset, transformers, verbosity=verbosity)
            scores = evaluator.compute_model_performance([classification_metric])

            assert scores[classification_metric.name] > 0.9
Example #3
  def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      task_type = "classification"
      # TODO(rbharath): There should be some automatic check to ensure that all
      # required model_params are specified.
      # TODO(rbharath): Turning off dropout to make tests behave.
      model_params = {"nb_hidden": 10, "activation": "relu",
                      "dropout": .0, "learning_rate": .01,
                      "momentum": .9, "nesterov": False,
                      "decay": 1e-4, "batch_size": 5,
                      "nb_epoch": 2, "init": "glorot_uniform",
                      "nb_layers": 1, "batchnorm": False}

      input_file = os.path.join(self.current_dir, "multitask_example.csv")
      tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
               "task7", "task8", "task9", "task10", "task11", "task12",
               "task13", "task14", "task15", "task16"]
      task_types = {task: task_type for task in tasks}

      featurizer = CircularFingerprint(size=1024)

      loader = DataLoader(tasks=tasks,
                          smiles_field=self.smiles_field,
                          featurizer=featurizer,
                          verbosity="low")
      dataset = loader.featurize(input_file, self.data_dir)
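      # ScaffoldSplitter partitions compounds by Bemis-Murcko scaffold, so
      # train and test sets share no core ring systems.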
      splitter = ScaffoldSplitter()
      train_dataset, test_dataset = splitter.train_test_split(
          dataset, self.train_dir, self.test_dir)

      transformers = []
      model_params["data_shape"] = train_dataset.get_data_shape()
      classification_metrics = [Metric(metrics.roc_auc_score),
                                Metric(metrics.matthews_corrcoef),
                                Metric(metrics.recall_score),
                                Metric(metrics.accuracy_score)]
      
      model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

      # Fit trained model
      model.fit(train_dataset)
      model.save()

      # Eval model on train
      evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)

      # Eval model on test
      evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)
Example #4
    def test_keras_multitask_regression_overfit(self):
        """Test keras multitask overfits tiny data."""
        n_tasks = 10
        tasks = ["task%d" % task for task in range(n_tasks)]
        task_types = {task: "regression" for task in tasks}
        n_samples = 10
        n_features = 3

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
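        # 0/1 targets still work for a regression overfit check: the model
        # only has to memorize 10 points to push R^2 above .9.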
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

        model_params = {
            "nb_hidden": 1000,
            "activation": "relu",
            "dropout": .0,
            "learning_rate": .15,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": n_samples,
            "nb_epoch": 200,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False,
            "data_shape": dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = MultiTaskDNN(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             verbosity=verbosity)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model,
                              dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] > .9
Example #5
  def test_keras_multitask_regression_overfit(self):
    """Test keras multitask overfits tiny data."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      n_tasks = 10
      tasks = ["task%d" % task for task in range(n_tasks)]
      task_types = {task: "regression" for task in tasks}
      n_samples = 10
      n_features = 3
      
      # Generate dummy dataset
      np.random.seed(123)
      ids = np.arange(n_samples)
      X = np.random.rand(n_samples, n_features)
      y = np.random.randint(2, size=(n_samples, n_tasks))
      w = np.ones((n_samples, n_tasks))
    
      dataset = Dataset.from_numpy(self.train_dir, X, y, w, ids, tasks)

      model_params = {
          "nb_hidden": 1000,
          "activation": "relu",
          "dropout": .0,
          "learning_rate": .15,
          "momentum": .9,
          "nesterov": False,
          "decay": 1e-4,
          "batch_size": n_samples,
          "nb_epoch": 200,
          "init": "glorot_uniform",
          "nb_layers": 1,
          "batchnorm": False,
          "data_shape": dataset.get_data_shape()
      }

      verbosity = "high"
      regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
      model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                           verbosity=verbosity)

      # Fit trained model
      model.fit(dataset)
      model.save()

      # Eval model on train
      transformers = []
      evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
      scores = evaluator.compute_model_performance([regression_metric])

      assert scores[regression_metric.name] > .9
Example #6
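# Snippet from a larger benchmark script (MUV, judging by the variable
# names): muv_tasks, muv_task_types, train_dataset, valid_dataset,
# transformers, classification_metric, verbosity and model_dir are all
# defined earlier in the source.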
    "batch_size": 64,
    "nb_epoch": 10,
    "init": "glorot_uniform",
    "nb_layers": 1,
    "batchnorm": False,
    "data_shape": train_dataset.get_data_shape()
}

if os.path.exists(model_dir):
  shutil.rmtree(model_dir)
os.makedirs(model_dir)

model = MultiTaskDNN(muv_tasks, muv_task_types, params_dict, model_dir,
                     verbosity=verbosity)

# Fit trained model
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([classification_metric])

print("Train scores")
print(train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])

print("Validation scores")
print(valid_scores)
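classification_metric is not defined in this excerpt; a plausible definition, following the pattern of Examples #1 and #2, would be:

classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)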
Example #7
    def test_multitask_keras_mlp_ECFP_classification_API(self):
        """Straightforward test of Keras multitask deepchem classification API."""
        from deepchem.models.keras_models.fcnet import MultiTaskDNN
        task_type = "classification"
        # TODO(rbharath): There should be some automatic check to ensure that all
        # required model_params are specified.
        model_params = {
            "nb_hidden": 10,
            "activation": "relu",
            "dropout": .5,
            "learning_rate": .01,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": 5,
            "nb_epoch": 2,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False
        }

        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        task_types = {task: task_type for task in tasks}

        featurizer = CircularFingerprint(size=1024)

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)
        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        transformers = []
        model_params["data_shape"] = train_dataset.get_data_shape()
        classification_metrics = [
            Metric(metrics.roc_auc_score),
            Metric(metrics.matthews_corrcoef),
            Metric(metrics.recall_score),
            Metric(metrics.accuracy_score)
        ]

        model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
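For completeness, the dictionaries returned by compute_model_performance are keyed by Metric.name, as the assertions in Examples #1, #2, #4 and #5 show. A minimal usage sketch, assuming a Metric's name defaults to the wrapped function's __name__:

scores = evaluator.compute_model_performance(classification_metrics)
print(scores["roc_auc_score"])  # key assumed to mirror metrics.roc_auc_score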