Example #1
  def test_drop(self):
    """Test on dataset where RDKit fails on some strings."""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    len_full = 25

    current_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(self.base_dir, "dataset")
    model_dir = os.path.join(self.base_dir, "model")

    print("About to load emols dataset.")
    dataset_file = os.path.join(
        current_dir, "mini_emols.csv")

    # Featurize emols dataset
    print("About to featurize datasets.")
    featurizer = CircularFingerprint(size=1024)
    emols_tasks = ['activity']

    loader = DataLoader(tasks=emols_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, debug=True, logging=False)

    X, y, w, ids = dataset.to_numpy()
    print("ids.shape, X.shape, y.shape, w.shape")
    print(ids.shape, X.shape, y.shape, w.shape)
    assert len(X) == len(y) == len(w) == len(ids)
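
A minimal sketch (not part of the original test) of the check that the otherwise unused len_full variable above seems to anticipate, assuming the loader silently drops rows whose SMILES strings RDKit cannot parse:

def check_dropped_rows(dataset, len_full=25):
  """Sketch: featurization should yield at most as many rows as the raw file."""
  # Rows RDKit fails to parse are assumed to be skipped during featurization,
  # so the featurized dataset can be strictly smaller than the raw CSV.
  assert len(dataset) <= len_full
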
Example #2
    def test_subset(self):
        """Tests that subsetting of datasets works."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "dataset")
        subset_dir = os.path.join(self.base_dir, "subset")

        dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

        shard_nums = [1, 2]

        orig_ids = dataset.get_ids()
        _, _, _, ids_1 = dataset.get_shard(1)
        _, _, _, ids_2 = dataset.get_shard(2)

        subset = dataset.subset(subset_dir, shard_nums)
        after_ids = dataset.get_ids()

        assert len(subset) == 4
        assert sorted(subset.get_ids()) == sorted(np.concatenate([ids_1, ids_2]))
        assert list(orig_ids) == list(after_ids)
Example #3
    def test_subset(self):
        """Tests that subsetting of datasets works."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "dataset")
        subset_dir = os.path.join(self.base_dir, "subset")

        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

        shard_nums = [1, 2]

        orig_ids = dataset.ids
        _, _, _, ids_1 = dataset.get_shard(1)
        _, _, _, ids_2 = dataset.get_shard(2)

        subset = dataset.subset(subset_dir, shard_nums)
        after_ids = dataset.ids

        assert len(subset) == 4
        assert sorted(subset.ids) == sorted(np.concatenate([ids_1, ids_2]))
        assert list(orig_ids) == list(after_ids)
Example #4
  def test_samples_move(self):
    """Test that featurized samples can be moved and reloaded."""
    verbosity = "high"
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(
        self.current_dir, "example.csv")

    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    featurized_dataset = loader.featurize(
        dataset_file, data_dir)
    n_dataset = len(featurized_dataset)
  
    # Now perform move
    shutil.move(data_dir, moved_data_dir)

    moved_featurized_dataset = Dataset(
        data_dir=moved_data_dir, reload=True)

    assert len(moved_featurized_dataset) == n_dataset
Example #5
  def random_test_train_valid_test_split(self):
    """Test of singletask RF ECFP regression API."""
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    featurizer = CircularFingerprint(size=1024)

    input_file = os.path.join(self.current_dir, input_file)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
Example #6
  def test_move_load(self):
    """Test that datasets can be moved and loaded."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")

    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir)

    X, y, w, ids = dataset.to_numpy()
    shutil.move(data_dir, moved_data_dir)

    moved_dataset = Dataset(
        moved_data_dir, reload=True)

    X_moved, y_moved, w_moved, ids_moved = moved_dataset.to_numpy()

    np.testing.assert_allclose(X, X_moved)
    np.testing.assert_allclose(y, y_moved)
    np.testing.assert_allclose(w, w_moved)
    np.testing.assert_array_equal(ids, ids_moved)
Example #7
    def random_test_train_valid_test_split_from_sdf(self):
        """Test of singletask CoulombMatrixEig regression on .sdf file."""
        splittype = "random"
        input_transforms = []
        output_transforms = ["normalize"]
        model_params = {}
        tasks = ["atomization_energy"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        current_dir = os.path.dirname(os.path.abspath(__file__))
        input_file = os.path.join(current_dir, "data/water.sdf")

        featurizer = CoulombMatrixEig(6, remove_hydrogens=False)

        input_file = os.path.join(self.current_dir, input_file)
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            mol_field="mol",
                            featurizer=featurizer,
                            verbosity="low")

        dataset = loader.featurize(input_file, self.data_dir)

        # Splits featurized samples into train/test
        splitter = RandomSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)
        assert len(train_dataset) == 8
        assert len(valid_dataset) == 1
        assert len(test_dataset) == 1
Example #8
    def test_move_load(self):
        """Test that datasets can be moved and loaded."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "data")
        moved_data_dir = os.path.join(self.base_dir, "moved_data")
        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        shutil.move(data_dir, moved_data_dir)

        moved_dataset = DiskDataset(moved_data_dir, reload=True)

        X_moved, y_moved, w_moved, ids_moved = (moved_dataset.X,
                                                moved_dataset.y,
                                                moved_dataset.w,
                                                moved_dataset.ids)

        np.testing.assert_allclose(X, X_moved)
        np.testing.assert_allclose(y, y_moved)
        np.testing.assert_allclose(w, w_moved)
        np.testing.assert_array_equal(ids, ids_moved)
Example #9
    def test_reshard_shuffle(self):
        """Test that datasets can be merged."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "dataset")

        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir, shard_size=2)

        X_orig, y_orig, w_orig, orig_ids = dataset.to_numpy()
        orig_len = len(dataset)

        dataset.reshard_shuffle(reshard_size=1)
        X_new, y_new, w_new, new_ids = dataset.to_numpy()

        assert len(dataset) == orig_len
        # The shuffling should have switched up the ordering
        assert not np.array_equal(orig_ids, new_ids)
        # But all the same entries should still be present
        assert sorted(orig_ids) == sorted(new_ids)
        # All the data should have same shape
        assert X_orig.shape == X_new.shape
        assert y_orig.shape == y_new.shape
        assert w_orig.shape == w_new.shape
Example #10
  def test_reshard_shuffle(self):
    """Test that datasets can be merged."""
    verbosity = "high"
    current_dir = os.path.dirname(os.path.realpath(__file__))
    data_dir = os.path.join(self.base_dir, "dataset")

    dataset_file = os.path.join(
        current_dir, "../../models/tests/example.csv")

    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir, shard_size=2)

    X_orig, y_orig, w_orig, orig_ids = dataset.to_numpy()
    orig_len = len(dataset)

    dataset.reshard_shuffle(reshard_size=1)
    X_new, y_new, w_new, new_ids = dataset.to_numpy()
    
    assert len(dataset) == orig_len
    # The shuffling should have switched up the ordering
    assert not np.array_equal(orig_ids, new_ids)
    # But all the same entries should still be present
    assert sorted(orig_ids) == sorted(new_ids)
    # All the data should have same shape
    assert X_orig.shape == X_new.shape
    assert y_orig.shape == y_new.shape
    assert w_orig.shape == w_new.shape
Example #11
  def random_test_train_valid_test_split_from_sdf(self):
    """Test of singletask CoulombMatrixEig regression on .sdf file."""
    splittype = "random"
    input_transforms = []
    output_transforms = ["normalize"]
    model_params = {}
    tasks = ["atomization_energy"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, "data/water.sdf")

    featurizer = CoulombMatrixEig(6, remove_hydrogens=False)

    input_file = os.path.join(self.current_dir, input_file)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        mol_field="mol",
                        featurizer=featurizer,
                        verbosity="low")

    dataset = loader.featurize(input_file, self.data_dir)

    # Splits featurized samples into train/test
    splitter = RandomSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)
    assert len(train_dataset) == 8
    assert len(valid_dataset) == 1
    assert len(test_dataset) == 1
Example #12
def featurize_and_split(input_file, feature_dir, samples_dir, train_dir,
                        test_dir, splittype, feature_types, input_transforms,
                        output_transforms, tasks, feature_files=None):
  """Featurize inputs with NNScore and do train-test split."""

  loader = DataLoader(tasks=tasks,
                      smiles_field="smiles",
                      protein_pdb_field="protein_pdb",
                      ligand_pdb_field="ligand_pdb",
                      verbose=True)
  
  if feature_files is None:
    print("About to featurize.")
    samples = loader.featurize(input_file, feature_dir,
                                   samples_dir, shard_size=8)
    print("Completed Featurization")
  else:
    # Transform data into arrays for ML
    samples = FeaturizedSamples(samples_dir, feature_files,
                                reload_data=False)

  # Split into train/test
  train_samples, test_samples = samples.train_test_split(
      splittype, train_dir, test_dir)
  print("Finished train test split.")
  train_dataset = Dataset(train_dir, train_samples, feature_types)
  test_dataset = Dataset(test_dir, test_samples, feature_types)
  print("Finished creating train test datasets")
  # Transforming train/test data
  train_dataset.transform(input_transforms, output_transforms)
  test_dataset.transform(input_transforms, output_transforms)
  print("Finished Transforming train test data.")

  return train_dataset, test_dataset
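
A hypothetical invocation of featurize_and_split; every path, the split type, the feature types, and the transform lists below are illustrative assumptions rather than values taken from the original code:

import os
import tempfile

base_dir = tempfile.mkdtemp()
train_dataset, test_dataset = featurize_and_split(
    # Assumed CSV with smiles, protein_pdb, and ligand_pdb columns.
    input_file="binding_samples.csv",
    feature_dir=os.path.join(base_dir, "features"),
    samples_dir=os.path.join(base_dir, "samples"),
    train_dir=os.path.join(base_dir, "train"),
    test_dir=os.path.join(base_dir, "test"),
    splittype="random",
    feature_types=["user_specified_features"],
    input_transforms=[],
    output_transforms=["normalize"],
    tasks=["label"])
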
Example #13
    def test_drop(self):
        """Test on dataset where RDKit fails on some strings."""
        # Set some global variables up top
        reload = True
        verbosity = "high"
        len_full = 25

        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "dataset")
        model_dir = os.path.join(self.base_dir, "model")

        print("About to load emols dataset.")
        dataset_file = os.path.join(current_dir, "mini_emols.csv")

        # Featurize emols dataset
        print("About to featurize datasets.")
        featurizer = CircularFingerprint(size=1024)
        emols_tasks = ['activity']

        loader = DataLoader(tasks=emols_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file,
                                   data_dir,
                                   debug=True,
                                   logging=False)

        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        print("ids.shape, X.shape, y.shape, w.shape")
        print(ids.shape, X.shape, y.shape, w.shape)
        assert len(X) == len(y) == len(w) == len(ids)
Example #14
  def test_graph_conv_singletask_classification_overfit(self):
    """Test graph-conv multitask overfits tiny data."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      n_tasks = 1
      n_samples = 10
      n_features = 3
      n_classes = 2
      
      # Load mini log-solubility dataset.
      splittype = "scaffold"
      featurizer = ConvMolFeaturizer()
      tasks = ["outcome"]
      task_type = "classification"
      task_types = {task: task_type for task in tasks}
      input_file = os.path.join(self.current_dir, "example_classification.csv")
      loader = DataLoader(tasks=tasks,
                          smiles_field=self.smiles_field,
                          featurizer=featurizer,
                          verbosity="low")
      dataset = loader.featurize(input_file, self.data_dir)

      verbosity = "high"
      classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)

      #n_atoms = 50
      n_feat = 71
      batch_size = 10
      graph_model = SequentialGraphModel(n_feat)
      graph_model.add(GraphConv(64, activation='relu'))
      graph_model.add(BatchNormalization(epsilon=1e-5, mode=1))
      graph_model.add(GraphPool())
      # Gather Projection
      graph_model.add(Dense(128, activation='relu'))
      graph_model.add(BatchNormalization(epsilon=1e-5, mode=1))
      graph_model.add(GraphGather(batch_size, activation="tanh"))

      with self.test_session() as sess:
        model = MultitaskGraphClassifier(
          sess, graph_model, n_tasks, self.model_dir, batch_size=batch_size,
          learning_rate=1e-3, learning_rate_decay_time=1000,
          optimizer_type="adam", beta1=.9, beta2=.999, verbosity="high")

        # Fit trained model
        model.fit(dataset, nb_epoch=20)
        model.save()

        # Eval model on train
        transformers = []
        evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

      ######################################################### DEBUG
      print("scores")
      print(scores)
      ######################################################### DEBUG
      assert scores[classification_metric.name] > .85
Example #15
  def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Tensorflow multitask deepchem classification API."""
    splittype = "scaffold"
    task_type = "classification"

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    transformers = []

    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict = {"activation": ["relu"],
                    "momentum": [.9],
                    "batch_size": [50],
                    "init": ["glorot_uniform"],
                    "data_shape": [train_dataset.get_data_shape()],
                    "learning_rate": [1e-3],
                    "decay": [1e-6],
                    "nb_hidden": [1000], 
                    "nb_epoch": [1],
                    "nesterov": [False],
                    "dropouts": [(.5,)],
                    "nb_layers": [1],
                    "batchnorm": [False],
                    "layer_sizes": [(1000,)],
                    "weight_init_stddevs": [(.1,)],
                    "bias_init_consts": [(1.,)],
                    "num_classes": [2],
                    "penalty": [0.], 
                    "optimizer": ["sgd"],
                    "num_classification_tasks": [len(task_types)]
                  }

    def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
        return TensorflowModel(
            tasks, task_types, params_dict, logdir, 
            tf_class=TensorflowMultiTaskClassifier,
            verbosity=verbosity)
    optimizer = HyperparamOpt(model_builder, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
Example #16
    def test_singletask_sklearn_rf_ECFP_regression_API(self):
        """Test of singletask RF ECFP regression API."""
        splittype = "scaffold"
        featurizer = CircularFingerprint(size=1024)
        model_params = {}
        tasks = ["log-solubility"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers
        model_params["data_shape"] = train_dataset.get_data_shape()
        regression_metrics = [
            Metric(metrics.r2_score),
            Metric(metrics.mean_squared_error),
            Metric(metrics.mean_absolute_error)
        ]

        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)
Example #17
def load_muv(base_dir, reload=True):
  """Load MUV datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load MUV dataset
  print("About to load MUV dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/muv.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize MUV dataset
  print("About to featurize MUV dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                          'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                          'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                          'MUV-466', 'MUV-832'])

  loader = DataLoader(tasks=all_MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_MUV_tasks, dataset, transformers
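
A minimal usage sketch for load_muv; the temporary base directory is an illustrative assumption:

import tempfile

muv_base_dir = tempfile.mkdtemp()
muv_tasks, muv_dataset, muv_transformers = load_muv(muv_base_dir)
print("Loaded %d MUV tasks and %d featurized compounds."
      % (len(muv_tasks), len(muv_dataset)))
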
Example #18
  def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      task_type = "classification"
      # TODO(rbharath): There should be some automatic check to ensure that all
      # required model_params are specified.
      # TODO(rbharath): Turning off dropout to make tests behave.
      model_params = {"nb_hidden": 10, "activation": "relu",
                      "dropout": .0, "learning_rate": .01,
                      "momentum": .9, "nesterov": False,
                      "decay": 1e-4, "batch_size": 5,
                      "nb_epoch": 2, "init": "glorot_uniform",
                      "nb_layers": 1, "batchnorm": False}

      input_file = os.path.join(self.current_dir, "multitask_example.csv")
      tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
               "task7", "task8", "task9", "task10", "task11", "task12",
               "task13", "task14", "task15", "task16"]
      task_types = {task: task_type for task in tasks}

      featurizer = CircularFingerprint(size=1024)

      loader = DataLoader(tasks=tasks,
                          smiles_field=self.smiles_field,
                          featurizer=featurizer,
                          verbosity="low")
      dataset = loader.featurize(input_file, self.data_dir)
      splitter = ScaffoldSplitter()
      train_dataset, test_dataset = splitter.train_test_split(
          dataset, self.train_dir, self.test_dir)

      transformers = []
      model_params["data_shape"] = train_dataset.get_data_shape()
      classification_metrics = [Metric(metrics.roc_auc_score),
                                Metric(metrics.matthews_corrcoef),
                                Metric(metrics.recall_score),
                                Metric(metrics.accuracy_score)]
      
      model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)

      # Fit trained model
      model.fit(train_dataset)
      model.save()

      # Eval model on train
      evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)

      # Eval model on test
      evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)
Example #19
def load_tox21(base_dir, reload=True):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  samples_dir = os.path.join(base_dir, "samples")
  data_dir = os.path.join(base_dir, "dataset")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                     'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                     'SR-HSE', 'SR-MMP', 'SR-p53']

  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=all_tox21_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir, shard_size=8192)
  else:
    dataset = Dataset(data_dir, all_tox21_tasks, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if not reload:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)
  
  return all_tox21_tasks, dataset, transformers
Example #20
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    model_params = {}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())
  

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #21
 def load_feat_multitask_data(self):
     """Load example with numerical features, tasks."""
     if os.path.exists(self.data_dir):
         shutil.rmtree(self.data_dir)
     features = ["feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
     featurizer = UserDefinedFeaturizer(features)
     tasks = ["task0", "task1", "task2", "task3", "task4", "task5"]
     input_file = os.path.join(
         self.current_dir, "../../models/tests/feat_multitask_example.csv")
     loader = DataLoader(tasks=tasks,
                         featurizer=featurizer,
                         id_field="id",
                         verbosity="low")
     return loader.featurize(input_file, self.data_dir)
Example #22
 def load_classification_data(self):
     """Loads classification data from example.csv"""
     if os.path.exists(self.data_dir):
         shutil.rmtree(self.data_dir)
     featurizer = CircularFingerprint(size=1024)
     tasks = ["outcome"]
     task_type = "classification"
     input_file = os.path.join(
         self.current_dir, "../../models/tests/example_classification.csv")
     loader = DataLoader(tasks=tasks,
                         smiles_field=self.smiles_field,
                         featurizer=featurizer,
                         verbosity="low")
     return loader.featurize(input_file, self.data_dir)
Example #23
    def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
        """Loads or reloads a small version of MUV dataset."""
        # Load MUV dataset
        raw_dataset = load_from_disk(dataset_file)
        print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

        print("About to featurize compounds")
        featurizer = CircularFingerprint(size=1024)
        MUV_tasks = [
            'MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644', 'MUV-548',
            'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712', 'MUV-737', 'MUV-858',
            'MUV-713', 'MUV-733', 'MUV-652', 'MUV-466', 'MUV-832'
        ]
        loader = DataLoader(tasks=MUV_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, self.data_dir)
        assert len(dataset) == len(raw_dataset)

        print("About to split compounds into train/valid/test")
        splitter = ScaffoldSplitter(verbosity=verbosity)
        frac_train, frac_valid, frac_test = .8, .1, .1
        train_dataset, valid_dataset, test_dataset = \
            splitter.train_valid_test_split(
                dataset, self.train_dir, self.valid_dir, self.test_dir,
                log_every_n=1000, frac_train=frac_train,
                frac_test=frac_test, frac_valid=frac_valid)
        # Do an approximate comparison since splits are sometimes slightly off from
        # the exact fraction.
        assert relative_difference(len(train_dataset),
                                   frac_train * len(dataset)) < 1e-3
        assert relative_difference(len(valid_dataset),
                                   frac_valid * len(dataset)) < 1e-3
        assert relative_difference(len(test_dataset),
                                   frac_test * len(dataset)) < 1e-3

        # TODO(rbharath): Transformers don't play nice with reload! Namely,
        # reloading will cause the transform to be reapplied. This is undesirable in
        # almost all cases. Need to understand a method to fix this.
        transformers = [
            BalancingTransformer(transform_w=True, dataset=train_dataset)
        ]
        print("Transforming datasets")
        for dataset in [train_dataset, valid_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        return (len(train_dataset), len(valid_dataset), len(test_dataset))
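
The split-size assertions above rely on a relative_difference helper that is not shown in this snippet; a plausible standalone sketch (an assumption, not the original implementation) is:

def relative_difference(a, b):
  """Presumed behavior: absolute difference scaled by the larger magnitude."""
  # Guard the degenerate case where both values are zero.
  denominator = max(abs(a), abs(b))
  if denominator == 0:
    return 0.0
  return abs(a - b) / denominator
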
Example #24
  def test_singletask_sklearn_rf_ECFP_regression_sharded_API(self):
    """Test of singletask RF ECFP regression API: sharded edition."""
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    model_params = {}
    tasks = ["label"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(
        self.current_dir, "../../../datasets/pdbbind_core_df.pkl.gz")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir, shard_size=50)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    # We set shard size above to force the creation of multiple shards of the data.
    # pdbbind_core has ~200 examples.
    model_params["data_shape"] = train_dataset.get_data_shape()
    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                         mode="regression",
                         model_instance=RandomForestRegressor())

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #25
    def test_log_solubility_dataset(self):
        """Test of loading for simple log-solubility dataset."""
        current_dir = os.path.dirname(os.path.realpath(__file__))
        input_file = "../../models/tests/example.csv"
        input_file = os.path.join(current_dir, input_file)

        tasks = ["log-solubility"]
        smiles_field = "smiles"
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=CircularFingerprint(size=1024),
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        assert len(dataset) == 10
Example #26
 def load_feat_multitask_data(self):
   """Load example with numerical features, tasks."""
   if os.path.exists(self.data_dir):
     shutil.rmtree(self.data_dir)
   features = ["feat0", "feat1", "feat2", "feat3", "feat4", "feat5"]
   featurizer = UserDefinedFeaturizer(features)
   tasks = ["task0", "task1", "task2", "task3", "task4", "task5"]
   input_file = os.path.join(
       self.current_dir, "../../models/tests/feat_multitask_example.csv")
   loader = DataLoader(
       tasks=tasks,
       featurizer=featurizer,
       id_field="id",
       verbosity="low")
   return loader.featurize(input_file, self.data_dir)
Example #27
 def load_sparse_multitask_dataset(self):
   """Load sparse tox multitask data, sample dataset."""
   if os.path.exists(self.data_dir):
     shutil.rmtree(self.data_dir)
   featurizer = CircularFingerprint(size=1024)
   tasks = ["task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9"]
   input_file = os.path.join(
       self.current_dir, "../../models/tests/sparse_multitask_example.csv")
   loader = DataLoader(
       tasks=tasks,
       smiles_field="smiles",
       featurizer=featurizer,
       verbosity="low")
   return loader.featurize(input_file, self.data_dir)
Example #28
  def test_log_solubility_dataset(self):
    """Test of loading for simple log-solubility dataset."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    input_file = "../../models/tests/example.csv"
    input_file = os.path.join(current_dir, input_file)

    tasks = ["log-solubility"]
    smiles_field = "smiles"
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=CircularFingerprint(size=1024),
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)
    
    assert len(dataset) == 10
Example #29
 def load_classification_data(self):
   """Loads classification data from example.csv"""
   if os.path.exists(self.data_dir):
     shutil.rmtree(self.data_dir)
   featurizer = CircularFingerprint(size=1024)
   tasks = ["outcome"]
   task_type = "classification"
   input_file = os.path.join(
       self.current_dir, "../../models/tests/example_classification.csv")
   loader = DataLoader(
       tasks=tasks,
       smiles_field=self.smiles_field,
       featurizer=featurizer,
       verbosity="low")
   return loader.featurize(input_file, self.data_dir)
Example #30
 def load_gaussian_cdf_data(self):
     """Load example with numbers sampled from Gaussian normal distribution.
    Each feature and task is a column of values that is sampled
    from a normal distribution of mean 0, stdev 1."""
     if os.path.exists(self.data_dir):
         shutil.rmtree(self.data_dir)
     features = ["feat0", "feat1"]
     featurizer = UserDefinedFeaturizer(features)
     tasks = ["task0", "task1"]
     input_file = os.path.join(
         self.current_dir, "../../models/tests/gaussian_cdf_example.csv")
     loader = DataLoader(tasks=tasks,
                         featurizer=featurizer,
                         id_field="id",
                         verbosity=None)
     return loader.featurize(input_file, self.data_dir)
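
A purely illustrative sketch of how a CSV with the structure described in the docstring above could be generated (each feature and task column drawn from a standard normal distribution); the row count and output filename are assumptions:

import numpy as np
import pandas as pd

n_rows = 1000
rng = np.random.default_rng(0)
columns = {"id": np.arange(n_rows)}
for name in ["feat0", "feat1", "task0", "task1"]:
  # Every feature and task column is sampled from N(0, 1).
  columns[name] = rng.standard_normal(n_rows)
pd.DataFrame(columns).to_csv("gaussian_cdf_example.csv", index=False)
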
Example #31
 def load_multitask_data(self):
   """Load example multitask data."""
   if os.path.exists(self.data_dir):
     shutil.rmtree(self.data_dir)
   featurizer = CircularFingerprint(size=1024)
   tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12",
            "task13", "task14", "task15", "task16"]
   input_file = os.path.join(
       self.current_dir, "../../models/tests/multitask_example.csv")
   loader = DataLoader(
       tasks=tasks,
       smiles_field=self.smiles_field,
       featurizer=featurizer,
       verbosity="low")
   return loader.featurize(input_file, self.data_dir)
Example #32
  def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
    """Loads or reloads a small version of MUV dataset."""
    # Load MUV dataset
    raw_dataset = load_from_disk(dataset_file)
    print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

    print("About to featurize compounds")
    featurizer = CircularFingerprint(size=1024)
    MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                 'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                 'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                 'MUV-466', 'MUV-832']
    loader = DataLoader(tasks=MUV_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, self.data_dir)
    assert len(dataset) == len(raw_dataset)

    print("About to split compounds into train/valid/test")
    splitter = ScaffoldSplitter(verbosity=verbosity)
    frac_train, frac_valid, frac_test = .8, .1, .1
    train_dataset, valid_dataset, test_dataset = \
        splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir,
            log_every_n=1000, frac_train=frac_train,
            frac_test=frac_test, frac_valid=frac_valid)
    # Do an approximate comparison since splits are sometimes slightly off from
    # the exact fraction.
    assert relative_difference(
        len(train_dataset), frac_train * len(dataset)) < 1e-3
    assert relative_difference(
        len(valid_dataset), frac_valid * len(dataset)) < 1e-3
    assert relative_difference(
        len(test_dataset), frac_test * len(dataset)) < 1e-3

    # TODO(rbharath): Transformers don't play nice with reload! Namely,
    # reloading will cause the transform to be reapplied. This is undesirable in
    # almost all cases. Need to understand a method to fix this.
    transformers = [
        BalancingTransformer(transform_w=True, dataset=train_dataset)]
    print("Transforming datasets")
    for dataset in [train_dataset, valid_dataset, test_dataset]:
      for transformer in transformers:
          transformer.transform(dataset)

    return (len(train_dataset), len(valid_dataset), len(test_dataset))
Example #33
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    sklearn_model = RandomForestRegressor()
    model = SklearnModel(sklearn_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #34
    def test_multitask_order(self):
        """Test that order of tasks in multitask datasets is preserved."""
        from deepchem.models.keras_models.fcnet import MultiTaskDNN
        splittype = "scaffold"
        output_transformers = []
        input_transformers = []
        task_type = "classification"
        # TODO(rbharath): There should be some automatic check to ensure that all
        # required model_params are specified.
        model_params = {
            "nb_hidden": 10,
            "activation": "relu",
            "dropout": .5,
            "learning_rate": .01,
            "momentum": .9,
            "nesterov": False,
            "decay": 1e-4,
            "batch_size": 5,
            "nb_epoch": 2,
            "init": "glorot_uniform",
            "nb_layers": 1,
            "batchnorm": False
        }

        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        task_types = {task: task_type for task in tasks}

        featurizer = CircularFingerprint(size=1024)

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        assert train_dataset.get_task_names() == tasks
        assert test_dataset.get_task_names() == tasks
Example #35
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)

    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    tensorflow_model = TensorflowMultiTaskClassifier(
        len(tasks), n_features, self.model_dir)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
Example #36
  def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
      task_type = "classification"
      input_file = os.path.join(self.current_dir, "multitask_example.csv")
      tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
               "task7", "task8", "task9", "task10", "task11", "task12",
               "task13", "task14", "task15", "task16"]

      n_features = 1024
      featurizer = CircularFingerprint(size=n_features)
      loader = DataLoader(tasks=tasks,
                          smiles_field=self.smiles_field,
                          featurizer=featurizer,
                          verbosity="low")
      dataset = loader.featurize(input_file, self.data_dir)
      splitter = ScaffoldSplitter()
      train_dataset, test_dataset = splitter.train_test_split(
          dataset, self.train_dir, self.test_dir)

      transformers = []
      classification_metrics = [Metric(metrics.roc_auc_score),
                                Metric(metrics.matthews_corrcoef),
                                Metric(metrics.recall_score),
                                Metric(metrics.accuracy_score)]
      
      keras_model = MultiTaskDNN(len(tasks), n_features, "classification",
                                 dropout=0.)
      model = KerasModel(keras_model, self.model_dir)

      # Fit trained model
      model.fit(train_dataset)
      model.save()

      # Eval model on train
      evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)

      # Eval model on test
      evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
      _ = evaluator.compute_model_performance(classification_metrics)
Example #37
    def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
        """Straightforward test of Keras multitask deepchem classification API."""
        task_type = "classification"
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]

        n_features = 1024
        featurizer = CircularFingerprint(size=n_features)
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        transformers = []
        metric = Metric(metrics.matthews_corrcoef,
                        np.mean,
                        mode="classification")
        params_dict = {"n_hidden": [5, 10]}

        def model_builder(model_params, model_dir):
            keras_model = MultiTaskDNN(len(tasks),
                                       n_features,
                                       task_type,
                                       dropout=0.,
                                       **model_params)
            return KerasModel(keras_model, model_dir)

        optimizer = HyperparamOpt(model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
Example #38
    def test_merge(self):
        """Test that datasets can be merged."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        first_data_dir = os.path.join(self.base_dir, "first_dataset")
        second_data_dir = os.path.join(self.base_dir, "second_dataset")
        merged_data_dir = os.path.join(self.base_dir, "merged_data")

        dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks, smiles_field="smiles", featurizer=featurizer, verbosity=verbosity)
        first_dataset = loader.featurize(dataset_file, first_data_dir)
        second_dataset = loader.featurize(dataset_file, second_data_dir)

        merged_dataset = Dataset.merge(merged_data_dir, [first_dataset, second_dataset])

        assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
Example #39
  def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
    """Straightforward test of Keras multitask deepchem classification API."""
    task_type = "classification"
    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    transformers = []
    metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
    params_dict= {"nb_hidden": [5, 10],
                  "activation": ["relu"],
                  "dropout": [.5],
                  "learning_rate": [.01],
                  "momentum": [.9],
                  "nesterov": [False],
                  "decay": [1e-4],
                  "batch_size": [5],
                  "nb_epoch": [2],
                  "init": ["glorot_uniform"],
                  "nb_layers": [1],
                  "batchnorm": [False],
                  "data_shape": [train_dataset.get_data_shape()]}
    
    optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types,
                              verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      metric, logdir=None)
Example #40
    def test_multitask_order(self):
        """Test that order of tasks in multitask datasets is preserved."""
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]

        featurizer = CircularFingerprint(size=1024)

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        assert train_dataset.get_task_names() == tasks
        assert test_dataset.get_task_names() == tasks
Exemplo n.º 41
0
    def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
        """Test of hyperparam_opt with singletask RF ECFP regression API."""
        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for dataset in [train_dataset, valid_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        params_dict = {"n_estimators": [10, 100]}
        metric = Metric(metrics.r2_score)

        def rf_model_builder(model_params, model_dir):
            sklearn_model = RandomForestRegressor(**model_params)
            return SklearnModel(sklearn_model, model_dir)

        optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
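A hedged sketch of how one might inspect the objects returned by the search above; it assumes (as in older DeepChem releases) that `all_results` maps each hyperparameter setting to its validation score, which this snippet does not spell out.

# Hypothetical follow-up inspection; the structure of all_results is an assumption.
print("Best hyperparameters: %s" % str(best_hyperparams))
for hp_setting, valid_score in all_results.items():
  print("%s -> %s" % (str(hp_setting), str(valid_score)))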
Exemplo n.º 42
0
    def test_merge(self):
        """Test that datasets can be merged."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        first_data_dir = os.path.join(self.base_dir, "first_dataset")
        second_data_dir = os.path.join(self.base_dir, "second_dataset")
        merged_data_dir = os.path.join(self.base_dir, "merged_data")

        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        first_dataset = loader.featurize(dataset_file, first_data_dir)
        second_dataset = loader.featurize(dataset_file, second_data_dir)

        merged_dataset = DiskDataset.merge(merged_data_dir,
                                           [first_dataset, second_dataset])

        assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
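A hedged follow-up check that could be appended to the merge test above (not in the original): it assumes DiskDataset.merge preserves the per-compound ids exposed through the `.ids` property used in the other dataset tests.

# Assumed invariant, not asserted in the original test.
assert sorted(merged_dataset.ids) == sorted(
    np.concatenate([first_dataset.ids, second_dataset.ids]))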
Exemplo n.º 43
0
  def load_solubility_data(self):
    """Loads solubility data from example.csv"""
    if os.path.exists(self.data_dir):
      shutil.rmtree(self.data_dir)
    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    task_type = "regression"
    input_file = os.path.join(self.current_dir, "../../models/tests/example.csv")
    loader = DataLoader(
        tasks=tasks,
        smiles_field=self.smiles_field,
        featurizer=featurizer,
        verbosity="low")

    return loader.featurize(input_file, self.data_dir)
Exemplo n.º 44
0
  def test_multitask_order(self):
    """Test that order of tasks in multitask datasets is preserved."""
    from deepchem.models.keras_models.fcnet import MultiTaskDNN
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {"nb_hidden": 10, "activation": "relu",
                    "dropout": .5, "learning_rate": .01,
                    "momentum": .9, "nesterov": False,
                    "decay": 1e-4, "batch_size": 5,
                    "nb_epoch": 2, "init": "glorot_uniform",
                    "nb_layers": 1, "batchnorm": False}

    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
             "task7", "task8", "task9", "task10", "task11", "task12",
             "task13", "task14", "task15", "task16"]
    task_types = {task: task_type for task in tasks}

    featurizer = CircularFingerprint(size=1024)

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
  
    assert train_dataset.get_task_names() == tasks
    assert test_dataset.get_task_names() == tasks
Exemplo n.º 45
0
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load the SWEET dataset and split it into train/valid subsets."""
  # Set some local variables up top
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(
      current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()

  loader = DataLoader(tasks=SWEET_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, SWEET_tasks)
  
  return SWEET_tasks, (train_dataset, valid_dataset), transformers
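A minimal usage sketch for `load_sweet` above, assuming `sweet.csv.gz` sits next to the script and the DeepChem imports used inside the function are already in scope; the scratch directory name is illustrative only.

import tempfile

# Hypothetical scratch location; any writable directory works.
sweet_base_dir = tempfile.mkdtemp(prefix="sweet_analysis_")
sweet_tasks, (train_dataset, valid_dataset), transformers = load_sweet(
    sweet_base_dir, reload=False, frac_train=.8)
print("Loaded %d SWEET tasks" % len(sweet_tasks))
print("Train/valid sizes: %d / %d" % (len(train_dataset), len(valid_dataset)))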
Exemplo n.º 46
0
  def test_multiload(self):
    """Check can re-use featurization for multiple task selections.

    TODO(rbharath): This test seems silly after the recent round of
                    refactoring. Can it be removed?
    """
    # Only for debug!
    np.random.seed(123)

    # Set some global variables up top
    reload = True
    verbosity = "high"


    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(self.base_dir, "dataset")
    train_dir = os.path.join(self.base_dir, "train_dataset")
    valid_dir = os.path.join(self.base_dir, "valid_dataset")
    test_dir = os.path.join(self.base_dir, "test_dataset")
    model_dir = os.path.join(self.base_dir, "model")

    # Load dataset
    print("About to load dataset.")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/multitask_example.csv")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize tox21 dataset
    print("About to featurize dataset.")
    featurizer = CircularFingerprint(size=1024)
    all_tasks = ["task%d"%i for i in range(17)] 

    ####### Do featurization
    loader = DataLoader(tasks=all_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(
        dataset_file, data_dir)

    # Do train/valid split.
    X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()


    ####### Do singletask load
    y_tasks, w_tasks = [], []
    for ind, task in enumerate(all_tasks):
      print("Processing task %s" % task)
      dataset = Dataset(data_dir, verbosity=verbosity, reload=reload)

      X_task, y_task, w_task, ids_task = dataset.to_numpy()
      y_tasks.append(y_task[:, ind])
      w_tasks.append(w_task[:, ind])

    ################## Do comparison
    for ind, task in enumerate(all_tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
Exemplo n.º 47
0
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    splittype = "scaffold"
    output_transformers = []
    input_transformers = []
    task_type = "classification"

    featurizer = CircularFingerprint(size=1024)

    tasks = ["outcome"]
    task_type = "classification"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    model_params = {
      "batch_size": 2,
      "num_classification_tasks": 1,
      "num_features": 1024,
      "layer_sizes": [1024],
      "weight_init_stddevs": [1.],
      "bias_init_consts": [0.],
      "dropouts": [.5],
      "num_classes": 2,
      "nb_epoch": 1,
      "penalty": 0.0,
      "optimizer": "adam",
      "learning_rate": .001,
      "data_shape": train_dataset.get_data_shape()
    }
    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    model = TensorflowModel(
        tasks, task_types, model_params, self.model_dir,
        tf_class=TensorflowMultiTaskClassifier)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
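A hedged extension of the evaluation step above: assuming, as in older DeepChem versions, that compute_model_performance returns a dict keyed by metric name, the scores could be printed instead of discarded.

# Assumes compute_model_performance returns a {metric_name: value} dict.
test_scores = evaluator.compute_model_performance(classification_metrics)
for metric_name, value in test_scores.items():
  print("%s on test set: %s" % (metric_name, str(value)))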
Exemplo n.º 48
0
def load_nci(base_dir, reload=True, force_transform=False):
  """Load NCI datasets. Does not do train/test split"""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      print("Deleting dir in nci_datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file1_path = os.path.join(
      current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")
  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_sharded_csv(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize nci dataset
  print("About to featurize nci dataset.")
  featurizer = CircularFingerprint(size=1024)
  # Originally a sorted list (as in muv_datasets.py), but the CSV columns are
  # already ordered, so the sort was dropped.
  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                    'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                    'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
                    'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',
                    'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',
                    'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',
                    'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
                    'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
                    'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
                    'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                    'T-47D'])

  loader = DataLoader(tasks=all_nci_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_paths, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = []
  if regen or force_transform:
    print("About to transform data")
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=dataset)]
    for transformer in transformers:
      transformer.transform(dataset)

  return all_nci_tasks, dataset, transformers
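A minimal usage sketch for `load_nci` above, assuming the `../../datasets/nci_1.csv.gz` and `nci_2.csv.gz` files are present relative to the script; the temporary base directory is an illustrative choice.

import tempfile

nci_base_dir = tempfile.mkdtemp(prefix="nci_")
nci_tasks, nci_dataset, nci_transformers = load_nci(nci_base_dir, reload=False)
print("Loaded %d NCI cell-line tasks covering %d compounds" %
      (len(nci_tasks), len(nci_dataset)))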
Exemplo n.º 49
0
def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]

  current_dir = os.path.dirname(os.path.realpath(__file__))
  if split == "20-80":
    dataset_file = os.path.join(
        current_dir, "../../datasets/desc_canvas_aug30.csv")
  elif split == "80-20":
    dataset_file = os.path.join(
        current_dir, "../../datasets/rev8020split_desc.csv")
  dataset = load_from_disk(dataset_file)
  num_display = 10
  pretty_columns = (
      "[" + ",".join(["'%s'" % column
                      for column in dataset.columns.values[:num_display]])
      + ",...]")

  crystal_dataset_file = os.path.join(
      current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
  crystal_dataset = load_from_disk(crystal_dataset_file)

  print("Columns of dataset: %s" % pretty_columns)
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  print("Number of examples in crystal dataset: %s" %
  str(crystal_dataset.shape[0]))

  #Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")

  if mode == "regression":
    bace_tasks = ["pIC50"]
  elif mode == "classification":
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)
  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataLoader(tasks=bace_tasks,
                      smiles_field="mol",
                      id_field="CID",
                      featurizer=featurizer)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)


  if (not reload or not os.path.exists(train_dir) or not os.path.exists(valid_dir)
      or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  # NOTE: for the "20-80" split, the validation and test sets are intentionally swapped.
  if split == "20-80":
    valid_dataset, test_dataset = test_dataset, valid_dataset
  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
  print(len(valid_dataset))
  print("Number of compounds in test set")
  print(len(test_dataset))
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = []
    if mode == "regression":
      output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    else:
      output_transformers = []
  else:
    input_transformers, output_transformers = [], []
  
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, valid_dataset, test_dataset, crystal_dataset]:
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
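A minimal usage sketch for `load_bace` above, assuming the Canvas descriptor CSVs referenced in the function are available and `user_specified_features` is defined in the surrounding module.

# Hypothetical call; the mode/split values are taken from the assertions above.
(bace_tasks, train_dataset, valid_dataset, test_dataset,
 crystal_dataset, output_transformers) = load_bace(
     mode="classification", transform=False, split="20-80")
print("BACE tasks: %s" % str(bace_tasks))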
Exemplo n.º 50
0
  def test_singletask_matches_multitask_load(self):
    """Check that singletask load and multitask load of dataset are same."""
    # Only for debug!
    np.random.seed(123)

    # Set some global variables up top
    reload = True
    verbosity = "high"

    base_dir = tempfile.mkdtemp()

    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")
    model_dir = os.path.join(base_dir, "model")

    # Load dataset
    print("About to load dataset.")
    dataset_file = os.path.join(
        current_dir, "../../models/tests/multitask_example.csv")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize tox21 dataset
    print("About to featurize dataset.")
    featurizer = CircularFingerprint(size=1024)
    all_tasks = ["task%d"%i for i in range(17)] 
    # For debugging purposes
    n_tasks = 17 
    tasks = all_tasks[0:n_tasks]

    ####### Do multitask load
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir)

    # Do train/valid split.
    X_multi, y_multi, w_multi, ids_multi = dataset.to_numpy()


    ####### Do singletask load
    y_tasks, w_tasks, ids_tasks = [], [], []
    for task in tasks:
      print("Processing task %s" % task)
      if os.path.exists(data_dir):
        shutil.rmtree(data_dir)
      loader = DataLoader(tasks=[task],
                          smiles_field="smiles",
                          featurizer=featurizer,
                          verbosity=verbosity)
      dataset = loader.featurize(dataset_file, data_dir)

      X_task, y_task, w_task, ids_task = dataset.to_numpy()
      y_tasks.append(y_task)
      w_tasks.append(w_task)
      ids_tasks.append(ids_task)

    ################## Do comparison
    for ind, task in enumerate(tasks):
      y_multi_task = y_multi[:, ind]
      w_multi_task = w_multi[:, ind]

      y_task = y_tasks[ind]
      w_task = w_tasks[ind]
      ids_task = ids_tasks[ind]

      np.testing.assert_allclose(y_multi_task.flatten(), y_task.flatten())
      np.testing.assert_allclose(w_multi_task.flatten(), w_task.flatten())
    shutil.rmtree(base_dir)
Exemplo n.º 51
0
featurized_samples_file = os.path.join(data_dir, "featurized_samples.joblib")

feature_dir = os.path.join(base_dir, "features")
if not os.path.exists(feature_dir):
    os.makedirs(feature_dir)

samples_dir = os.path.join(base_dir, "samples")
if not os.path.exists(samples_dir):
    os.makedirs(samples_dir)



featurizers = compound_featurizers + complex_featurizers
featurizer = DataLoader(tasks=["label"],
                        smiles_field="smiles",
                        protein_pdb_field="protein_pdb",
                        ligand_pdb_field="ligand_pdb",
                        compound_featurizers=compound_featurizers,
                        complex_featurizers=complex_featurizers,
                        id_field="complex_id",
                        verbose=False)
from ipyparallel import Client
c = Client()
print("c.ids")
print(c.ids)
dview = c[:]
featurized_samples = loader.featurize(dataset_file, feature_dir, samples_dir,
                                      worker_pool=dview, shard_size=1024)

save_to_disk(featurized_samples, featurized_samples_file)
Exemplo n.º 52
0
def load_pcba(base_dir, reload=True):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some local variables up top
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  all_PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  loader = DataLoader(tasks=all_PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]

  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)
  
  return all_PCBA_tasks, dataset, transformers
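A minimal usage sketch for `load_pcba` above, assuming `../../datasets/pcba.csv.gz` is present relative to the script; the scratch directory name is illustrative.

import tempfile

pcba_base_dir = tempfile.mkdtemp(prefix="pcba_")
pcba_tasks, pcba_dataset, pcba_transformers = load_pcba(pcba_base_dir, reload=False)
print("PCBA: %d assay tasks, %d molecules" % (len(pcba_tasks), len(pcba_dataset)))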