Example #1
  def _to_singletask(dataset, task_dirs):
    """Transforms a multitask dataset to a collection of singletask datasets."""
    tasks = dataset.get_task_names()
    assert len(tasks) == len(task_dirs)
    log("Splitting multitask dataset into singletask datasets", dataset.verbosity)
    task_metadata_rows = {task: [] for task in tasks}
    for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
      log("Processing shard %d" % shard_num, dataset.verbosity)
      basename = "dataset-%d" % shard_num
      for task_num, task in enumerate(tasks):
        log("\tTask %s" % task, dataset.verbosity)
        w_task = w[:, task_num]
        y_task = y[:, task_num]

        # Extract those datapoints which are present for this task
        X_nonzero = X[w_task != 0]
        num_datapoints = X_nonzero.shape[0]
        y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
        w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
        ids_nonzero = ids[w_task != 0]

        if X_nonzero.size > 0: 
          task_metadata_rows[task].append(
            DiskDataset.write_data_to_disk(
                task_dirs[task_num], basename, [task],
                X_nonzero, y_nonzero, w_nonzero, ids_nonzero))
    
    task_datasets = [
        DiskDataset(data_dir=task_dirs[task_num],
                metadata_rows=task_metadata_rows[task],
                verbosity=dataset.verbosity)
        for (task_num, task) in enumerate(tasks)]
    return task_datasets
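For context, here is a minimal, hedged usage sketch of the helper above. The import paths and the temporary-directory setup are assumptions that follow the DeepChem-era API used throughout these examples; the cleanup mirrors the `test_to_singletask` test later in this collection.

import shutil
import tempfile

import numpy as np
# Import paths are a guess for this DeepChem era; adjust to the local layout.
from deepchem.datasets import DiskDataset
from deepchem.models.multitask import SingletaskToMultitask

n_samples, n_features, n_tasks = 20, 5, 3
X = np.random.rand(n_samples, n_features)
y = np.random.randint(2, size=(n_samples, n_tasks))
w = np.random.randint(2, size=(n_samples, n_tasks))  # zero weight == missing label
ids = np.arange(n_samples)

train_dir = tempfile.mkdtemp()
dataset = DiskDataset.from_numpy(train_dir, X, y, w, ids)

task_dirs = [tempfile.mkdtemp() for _ in range(n_tasks)]
try:
    singletask_datasets = SingletaskToMultitask._to_singletask(dataset, task_dirs)
    for task, task_dataset in enumerate(singletask_datasets):
        # Each singletask dataset keeps only rows with nonzero weight for that task.
        assert len(task_dataset) == np.count_nonzero(w[:, task])
finally:
    for directory in task_dirs + [train_dir]:
        shutil.rmtree(directory)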
Example #2
    def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
        """Test of hyperparam_opt with singletask_to_multitask."""
        tasks = [
            "task0", "task1", "task2", "task3", "task4", "task5", "task6",
            "task7", "task8", "task9", "task10", "task11", "task12", "task13",
            "task14", "task15", "task16"
        ]
        input_file = "multitask_example.csv"

        n_features = 10
        n_tasks = len(tasks)
        # Define train dataset
        n_train = 100
        X_train = np.random.rand(n_train, n_features)
        y_train = np.random.randint(2, size=(n_train, n_tasks))
        w_train = np.ones_like(y_train)
        ids_train = ["C"] * n_train

        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train, w_train, ids_train,
                                               tasks)

        # Define validation dataset
        n_valid = 10
        X_valid = np.random.rand(n_valid, n_features)
        y_valid = np.random.randint(2, size=(n_valid, n_tasks))
        w_valid = np.ones_like(y_valid)
        ids_valid = ["C"] * n_valid
        valid_dataset = DiskDataset.from_numpy(self.valid_dir, X_valid,
                                               y_valid, w_valid, ids_valid,
                                               tasks)

        transformers = []
        classification_metric = Metric(metrics.matthews_corrcoef,
                                       np.mean,
                                       mode="classification")
        params_dict = {"n_estimators": [1, 10]}

        def multitask_model_builder(model_params, model_dir):
            def model_builder(model_dir):
                sklearn_model = RandomForestClassifier(**model_params)
                return SklearnModel(sklearn_model, model_dir)

            return SingletaskToMultitask(tasks, model_builder, model_dir)

        optimizer = HyperparamOpt(multitask_model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            classification_metric,
            logdir=None)
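A hedged sketch of how the returned values might be inspected; the exact structure of `all_results` depends on the HyperparamOpt version, so treat the dict-style iteration below as an assumption.

print("Best hyperparameters: %s" % str(best_hyperparams))
# Assumption: all_results maps hyperparameter settings to validation scores.
for hyperparams, score in all_results.items():
    print("%s -> validation score: %s" % (str(hyperparams), str(score)))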
Example #3
    def test_sklearn_multitask_classification(self):
        """Test that sklearn models can learn on simple multitask classification."""
        np.random.seed(123)
        n_tasks = 4
        tasks = range(n_tasks)
        dataset = sklearn.datasets.load_digits(n_class=2)
        X, y = dataset.data, dataset.target
        y = np.reshape(y, (len(y), 1))
        y = np.hstack([y] * n_tasks)

        frac_train = .7
        n_samples = len(X)
        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train)
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)

        def model_builder(model_dir):
            sklearn_model = LogisticRegression()
            return SklearnModel(sklearn_model, model_dir)

        model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [classification_metric])

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        for score in scores[classification_metric.name]:
            assert score > .5
Example #4
    def test_sklearn_transformed_regression(self):
        """Test that sklearn models can learn on simple transformed regression datasets."""
        np.random.seed(123)
        dataset = sklearn.datasets.load_diabetes()
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)
        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train)
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

        # Eval model on train
        transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset),
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for data in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(data)

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        sklearn_model = LinearRegression()
        model = SklearnModel(sklearn_model, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        assert train_scores[regression_metric.name] > .5

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        assert scores[regression_metric.name] > .5
Example #5
    def test_singletask_to_multitask_classification(self):
        n_features = 10
        n_tasks = 17
        tasks = range(n_tasks)
        # Define train dataset
        n_train = 100
        X_train = np.random.rand(n_train, n_features)
        y_train = np.random.randint(2, size=(n_train, n_tasks))
        w_train = np.ones_like(y_train)
        ids_train = ["C"] * n_train
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train, w_train, ids_train)

        # Define test dataset
        n_test = 10
        X_test = np.random.rand(n_test, n_features)
        y_test = np.random.randint(2, size=(n_test, n_tasks))
        w_test = np.ones_like(y_test)
        ids_test = ["C"] * n_test
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test,
                                              w_test, ids_test)

        transformers = []
        classification_metrics = [Metric(metrics.roc_auc_score)]

        def model_builder(model_dir):
            sklearn_model = LogisticRegression()
            return SklearnModel(sklearn_model, model_dir)

        multitask_model = SingletaskToMultitask(tasks, model_builder,
                                                self.model_dir)

        # Fit trained model
        multitask_model.fit(train_dataset)
        multitask_model.save()

        # Eval multitask_model on train
        evaluator = Evaluator(multitask_model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval multitask_model on test
        evaluator = Evaluator(multitask_model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
Example #6
    def test_singletask_stratified_k_fold_split(self):
        """
    Test RandomStratifiedSplitter k-fold class.
    """
        n_samples = 100
        n_positives = 20
        n_features = 10
        n_tasks = 1

        X = np.random.rand(n_samples, n_features)
        y = np.zeros(n_samples)
        y[:n_positives] = 1
        w = np.ones(n_samples)
        ids = np.arange(n_samples)

        data_dir = tempfile.mkdtemp()
        dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)

        stratified_splitter = RandomStratifiedSplitter()
        ids_set = set(dataset.ids)

        K = 5
        fold_dirs = [tempfile.mkdtemp() for i in range(K)]
        fold_datasets = stratified_splitter.k_fold_split(dataset, fold_dirs)

        for fold in range(K):
            fold_dataset = fold_datasets[fold]
            # Verify each fold has n_samples/K == 20 examples.
            # Note: this check wouldn't hold in general for multitask
            # stratified splits, so it is left commented out.
            # assert len(fold_dataset) == n_samples/K
            fold_labels = fold_dataset.y
            # Verify that each fold has n_positives/K = 4 positive examples.
            assert np.count_nonzero(fold_labels == 1) == n_positives / K
            # Verify that compounds in this fold are subset of original compounds
            fold_ids_set = set(fold_dataset.ids)
            assert fold_ids_set.issubset(ids_set)
            # Verify that no two folds have overlapping compounds.
            for other_fold in range(K):
                if fold == other_fold:
                    continue
                other_fold_dataset = fold_datasets[other_fold]
                other_fold_ids_set = set(other_fold_dataset.ids)
                assert fold_ids_set.isdisjoint(other_fold_ids_set)

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
        assert len(merged_dataset) == len(dataset)
        assert sorted(merged_dataset.ids) == (sorted(dataset.ids))
Example #7
  def test_samples_move(self):
    """Test that featurized samples can be moved and reloaded."""
    verbosity = "high"
    data_dir = os.path.join(self.base_dir, "data")
    moved_data_dir = os.path.join(self.base_dir, "moved_data")
    dataset_file = os.path.join(
        self.current_dir, "example.csv")

    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    loader = DataLoader(tasks=tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    featurized_dataset = loader.featurize(
        dataset_file, data_dir)
    n_dataset = len(featurized_dataset)
  
    # Now perform move
    shutil.move(data_dir, moved_data_dir)

    moved_featurized_dataset = DiskDataset(
        data_dir=moved_data_dir, reload=True)

    assert len(moved_featurized_dataset) == n_dataset
Example #8
    def test_shuffle_shards(self):
        """Test that shuffle_shards works."""
        n_samples = 100
        n_tasks = 10
        n_features = 10

        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.random.randint(2, size=(n_samples, n_tasks))
        ids = np.arange(n_samples)
        dataset = DiskDataset.from_numpy(self.data_dir, X, y, w, ids)
        dataset.reshard(shard_size=10)
        dataset.shuffle_shards()

        X_s, y_s, w_s, ids_s = (dataset.X, dataset.y, dataset.w, dataset.ids)

        assert X_s.shape == X.shape
        assert y_s.shape == y.shape
        assert ids_s.shape == ids.shape
        assert w_s.shape == w.shape

        # The ids should now store the performed permutation. Check that the
        # original dataset is recoverable.
        for i in range(n_samples):
            np.testing.assert_array_equal(X_s[i], X[ids_s[i]])
            np.testing.assert_array_equal(y_s[i], y[ids_s[i]])
            np.testing.assert_array_equal(w_s[i], w[ids_s[i]])
            np.testing.assert_array_equal(ids_s[i], ids[ids_s[i]])
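The final loop relies on the shuffled ids acting as a record of the applied permutation. A tiny standalone numpy sketch of that invariant (names here are illustrative):

import numpy as np

# If the shuffled ids record the applied permutation, indexing the original
# arrays with those ids reproduces the shuffled arrays exactly.
X = np.arange(10).reshape(5, 2)
perm = np.random.permutation(5)
X_shuffled, ids_shuffled = X[perm], np.arange(5)[perm]

np.testing.assert_array_equal(X_shuffled, X[ids_shuffled])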
Example #9
    def test_move_load(self):
        """Test that datasets can be moved and loaded."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        data_dir = os.path.join(self.base_dir, "data")
        moved_data_dir = os.path.join(self.base_dir, "moved_data")
        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        shutil.move(data_dir, moved_data_dir)

        moved_dataset = DiskDataset(moved_data_dir, reload=True)

        X_moved, y_moved, w_moved, ids_moved = (moved_dataset.X,
                                                moved_dataset.y,
                                                moved_dataset.w,
                                                moved_dataset.ids)

        np.testing.assert_allclose(X, X_moved)
        np.testing.assert_allclose(y, y_moved)
        np.testing.assert_allclose(w, w_moved)
        np.testing.assert_array_equal(ids, ids_moved)
Example #10
    def test_singletask_stratified_split(self):
        """
    Test RandomStratifiedSplitter on a singletask split.
    """
        np.random.seed(2314)
        # Test singletask case.
        n_samples = 20
        n_positives = 10
        n_features = 10
        n_tasks = 1

        X = np.random.rand(n_samples, n_features)
        y = np.zeros((n_samples, n_tasks))
        y[:n_positives] = 1
        w = np.ones((n_samples, n_tasks))
        ids = np.arange(n_samples)
        data_dir = tempfile.mkdtemp()
        dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)

        stratified_splitter = RandomStratifiedSplitter()
        split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]
        dataset_1, dataset_2 = stratified_splitter.split(dataset,
                                                         split_dirs,
                                                         frac_split=.5)

        # Should have split cleanly in half (picked random seed to ensure this)
        assert len(dataset_1) == 10
        assert len(dataset_2) == 10

        # Check positives are correctly distributed
        y_1 = dataset_1.y
        assert np.count_nonzero(y_1) == n_positives / 2

        y_2 = dataset_2.y
        assert np.count_nonzero(y_2) == n_positives / 2
Example #11
  def test_sklearn_multitask_regression_overfit(self):
    """Test SKLearn singletask-to-multitask overfits tiny regression data."""
    n_tasks = 2
    tasks = ["task%d" % task for task in range(n_tasks)]
    n_samples = 10
    n_features = 3
    
    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.rand(n_samples, n_tasks)
    w = np.ones((n_samples, n_tasks))

    dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids)

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity,
                               task_averager=np.mean)

    def model_builder(model_dir):
      sklearn_model = RandomForestRegressor()
      return SklearnModel(sklearn_model, model_dir)

    model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
Example #12
    def test_singletask_random_k_fold_split(self):
        """
    Test singletask RandomSplitter class.
    """
        solubility_dataset = self.load_solubility_data()
        random_splitter = RandomSplitter()
        ids_set = set(solubility_dataset.ids)

        K = 5
        fold_dirs = [tempfile.mkdtemp() for i in range(K)]
        fold_datasets = random_splitter.k_fold_split(solubility_dataset,
                                                     fold_dirs)
        for fold in range(K):
            fold_dataset = fold_datasets[fold]
            # Verify each fold has 10/K == 2 examples.
            assert len(fold_dataset) == 2
            # Verify that compounds in this fold are subset of original compounds
            fold_ids_set = set(fold_dataset.ids)
            assert fold_ids_set.issubset(ids_set)
            # Verify that no two folds have overlapping compounds.
            for other_fold in range(K):
                if fold == other_fold:
                    continue
                other_fold_dataset = fold_datasets[other_fold]
                other_fold_ids_set = set(other_fold_dataset.ids)
                assert fold_ids_set.isdisjoint(other_fold_ids_set)

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
        assert len(merged_dataset) == len(solubility_dataset)
        assert sorted(merged_dataset.ids) == (sorted(solubility_dataset.ids))
Example #13
    def transform(self, dataset, bins):
        """Performs CDF transform on data."""
        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        w_t = w
        ids_t = ids
        # Default to pass-through so X_t and y_t are always defined below.
        X_t = X
        y_t = y
        if self.transform_X:
            # The bins argument is accepted for caller compatibility;
            # self.bins is used here.
            X_t = get_cdf_values(X, self.bins)
        if self.transform_y:
            print("y will not be transformed by CDFTransformer, for now.")
            # y_t = get_cdf_values(y, self.bins)
        # TODO (rbharath): Find a more elegant solution to saving the data?
        shutil.rmtree(dataset.data_dir)
        os.makedirs(dataset.data_dir)
        DiskDataset.from_numpy(dataset.data_dir, X_t, y_t, w_t, ids_t)
Example #14
    def test_sklearn_classification(self):
        """Test that sklearn models can learn on simple classification datasets."""
        np.random.seed(123)
        dataset = sklearn.datasets.load_digits(n_class=2)
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)
        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train)
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        sklearn_model = LogisticRegression()
        model = SklearnModel(sklearn_model, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        transformers = []
        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [classification_metric])

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])
        assert scores[classification_metric.name] > .5
Example #15
def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
    """Load PDBBind datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False
    neighbor_cutoff = 4
    max_num_neighbors = 10

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")

    # Load PDBBind dataset
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
    pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
    tasks = ["-logKd/Ki"]
    print("About to load contents.")
    contents_df = load_pdbbind_labels(labels_file)
    ids = contents_df["PDB code"].values
    y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

    # Define featurizers
    featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors,
                                                      neighbor_cutoff)

    # Featurize Dataset
    features = []
    for ind, pdb_code in enumerate(ids):
        print("Processing %s" % str(pdb_code))
        pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
        computed_feature = compute_pdbbind_coordinate_features(
            featurizer, pdb_subdir, pdb_code)
        features.append(computed_feature)
    X = np.array(features, dtype=object)
    w = np.ones_like(y)

    dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
    transformers = []

    return tasks, dataset, transformers
Example #16
    def transform(self, dataset):
        """Performs power transform on data."""
        X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
        w_t = w
        ids_t = ids
        # Default to pass-through so X_t and y_t are always defined below.
        X_t = X
        y_t = y
        n_powers = len(self.powers)
        if self.transform_X:
            X_t = np.power(X, self.powers[0])
            for i in range(1, n_powers):
                X_t = np.hstack((X_t, np.power(X, self.powers[i])))
        if self.transform_y:
            print("y will not be transformed by PowerTransformer, for now.")
            # y_t = np.power(y, self.powers[0])
            # for i in range(1, n_powers):
            #     y_t = np.hstack((y_t, np.power(y, self.powers[i])))

        # TODO (rbharath): Find a more elegant solution to saving the data?
        shutil.rmtree(dataset.data_dir)
        os.makedirs(dataset.data_dir)
        DiskDataset.from_numpy(dataset.data_dir, X_t, y_t, w_t, ids_t)
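A standalone numpy illustration of the column layout the `transform_X` branch above produces, assuming `powers = [1, 2, 0.5]` as in the transformer test further down:

import numpy as np

powers = [1, 2, 0.5]
X = np.random.rand(4, 3)  # values in [0, 1) keep the 0.5 power real-valued

# hstack of X raised to each power: columns 0-2 are X**1, 3-5 are X**2, 6-8 are X**0.5.
X_t = np.hstack([np.power(X, p) for p in powers])

assert X_t.shape == (4, 3 * len(powers))
np.testing.assert_allclose(X_t[:, :3], X)
np.testing.assert_allclose(X_t[:, 3:6], X ** 2)
np.testing.assert_allclose(X_t[:, 6:], np.sqrt(X))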
Example #17
    def test_multitask_data(self):
        """Test that data associated with a tasks stays associated with it."""
        tasks = ["task0", "task1"]
        n_samples = 100
        n_features = 3
        n_tasks = len(tasks)

        # Generate dummy dataset
        ids = np.array(["C"] * n_samples, dtype=object)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))

        dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
        np.testing.assert_allclose(X, dataset.X)
        np.testing.assert_allclose(y, dataset.y)
        np.testing.assert_allclose(w, dataset.w)
Example #18
    def test_singletask_index_split(self):
        """
    Test singletask IndexSplitter class.
    """
        solubility_dataset = self.load_solubility_data()
        random_splitter = IndexSplitter()
        train_data, valid_data, test_data = \
            random_splitter.train_valid_test_split(
                solubility_dataset,
                self.train_dir, self.valid_dir, self.test_dir,
                frac_train=0.8, frac_valid=0.1, frac_test=0.1)
        assert len(train_data) == 8
        assert len(valid_data) == 1
        assert len(test_data) == 1

        merge_dir = tempfile.mkdtemp()
        merged_dataset = DiskDataset.merge(merge_dir,
                                           [train_data, valid_data, test_data])
        assert sorted(merged_dataset.ids) == (sorted(solubility_dataset.ids))
Example #19
    def test_select(self):
        """Test that dataset select works."""
        num_datapoints = 10
        num_features = 10
        num_tasks = 1
        X = np.random.rand(num_datapoints, num_features)
        y = np.random.randint(2, size=(num_datapoints, num_tasks))
        w = np.ones((num_datapoints, num_tasks))
        ids = np.array(["id"] * num_datapoints)
        dataset = DiskDataset.from_numpy(self.data_dir, X, y, w, ids)

        select_dir = tempfile.mkdtemp()
        indices = [0, 4, 5, 8]
        select_dataset = dataset.select(select_dir, indices)
        X_sel, y_sel, w_sel, ids_sel = (select_dataset.X, select_dataset.y,
                                        select_dataset.w, select_dataset.ids)
        np.testing.assert_array_equal(X[indices], X_sel)
        np.testing.assert_array_equal(y[indices], y_sel)
        np.testing.assert_array_equal(w[indices], w_sel)
        np.testing.assert_array_equal(ids[indices], ids_sel)
        shutil.rmtree(select_dir)
Example #20
  def test_power_X_transformer(self):
    """Test Power transformer on Gaussian normal dataset."""
    gaussian_dataset = self.load_gaussian_cdf_data()
    powers = [1, 2, 0.5]
    power_transformer = PowerTransformer(transform_X=True, powers=powers)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y,
                    gaussian_dataset.w, gaussian_dataset.ids)
    power_transformer.transform(gaussian_dataset)
    gaussian_dataset = DiskDataset(data_dir=gaussian_dataset.data_dir,
                                   reload=True)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X, gaussian_dataset.y,
                            gaussian_dataset.w, gaussian_dataset.ids)

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check y is unchanged since this is an X transformer
    np.testing.assert_allclose(y, y_t)
    # Check w is unchanged since this is an X transformer
    np.testing.assert_allclose(w, w_t)
    # Check X is now holding the proper values in each column.
    np.testing.assert_allclose(X, X_t[:, :2])
    np.testing.assert_allclose(np.power(X, 2), X_t[:, 2:4])
    np.testing.assert_allclose(np.power(X, 0.5), X_t[:, 4:])
Example #21
  def test_cdf_X_transformer(self):
    """Test CDF transformer on Gaussian normal dataset."""
    target = np.array(np.transpose(np.linspace(0., 1., 1001)))
    target = np.transpose(np.array(np.append([target], [target], axis=0)))
    gaussian_dataset = self.load_gaussian_cdf_data()
    bins = 1001
    cdf_transformer = CDFTransformer(transform_X=True, bins=bins)
    X, y, w, ids = (gaussian_dataset.X, gaussian_dataset.y,
                    gaussian_dataset.w, gaussian_dataset.ids)
    cdf_transformer.transform(gaussian_dataset, bins=bins)
    gaussian_dataset = DiskDataset(data_dir=gaussian_dataset.data_dir,
                                   reload=True)
    X_t, y_t, w_t, ids_t = (gaussian_dataset.X, gaussian_dataset.y,
                            gaussian_dataset.w, gaussian_dataset.ids)

    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check y is unchanged since this is an X transformer
    np.testing.assert_allclose(y, y_t)
    # Check w is unchanged since this is an X transformer
    np.testing.assert_allclose(w, w_t)
    # Check X is now holding the proper values when sorted.
    X_sorted = np.sort(X_t, axis=0)
    np.testing.assert_allclose(X_sorted, target)
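The `target` array above encodes the expected behavior: each feature column ends up holding its empirical CDF values, so a sorted transformed column is an evenly spaced grid on [0, 1]. A minimal standalone numpy sketch of that idea (not the library's `get_cdf_values`):

import numpy as np

# Replace each column by its normalized rank; with 1001 samples, sorting a
# transformed column yields exactly linspace(0, 1, 1001).
rng = np.random.RandomState(0)
X = rng.normal(size=(1001, 2))

ranks = np.argsort(np.argsort(X, axis=0), axis=0)
X_cdf = ranks / (X.shape[0] - 1.0)

expected = np.tile(np.linspace(0., 1., 1001)[:, None], (1, 2))
np.testing.assert_allclose(np.sort(X_cdf, axis=0), expected)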
Example #22
    def test_merge(self):
        """Test that datasets can be merged."""
        verbosity = "high"
        current_dir = os.path.dirname(os.path.realpath(__file__))
        first_data_dir = os.path.join(self.base_dir, "first_dataset")
        second_data_dir = os.path.join(self.base_dir, "second_dataset")
        merged_data_dir = os.path.join(self.base_dir, "merged_data")

        dataset_file = os.path.join(current_dir,
                                    "../../models/tests/example.csv")

        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        loader = DataLoader(tasks=tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        first_dataset = loader.featurize(dataset_file, first_data_dir)
        second_dataset = loader.featurize(dataset_file, second_data_dir)

        merged_dataset = DiskDataset.merge(merged_data_dir,
                                           [first_dataset, second_dataset])

        assert len(merged_dataset) == len(first_dataset) + len(second_dataset)
Example #23
    def test_to_singletask(self):
        """Test that to_singletask works."""
        num_datapoints = 100
        num_features = 10
        num_tasks = 10
        # Generate data
        X = np.random.rand(num_datapoints, num_features)
        y = np.random.randint(2, size=(num_datapoints, num_tasks))
        w = np.random.randint(2, size=(num_datapoints, num_tasks))
        ids = np.array(["id"] * num_datapoints)

        dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids)

        task_dirs = []
        try:
            for task in range(num_tasks):
                task_dirs.append(tempfile.mkdtemp())
            singletask_datasets = SingletaskToMultitask._to_singletask(
                dataset, task_dirs)
            for task in range(num_tasks):
                singletask_dataset = singletask_datasets[task]
                X_task, y_task, w_task, ids_task = (singletask_dataset.X,
                                                    singletask_dataset.y,
                                                    singletask_dataset.w,
                                                    singletask_dataset.ids)
                w_nonzero = w[:, task] != 0
                np.testing.assert_array_equal(X_task, X[w_nonzero != 0])
                np.testing.assert_array_equal(y_task.flatten(),
                                              y[:, task][w_nonzero != 0])
                np.testing.assert_array_equal(w_task.flatten(),
                                              w[:, task][w_nonzero != 0])
                np.testing.assert_array_equal(ids_task, ids[w_nonzero != 0])
        finally:
            # Cleanup
            for task_dir in task_dirs:
                shutil.rmtree(task_dir)
Example #24
    def featurize(self,
                  input_files,
                  data_dir,
                  shard_size=8192,
                  num_shards_per_batch=24,
                  worker_pool=None,
                  logging=True,
                  debug=False):
        """Featurize provided files and write to specified location."""
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        log("Loading raw samples now.", self.verbosity)
        log("shard_size: %d" % shard_size, self.verbosity)
        log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

        # Allow users to specify a single file for featurization
        if not isinstance(input_files, list):
            input_files = [input_files]

        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        # Construct partial function to write datasets.
        if not len(input_files):
            return None
        input_type = get_input_type(input_files[0])

        if logging:
            mp.log_to_stderr()
        if worker_pool is None:
            if logging:
                worker_pool = LoggingPool(processes=1)
            else:
                worker_pool = mp.Pool(processes=1)
        log("Spawning workers now.", self.verbosity)
        metadata_rows = []

        def wrap_with_shard_metadata(iterator):
            for item in iterator:
                yield ((self, shard_size, input_type, data_dir), item)

        data_iterator = wrap_with_shard_metadata(
            enumerate(load_data(input_files, shard_size, self.verbosity)))
        # Python's map() would exhaust the generator all at once, so instead we
        # manually pull num_shards_per_batch elements from the iterator and map
        # over only those elements per batch.
        num_batches = 0
        ############################################################## TIMING
        time2 = time.time()
        log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1))
        ############################################################## TIMING
        while True:
            log("About to start processing next batch of shards",
                self.verbosity)
            ############################################################## TIMING
            time1 = time.time()
            ############################################################## TIMING
            iterator = itertools.islice(data_iterator, num_shards_per_batch)
            if not debug:
                batch_metadata = worker_pool.map(featurize_map_function,
                                                 iterator)
            else:
                batch_metadata = []
                for elt in iterator:
                    batch_metadata.append(featurize_map_function(elt))
            ############################################################## TIMING
            time2 = time.time()
            log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
                self.verbosity)
            ############################################################## TIMING
            if batch_metadata:
                metadata_rows.extend(
                    [elt for elt in batch_metadata if elt is not None])
                num_batches += 1
                log(
                    "Featurized %d datapoints\n" %
                    (shard_size * num_shards_per_batch * num_batches),
                    self.verbosity)
            else:
                break
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING

        # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
        # creating a Dataset. Is there a more elegant solution?
        dataset = DiskDataset(data_dir=data_dir,
                              metadata_rows=metadata_rows,
                              reload=True,
                              verbosity=self.verbosity)
        ############################################################## TIMING
        time2 = time.time()
        print("TIMING: dataset construction took %0.3f s" % (time2 - time1),
              self.verbosity)
        ############################################################## TIMING
        return dataset
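The batching trick above (pulling `num_shards_per_batch` items with `itertools.islice` so the pool never exhausts the whole generator) can be shown in isolation. A minimal sketch with purely illustrative names:

import itertools
import multiprocessing as mp


def square(x):
    return x * x


def batched_pool_map(iterable, func, batch_size=4, processes=2):
    """Pull batch_size items at a time with islice and map a pool over each batch."""
    iterator = iter(iterable)
    results = []
    with mp.Pool(processes=processes) as pool:
        while True:
            batch = list(itertools.islice(iterator, batch_size))
            if not batch:
                break
            results.extend(pool.map(func, batch))
    return results


if __name__ == "__main__":
    print(batched_pool_map(range(10), square))  # [0, 1, 4, ..., 81]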
Example #25
def load_tox21(base_dir, reload=True, num_train=7200):
    """Load Tox21 datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train")
    valid_dir = os.path.join(base_dir, "valid")

    # Load Tox21 dataset
    print("About to load Tox21 dataset.")
    dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize Tox21 dataset
    print("About to featurize Tox21 dataset.")
    featurizer = CircularFingerprint(size=1024)
    tox21_tasks = [
        'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
        'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
    ]

    if not reload or not os.path.exists(data_dir):
        loader = DataLoader(tasks=tox21_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
    else:
        dataset = DiskDataset(data_dir, tox21_tasks, reload=True)

    # Initialize transformers
    transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
    if not reload:
        print("About to transform data")
        for transformer in transformers:
            transformer.transform(dataset)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    X_train, X_valid = X[:num_train], X[num_train:]
    y_train, y_valid = y[:num_train], y[num_train:]
    w_train, w_valid = w[:num_train], w[num_train:]
    ids_train, ids_valid = ids[:num_train], ids[num_train:]

    train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                           w_train, ids_train, tox21_tasks)
    valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                           w_valid, ids_valid, tox21_tasks)

    return tox21_tasks, (train_dataset, valid_dataset), transformers
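A hedged usage sketch of the loader above; the path is illustrative and `load_tox21` is the function just defined.

tox21_tasks, (train_dataset, valid_dataset), transformers = load_tox21(
    "/tmp/tox21_analysis")  # illustrative base_dir

print("Loaded %d Tox21 tasks" % len(tox21_tasks))
print("Train size: %d, valid size: %d" % (len(train_dataset),
                                          len(valid_dataset)))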
Example #26
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
    """Load PDBBind datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")

    # Load PDBBind dataset
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
    pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
    tasks = ["-logKd/Ki"]
    print("About to load contents.")
    contents_df = load_pdbbind_labels(labels_file)
    ids = contents_df["PDB code"].values
    y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

    # Define featurizers
    grid_featurizer = GridFeaturizer(
        voxel_width=16.0,
        feature_types="voxel_combined",
        # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
        # causes segfaults.
        #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
        #"salt_bridge"], ecfp_power=9, splif_power=9,
        voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
        ecfp_power=9,
        splif_power=9,
        parallel=True,
        flatten=True,
        verbosity=verbosity)
    compound_featurizers = [CircularFingerprint(size=1024)]
    complex_featurizers = [grid_featurizer]

    # Featurize Dataset
    features = []
    feature_len = None
    y_inds = []
    for ind, pdb_code in enumerate(ids):
        print("Processing %s" % str(pdb_code))
        pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
        computed_feature = compute_pdbbind_grid_feature(
            compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
        if feature_len is None:
            feature_len = len(computed_feature)
        if len(computed_feature) != feature_len:
            print("Featurization failed for %s!" % pdb_code)
            continue
        y_inds.append(ind)
        features.append(computed_feature)
    y = y[y_inds]
    # Keep ids aligned with the successfully featurized entries.
    ids = ids[y_inds]
    X = np.vstack(features)
    w = np.ones_like(y)

    dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
    transformers = []

    return tasks, dataset, transformers
Example #27
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load sweet datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(
      current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()

  loader = DataLoader(tasks=SWEET_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
        dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, SWEET_tasks)
  
  return SWEET_tasks, (train_dataset, valid_dataset), transformers
Example #28
    def test_multiload(self):
        """Check can re-use featurization for multiple task selections.

    TODO(rbharath): This test seems silly after the recent round of
                    refactoring. Can it be removed?
    """
        # Only for debug!
        np.random.seed(123)

        # Set some global variables up top
        reload = True
        verbosity = "high"

        current_dir = os.path.dirname(os.path.realpath(__file__))
        #Make directories to store the raw and featurized datasets.
        data_dir = os.path.join(self.base_dir, "dataset")
        train_dir = os.path.join(self.base_dir, "train_dataset")
        valid_dir = os.path.join(self.base_dir, "valid_dataset")
        test_dir = os.path.join(self.base_dir, "test_dataset")
        model_dir = os.path.join(self.base_dir, "model")

        # Load dataset
        print("About to load dataset.")
        dataset_file = os.path.join(
            current_dir, "../../models/tests/multitask_example.csv")
        dataset = load_from_disk(dataset_file)
        print("Columns of dataset: %s" % str(dataset.columns.values))
        print("Number of examples in dataset: %s" % str(dataset.shape[0]))

        # Featurize tox21 dataset
        print("About to featurize dataset.")
        featurizer = CircularFingerprint(size=1024)
        all_tasks = ["task%d" % i for i in range(17)]

        ####### Do featurization
        loader = DataLoader(tasks=all_tasks,
                            smiles_field="smiles",
                            featurizer=featurizer,
                            verbosity=verbosity)
        dataset = loader.featurize(dataset_file, data_dir)

        # Do train/valid split.
        X_multi, y_multi, w_multi, ids_multi = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)

        ####### Do singletask load
        y_tasks, w_tasks = [], []
        for ind, task in enumerate(all_tasks):
            print("Processing task %s" % task)
            dataset = DiskDataset(data_dir, verbosity=verbosity, reload=reload)

            X_task, y_task, w_task, ids_task = (dataset.X, dataset.y,
                                                dataset.w, dataset.ids)
            y_tasks.append(y_task[:, ind])
            w_tasks.append(w_task[:, ind])

        ################## Do comparison
        for ind, task in enumerate(all_tasks):
            y_multi_task = y_multi[:, ind]
            w_multi_task = w_multi[:, ind]

            y_task = y_tasks[ind]
            w_task = w_tasks[ind]

            np.testing.assert_allclose(y_multi_task.flatten(),
                                       y_task.flatten())
            np.testing.assert_allclose(w_multi_task.flatten(),
                                       w_task.flatten())
Example #29
# REPLACE WITH DOWNLOADED PDBBIND EXAMPLE
pdbbind_dir = "/tmp/deep-docking/datasets/pdbbind"
pdbbind_tasks, dataset, transformers = load_core_pdbbind_grid(
    pdbbind_dir, base_dir)

print("About to perform train/valid/test split.")
num_train = int(.8 * len(dataset))
X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)

X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]

train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                   w_train, ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                   w_valid, ids_valid, pdbbind_tasks)

classification_metric = Metric(metrics.pearson_r2_score, verbosity=verbosity,
                               mode="regression")

n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[.1],
    batch_size=64, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
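A hedged follow-up sketch that mirrors the Evaluator pattern from the sklearn examples earlier in this collection; it assumes the `valid_dataset`, `transformers`, `classification_metric`, and `verbosity` already defined in this script.

valid_evaluator = Evaluator(model, valid_dataset, transformers,
                            verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance(
    [classification_metric])
print("Validation scores: %s" % str(valid_scores))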
Example #30
def load_pcba(base_dir, reload=True, frac_train=.8):
  """Load PCBA datasets. Does not do train/test split"""
  # Set some global variables up top
  reload = True
  verbosity = "high"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  #Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(
      current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  PCBA_tasks = [
      'PCBA-1030','PCBA-1379','PCBA-1452','PCBA-1454','PCBA-1457',
      'PCBA-1458','PCBA-1460','PCBA-1461','PCBA-1468','PCBA-1469',
      'PCBA-1471','PCBA-1479','PCBA-1631','PCBA-1634','PCBA-1688',
      'PCBA-1721','PCBA-2100','PCBA-2101','PCBA-2147','PCBA-2242',
      'PCBA-2326','PCBA-2451','PCBA-2517','PCBA-2528','PCBA-2546',
      'PCBA-2549','PCBA-2551','PCBA-2662','PCBA-2675','PCBA-2676',
      'PCBA-411','PCBA-463254','PCBA-485281','PCBA-485290','PCBA-485294',
      'PCBA-485297','PCBA-485313','PCBA-485314','PCBA-485341','PCBA-485349',
      'PCBA-485353','PCBA-485360','PCBA-485364','PCBA-485367','PCBA-492947',
      'PCBA-493208','PCBA-504327','PCBA-504332','PCBA-504333','PCBA-504339',
      'PCBA-504444','PCBA-504466','PCBA-504467','PCBA-504706','PCBA-504842',
      'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  loader = DataLoader(tasks=PCBA_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers 
  transformers = [
      BalancingTransformer(transform_w=True, dataset=dataset)]

  if regen:
    print("About to transform data")
    for transformer in transformers:
        transformer.transform(dataset)

  print("About to perform train/valid/test split.")
  num_train = int(frac_train * len(dataset))
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 120
  PCBA_tasks = PCBA_tasks[:num_tasks]
  print("Using following tasks")
  print(PCBA_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]

  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                     w_train, ids_train, PCBA_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, PCBA_tasks)

  
  return PCBA_tasks, (train_dataset, valid_dataset), transformers
Example #31
def load_sweet(base_dir, reload=True, frac_train=.8):
    """Load sweet datasets. Does not do train/test split"""
    # Set some global variables up top
    reload = True
    verbosity = "high"
    model = "logistic"
    regen = False

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    #Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")

    # Load SWEET dataset
    print("About to load SWEET dataset.")
    dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
    dataset = load_from_disk(dataset_file)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize SWEET dataset
    print("About to featurize SWEET dataset.")
    featurizer = CircularFingerprint(size=1024)
    SWEET_tasks = dataset.columns.values[1:].tolist()

    loader = DataLoader(tasks=SWEET_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_file, data_dir)
        regen = True
    else:
        dataset = DiskDataset(data_dir, reload=True)

    # Initialize transformers
    transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
    if regen:
        print("About to transform data")
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
    num_tasks = 17
    num_train = int(frac_train * len(dataset))
    SWEET_tasks = SWEET_tasks[:num_tasks]
    print("Using following tasks")
    print(SWEET_tasks)
    X_train, X_valid = X[:num_train], X[num_train:]
    y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
    w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
    ids_train, ids_valid = ids[:num_train], ids[num_train:]

    train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                           w_train, ids_train, SWEET_tasks)
    valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                           w_valid, ids_valid, SWEET_tasks)

    return SWEET_tasks, (train_dataset, valid_dataset), transformers