Example #1
 def test_X_normalization_transformer(self):
   """Tests normalization transformer."""
   solubility_dataset = self.load_solubility_data()
   normalization_transformer = NormalizationTransformer(
       transform_X=True, dataset=solubility_dataset)
   X, y, w, ids = solubility_dataset.to_numpy()
   normalization_transformer.transform(solubility_dataset)
   X_t, y_t, w_t, ids_t = solubility_dataset.to_numpy()
   # Check ids are unchanged.
   for id_elt, id_t_elt in zip(ids, ids_t):
     assert id_elt == id_t_elt
   # Check y is unchanged since this is an X transformer
   np.testing.assert_allclose(y, y_t)
   # Check w is unchanged since this is an X transformer
   np.testing.assert_allclose(w, w_t)
   # Check that X_t has zero mean, unit std.
   #np.set_printoptions(threshold='nan')
   mean = X_t.mean(axis=0)
   assert np.amax(np.abs(mean-np.zeros_like(mean))) < 1e-7
   orig_std_array = X.std(axis=0)
   std_array = X_t.std(axis=0)
   # Entries with zero std are not normalized
   for orig_std, std in zip(orig_std_array, std_array):
     if not np.isclose(orig_std, 0):
       assert np.isclose(std, 1)
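For reference, here is a minimal NumPy sketch of the per-feature z-scaling that the X-normalization tests in Examples #1 and #2 check. This is only the arithmetic implied by the assertions, not the DeepChem implementation: each column is shifted to zero mean and scaled to unit standard deviation, while zero-variance columns are only centered.

import numpy as np

def zscore_columns(X):
  """Column-wise z-scaling; zero-std columns are centered but not scaled."""
  mean = X.mean(axis=0)
  std = X.std(axis=0)
  safe_std = np.where(np.isclose(std, 0), 1.0, std)  # avoid division by zero
  return (X - mean) / safe_std

X = np.array([[1.0, 5.0, 2.0],
              [3.0, 5.0, 4.0],
              [5.0, 5.0, 9.0]])
X_t = zscore_columns(X)
assert np.allclose(X_t.mean(axis=0), 0.0)         # zero mean in every column
assert np.allclose(X_t.std(axis=0)[[0, 2]], 1.0)  # unit std for non-constant columns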
Example #2
 def test_X_normalization_transformer(self):
   """Tests normalization transformer."""
   solubility_dataset = self.load_solubility_data()
   normalization_transformer = NormalizationTransformer(
       transform_X=True, dataset=solubility_dataset)
   X, y, w, ids = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
   normalization_transformer.transform(solubility_dataset)
   X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
   # Check ids are unchanged.
   for id_elt, id_t_elt in zip(ids, ids_t):
     assert id_elt == id_t_elt
    # Check y is unchanged since this is an X transformer
   np.testing.assert_allclose(y, y_t)
    # Check w is unchanged since this is an X transformer
   np.testing.assert_allclose(w, w_t)
   # Check that X_t has zero mean, unit std.
   #np.set_printoptions(threshold='nan')
   mean = X_t.mean(axis=0)
   assert np.amax(np.abs(mean-np.zeros_like(mean))) < 1e-7
   orig_std_array = X.std(axis=0)
   std_array = X_t.std(axis=0)
   # Entries with zero std are not normalized
   for orig_std, std in zip(orig_std_array, std_array):
     if not np.isclose(orig_std, 0):
       assert np.isclose(std, 1)
Example #3
    def test_sklearn_transformed_regression(self):
        """Test that sklearn models can learn on simple transformed regression datasets."""
        np.random.seed(123)
        dataset = sklearn.datasets.load_diabetes()
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)
        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]
        train_dataset = DiskDataset.from_numpy(self.train_dir, X_train,
                                               y_train)
        test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

        # Transform train and test with transformers fit on the training data
        transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset),
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for data in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(data)

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        sklearn_model = LinearRegression()
        model = SklearnModel(sklearn_model, self.model_dir)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        assert train_scores[regression_metric.name] > .5

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        assert scores[regression_metric.name] > .5
Example #4
  def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
    """Test of singletask RF RDKIT-descriptor regression API."""
    splittype = "scaffold"
    featurizer = RDKitDescriptors()
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)]
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    regression_metrics = [Metric(metrics.r2_score),
                          Metric(metrics.mean_squared_error),
                          Metric(metrics.mean_absolute_error)]

    sklearn_model = RandomForestRegressor()
    model = SklearnModel(sklearn_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(regression_metrics)
Example #5
    def test_singletask_sklearn_rf_ECFP_regression_API(self):
        """Test of singletask RF ECFP regression API."""
        splittype = "scaffold"
        featurizer = CircularFingerprint(size=1024)
        model_params = {}
        tasks = ["log-solubility"]
        task_type = "regression"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers
        model_params["data_shape"] = train_dataset.get_data_shape()
        regression_metrics = [
            Metric(metrics.r2_score),
            Metric(metrics.mean_squared_error),
            Metric(metrics.mean_absolute_error)
        ]

        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=RandomForestRegressor())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(regression_metrics)
Example #6
  def test_y_normalization_transformer(self):
    """Tests normalization transformer."""
    solubility_dataset = self.load_solubility_data()
    normalization_transformer = NormalizationTransformer(
        transform_y=True, dataset=solubility_dataset)
    X, y, w, ids = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
    normalization_transformer.transform(solubility_dataset)
    X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y, solubility_dataset.w, solubility_dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a y transformer
    np.testing.assert_allclose(X, X_t)
    # Check w is unchanged since this is a y transformer
    np.testing.assert_allclose(w, w_t)
    # Check that y_t has zero mean, unit std.
    assert np.isclose(y_t.mean(), 0.)
    assert np.isclose(y_t.std(), 1.)

    # Check that untransform does the right thing.
    np.testing.assert_allclose(normalization_transformer.untransform(y_t), y)
Example #7
  def test_y_normalization_transformer(self):
    """Tests normalization transformer."""
    solubility_dataset = self.load_solubility_data()
    normalization_transformer = NormalizationTransformer(
        transform_y=True, dataset=solubility_dataset)
    X, y, w, ids = solubility_dataset.to_numpy()
    normalization_transformer.transform(solubility_dataset)
    X_t, y_t, w_t, ids_t = solubility_dataset.to_numpy()
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
      assert id_elt == id_t_elt
    # Check X is unchanged since this is a y transformer
    np.testing.assert_allclose(X, X_t)
    # Check w is unchanged since this is a y transformer
    np.testing.assert_allclose(w, w_t)
    # Check that y_t has zero mean, unit std.
    assert np.isclose(y_t.mean(), 0.)
    assert np.isclose(y_t.std(), 1.)

    # Check that untransform does the right thing.
    np.testing.assert_allclose(normalization_transformer.untransform(y_t), y)
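The untransform check in Examples #6 and #7 is just the inverse of the same z-scaling. Below is a minimal sketch of that round trip (again only the arithmetic, assuming the transformer stores the mean and std it was fit with, not the DeepChem internals):

import numpy as np

y = np.array([0.5, 1.5, 3.0, 7.0])
mean, std = y.mean(), y.std()

y_t = (y - mean) / std      # transform: zero mean, unit std
y_back = y_t * std + mean   # untransform: recover the original targets

assert np.isclose(y_t.mean(), 0.0) and np.isclose(y_t.std(), 1.0)
np.testing.assert_allclose(y_back, y)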
Example #8
  def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)

    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")

    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)
    
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]

    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)

    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    tensorflow_model = TensorflowMultiTaskClassifier(
        len(tasks), n_features, self.model_dir)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
Example #9
    def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
        """Test of hyperparam_opt with singletask RF ECFP regression API."""
        featurizer = CircularFingerprint(size=1024)
        tasks = ["log-solubility"]
        input_file = os.path.join(self.current_dir, "example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, self.train_dir, self.valid_dir, self.test_dir)

        transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        for dataset in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        params_dict = {"n_estimators": [10, 100]}
        metric = Metric(metrics.r2_score)

        def rf_model_builder(model_params, model_dir):
            sklearn_model = RandomForestRegressor(**model_params)
            return SklearnModel(sklearn_model, model_dir)

        optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
        best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
            params_dict,
            train_dataset,
            valid_dataset,
            transformers,
            metric,
            logdir=None)
Example #10
  def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
    """Test of hyperparam_opt with singletask RF ECFP regression API."""
    splittype = "scaffold"
    featurizer = CircularFingerprint(size=1024)
    tasks = ["log-solubility"]
    task_type = "regression"
    task_types = {task: task_type for task in tasks}
    input_file = os.path.join(self.current_dir, "example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    splitter = ScaffoldSplitter()
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, self.train_dir, self.valid_dir, self.test_dir)

    input_transformers = []
    output_transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    transformers = input_transformers + output_transformers
    for dataset in [train_dataset, test_dataset]:
      for transformer in transformers:
        transformer.transform(dataset)
    params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
      "data_shape": train_dataset.get_data_shape()
    }
    metric = Metric(metrics.r2_score)

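    # NOTE: rf_model_builder is not defined in this snippet; it is assumed to be
    # a model-builder callable like the one defined in Example #9 above.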
    optimizer = HyperparamOpt(rf_model_builder, tasks, task_types, verbosity="low")
    best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      metric, logdir=None)
Example #11
    def test_sklearn_transformed_regression(self):
        """Test that sklearn models can learn on simple transformed regression datasets."""
        np.random.seed(123)
        dataset = sklearn.datasets.load_diabetes()
        X, y = dataset.data, dataset.target

        frac_train = .7
        n_samples = len(X)

        n_train = int(frac_train * n_samples)
        X_train, y_train = X[:n_train], y[:n_train]
        X_test, y_test = X[n_train:], y[n_train:]

        train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
        test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

        # Transform train and test with transformers fit on the training data
        input_transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset)
        ]
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers
        for transformer in transformers:
            transformer.transform(train_dataset)
        for transformer in transformers:
            transformer.transform(test_dataset)

        tasks = train_dataset.get_task_names()
        task_types = {task: "regression" for task in tasks}

        model_params = {
            "batch_size": None,
            "data_shape": train_dataset.get_data_shape()
        }

        verbosity = "high"
        regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
        model = SklearnModel(tasks,
                             task_types,
                             model_params,
                             self.model_dir,
                             mode="regression",
                             model_instance=LinearRegression())

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        train_evaluator = Evaluator(model,
                                    train_dataset,
                                    transformers,
                                    verbosity=verbosity)
        train_scores = train_evaluator.compute_model_performance(
            [regression_metric])
        print("train_scores")
        print(train_scores)

        assert train_scores[regression_metric.name] > .5

        # Eval model on test
        transformers = []
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([regression_metric])
        print("scores")
        print(scores)

        assert scores[regression_metric.name] > .5
Example #12
def load_nci(base_dir,
             reload=True,
             force_transform=False,
             shard_size=1000,
             num_shards_per_batch=4):
    """Load NCI datasets. Does not do train/test split"""
    # Set some global variables up top
    verbosity = "high"
    model = "logistic"
    regen = False

    # Create some directories for analysis
    # The base_dir holds the results of all analysis
    if not reload:
        if os.path.exists(base_dir):
            print("Deleting dir in nci_datasets.py")
            print(base_dir)
            shutil.rmtree(base_dir)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    current_dir = os.path.dirname(os.path.realpath(__file__))
    # Make directories to store the raw and featurized datasets.
    data_dir = os.path.join(base_dir, "dataset")

    # Load nci dataset
    print("About to load NCI dataset.")
    dataset_file1_path = os.path.join(current_dir,
                                      "../../datasets/nci_1.csv.gz")
    dataset_file2_path = os.path.join(current_dir,
                                      "../../datasets/nci_2.csv.gz")
    dataset_paths = [dataset_file1_path, dataset_file2_path]
    dataset = load_sharded_csv(dataset_paths)
    print("Columns of dataset: %s" % str(dataset.columns.values))
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))

    # Featurize nci dataset
    print("About to featurize nci dataset.")
    featurizer = CircularFingerprint(size=1024)
    # This was originally a sorted list (as in muv_datasets.py), but the CSV is
    # already ordered, so the sort was removed.
    all_nci_tasks = ([
        'CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226', 'SR',
        'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226', 'NCI-H23',
        'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205', 'HCC-2998', 'HCT-116',
        'HCT-15', 'HT29', 'KM12', 'SW-620', 'SF-268', 'SF-295', 'SF-539',
        'SNB-19', 'SNB-75', 'U251', 'LOX IMVI', 'MALME-3M', 'M14',
        'MDA-MB-435', 'SK-MEL-2', 'SK-MEL-28', 'SK-MEL-5', 'UACC-257',
        'UACC-62', 'IGR-OV1', 'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8',
        'NCI/ADR-RES', 'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
        'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7', 'MDA-MB-231/ATCC',
        'MDA-MB-468', 'HS 578T', 'BT-549', 'T-47D'
    ])

    loader = DataLoader(tasks=all_nci_tasks,
                        smiles_field="smiles",
                        featurizer=featurizer,
                        verbosity=verbosity)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_paths,
                                   data_dir,
                                   shard_size=shard_size,
                                   num_shards_per_batch=num_shards_per_batch)
        regen = True
    else:
        dataset = Dataset(data_dir, reload=True)

    # Initialize transformers
    transformers = []
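    # Only fit and apply the y-normalization when the featurized dataset was just
    # (re)generated or the caller forces it; a dataset reloaded from disk is
    # presumed to have been transformed already, so it is returned as-is.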
    if regen or force_transform:
        print("About to transform data")
        transformers = [
            NormalizationTransformer(transform_y=True, dataset=dataset)
        ]
        for transformer in transformers:
            transformer.transform(dataset)

    return all_nci_tasks, dataset, transformers
Example #13
    def test_singletask_tf_mlp_ECFP_classification_API(self):
        """Straightforward test of Tensorflow singletask deepchem classification API."""
        splittype = "scaffold"

        featurizer = CircularFingerprint(size=1024)

        tasks = ["outcome"]
        task_type = "classification"
        task_types = {task: task_type for task in tasks}
        input_file = os.path.join(self.current_dir,
                                  "example_classification.csv")

        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=featurizer,
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)

        splitter = ScaffoldSplitter()
        train_dataset, test_dataset = splitter.train_test_split(
            dataset, self.train_dir, self.test_dir)

        input_transformers = []
        output_transformers = [
            NormalizationTransformer(transform_y=True, dataset=train_dataset)
        ]
        transformers = input_transformers + output_transformers

        for dataset in [train_dataset, test_dataset]:
            for transformer in transformers:
                transformer.transform(dataset)

        model_params = {
            "batch_size": 2,
            "num_classification_tasks": 1,
            "num_features": 1024,
            "layer_sizes": [1024],
            "weight_init_stddevs": [1.],
            "bias_init_consts": [0.],
            "dropouts": [.5],
            "num_classes": 2,
            "nb_epoch": 1,
            "penalty": 0.0,
            "optimizer": "adam",
            "learning_rate": .001,
            "data_shape": train_dataset.get_data_shape()
        }
        classification_metrics = [
            Metric(metrics.roc_auc_score),
            Metric(metrics.matthews_corrcoef),
            Metric(metrics.recall_score),
            Metric(metrics.accuracy_score)
        ]

        model = TensorflowModel(tasks,
                                task_types,
                                model_params,
                                self.model_dir,
                                tf_class=TensorflowMultiTaskClassifier)

        # Fit trained model
        model.fit(train_dataset)
        model.save()

        # Eval model on train
        evaluator = Evaluator(model,
                              train_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)

        # Eval model on test
        evaluator = Evaluator(model,
                              test_dataset,
                              transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
Example #14
def load_bace(mode="regression", transform=True, split="20-80"):
    """Load BACE-1 dataset as regression/classification problem."""
    reload = True
    verbosity = "high"
    regen = False
    assert split in ["20-80", "80-20"]

    current_dir = os.path.dirname(os.path.realpath(__file__))
    if split == "20-80":
        dataset_file = os.path.join(current_dir,
                                    "../../datasets/desc_canvas_aug30.csv")
    elif split == "80-20":
        dataset_file = os.path.join(current_dir,
                                    "../../datasets/rev8020split_desc.csv")
    dataset = load_from_disk(dataset_file)
    num_display = 10
    pretty_columns = ("[" + ",".join(
        ["'%s'" % column
         for column in dataset.columns.values[:num_display]]) + ",...]")

    crystal_dataset_file = os.path.join(
        current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
    crystal_dataset = load_from_disk(crystal_dataset_file)

    print("Columns of dataset: %s" % pretty_columns)
    print("Number of examples in dataset: %s" % str(dataset.shape[0]))
    print("Number of examples in crystal dataset: %s" %
          str(crystal_dataset.shape[0]))

    # Make directories to store the raw and featurized datasets.
    base_dir = tempfile.mkdtemp()
    data_dir = os.path.join(base_dir, "dataset")
    train_dir = os.path.join(base_dir, "train_dataset")
    valid_dir = os.path.join(base_dir, "valid_dataset")
    test_dir = os.path.join(base_dir, "test_dataset")
    model_dir = os.path.join(base_dir, "model")
    crystal_dir = os.path.join(base_dir, "crystal")

    if mode == "regression":
        bace_tasks = ["pIC50"]
    elif mode == "classification":
        bace_tasks = ["Class"]
    else:
        raise ValueError("Unknown mode %s" % mode)
    featurizer = UserDefinedFeaturizer(user_specified_features)
    loader = DataLoader(tasks=bace_tasks,
                        smiles_field="mol",
                        id_field="CID",
                        featurizer=featurizer)
    if not reload or not os.path.exists(data_dir):
        dataset = loader.featurize(dataset_file, data_dir)
        regen = True
    else:
        dataset = Dataset(data_dir, reload=True)
    if not reload or not os.path.exists(crystal_dir):
        crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
    else:
        crystal_dataset = Dataset(crystal_dir, reload=True)

    if (not reload or not os.path.exists(train_dir)
            or not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
        regen = True
        splitter = SpecifiedSplitter(dataset_file,
                                     "Model",
                                     verbosity=verbosity)
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset, train_dir, valid_dir, test_dir)
    else:
        train_dataset = Dataset(train_dir, reload=True)
        valid_dataset = Dataset(valid_dir, reload=True)
        test_dataset = Dataset(test_dir, reload=True)

    # NOTE THE RENAMING: for the 20-80 split, the validation and test sets are swapped.
    if split == "20-80":
        valid_dataset, test_dataset = test_dataset, valid_dataset
    print("Number of compounds in train set")
    print(len(train_dataset))
    print("Number of compounds in validation set")
    print(len(valid_dataset))
    print("Number of compounds in test set")
    print(len(test_dataset))
    print("Number of compounds in crystal set")
    print(len(crystal_dataset))

    if transform and regen:
        input_transformers = [
            NormalizationTransformer(transform_X=True, dataset=train_dataset),
            ClippingTransformer(transform_X=True, dataset=train_dataset)
        ]
        output_transformers = []
        if mode == "regression":
            output_transformers = [
                NormalizationTransformer(transform_y=True,
                                         dataset=train_dataset)
            ]
        else:
            output_transformers = []
    else:
        input_transformers, output_transformers = [], []

    transformers = input_transformers + output_transformers
    for dataset in [
            train_dataset, valid_dataset, test_dataset, crystal_dataset
    ]:
        for transformer in transformers:
            transformer.transform(dataset)

    return (bace_tasks, train_dataset, valid_dataset, test_dataset,
            crystal_dataset, output_transformers)
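A hypothetical call site, only to illustrate how the tuple returned above is unpacked (the argument values here are assumptions, not a prescribed configuration):

bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset, output_transformers = (
    load_bace(mode="regression", transform=True, split="20-80"))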