def test_X_normalization_transformer(self):
  """Tests normalization transformer."""
  solubility_dataset = self.load_solubility_data()
  normalization_transformer = NormalizationTransformer(
      transform_X=True, dataset=solubility_dataset)
  X, y, w, ids = solubility_dataset.to_numpy()
  normalization_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = solubility_dataset.to_numpy()
  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check y is unchanged since this is an X transformer.
  np.testing.assert_allclose(y, y_t)
  # Check w is unchanged since this is an X transformer.
  np.testing.assert_allclose(w, w_t)
  # Check that X_t has zero mean, unit std.
  mean = X_t.mean(axis=0)
  assert np.amax(np.abs(mean - np.zeros_like(mean))) < 1e-7
  orig_std_array = X.std(axis=0)
  std_array = X_t.std(axis=0)
  # Entries with zero std are not normalized.
  for orig_std, std in zip(orig_std_array, std_array):
    if not np.isclose(orig_std, 0):
      assert np.isclose(std, 1)
def test_X_normalization_transformer(self):
  """Tests normalization transformer."""
  solubility_dataset = self.load_solubility_data()
  normalization_transformer = NormalizationTransformer(
      transform_X=True, dataset=solubility_dataset)
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  normalization_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                          solubility_dataset.w, solubility_dataset.ids)
  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check y is unchanged since this is an X transformer.
  np.testing.assert_allclose(y, y_t)
  # Check w is unchanged since this is an X transformer.
  np.testing.assert_allclose(w, w_t)
  # Check that X_t has zero mean, unit std.
  mean = X_t.mean(axis=0)
  assert np.amax(np.abs(mean - np.zeros_like(mean))) < 1e-7
  orig_std_array = X.std(axis=0)
  std_array = X_t.std(axis=0)
  # Entries with zero std are not normalized.
  for orig_std, std in zip(orig_std_array, std_array):
    if not np.isclose(orig_std, 0):
      assert np.isclose(std, 1)
def test_sklearn_transformed_regression(self):
  """Test that sklearn models can learn on simple transformed regression datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_diabetes()
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = DiskDataset.from_numpy(self.train_dir, X_train, y_train)
  test_dataset = DiskDataset.from_numpy(self.test_dir, X_test, y_test)

  # Transform train and test datasets.
  transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset),
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  for data in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(data)

  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  sklearn_model = LinearRegression()
  model = SklearnModel(sklearn_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [regression_metric])
  assert train_scores[regression_metric.name] > .5

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .5
def test_singletask_sklearn_rf_RDKIT_descriptor_regression_API(self):
  """Test of singletask RF RDKIT-descriptor regression API."""
  splittype = "scaffold"
  featurizer = RDKitDescriptors()
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  regression_metrics = [Metric(metrics.r2_score),
                        Metric(metrics.mean_squared_error),
                        Metric(metrics.mean_absolute_error)]

  sklearn_model = RandomForestRegressor()
  model = SklearnModel(sklearn_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_singletask_sklearn_rf_ECFP_regression_API(self):
  """Test of singletask RF ECFP regression API."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  model_params = {}
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  transformers = input_transformers + output_transformers
  model_params["data_shape"] = train_dataset.get_data_shape()
  regression_metrics = [
      Metric(metrics.r2_score),
      Metric(metrics.mean_squared_error),
      Metric(metrics.mean_absolute_error)
  ]

  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=RandomForestRegressor())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(regression_metrics)
def test_y_normalization_transformer(self):
  """Tests normalization transformer."""
  solubility_dataset = self.load_solubility_data()
  normalization_transformer = NormalizationTransformer(
      transform_y=True, dataset=solubility_dataset)
  X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
                  solubility_dataset.w, solubility_dataset.ids)
  normalization_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = (solubility_dataset.X, solubility_dataset.y,
                          solubility_dataset.w, solubility_dataset.ids)
  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check X is unchanged since this is a y transformer.
  np.testing.assert_allclose(X, X_t)
  # Check w is unchanged since this is a y transformer.
  np.testing.assert_allclose(w, w_t)
  # Check that y_t has zero mean, unit std.
  assert np.isclose(y_t.mean(), 0.)
  assert np.isclose(y_t.std(), 1.)
  # Check that untransform does the right thing.
  np.testing.assert_allclose(normalization_transformer.untransform(y_t), y)
def test_y_normalization_transformer(self):
  """Tests normalization transformer."""
  solubility_dataset = self.load_solubility_data()
  normalization_transformer = NormalizationTransformer(
      transform_y=True, dataset=solubility_dataset)
  X, y, w, ids = solubility_dataset.to_numpy()
  normalization_transformer.transform(solubility_dataset)
  X_t, y_t, w_t, ids_t = solubility_dataset.to_numpy()
  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check X is unchanged since this is a y transformer.
  np.testing.assert_allclose(X, X_t)
  # Check w is unchanged since this is a y transformer.
  np.testing.assert_allclose(w, w_t)
  # Check that y_t has zero mean, unit std.
  assert np.isclose(y_t.mean(), 0.)
  assert np.isclose(y_t.std(), 1.)
  # Check that untransform does the right thing.
  np.testing.assert_allclose(normalization_transformer.untransform(y_t), y)
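# Illustrative sketch, not part of the original tests: the assertions above
# amount to standard z-scoring of y followed by its exact inverse. The array
# and variable names below are hypothetical example data, not library API.
import numpy as np

y_example = np.array([1.0, 2.0, 3.0, 4.0])
y_mean, y_std = y_example.mean(), y_example.std()
y_scaled = (y_example - y_mean) / y_std        # zero mean, unit std
y_recovered = y_scaled * y_std + y_mean        # what untransform should return
assert np.isclose(y_scaled.mean(), 0.) and np.isclose(y_scaled.std(), 1.)
assert np.allclose(y_recovered, y_example)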
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  tasks = ["outcome"]
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  classification_metrics = [Metric(metrics.roc_auc_score),
                            Metric(metrics.matthews_corrcoef),
                            Metric(metrics.recall_score),
                            Metric(metrics.accuracy_score)]

  tensorflow_model = TensorflowMultiTaskClassifier(
      len(tasks), n_features, self.model_dir)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  params_dict = {"n_estimators": [10, 100]}
  metric = Metric(metrics.r2_score)

  def rf_model_builder(model_params, model_dir):
    sklearn_model = RandomForestRegressor(**model_params)
    return SklearnModel(sklearn_model, model_dir)

  optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
      "data_shape": train_dataset.get_data_shape()
  }
  metric = Metric(metrics.r2_score)

  optimizer = HyperparamOpt(rf_model_builder, tasks, task_types,
                            verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers, metric,
      logdir=None)
def test_sklearn_transformed_regression(self):
  """Test that sklearn models can learn on simple transformed regression datasets."""
  np.random.seed(123)
  dataset = sklearn.datasets.load_diabetes()
  X, y = dataset.data, dataset.target
  frac_train = .7
  n_samples = len(X)
  n_train = int(frac_train * n_samples)
  X_train, y_train = X[:n_train], y[:n_train]
  X_test, y_test = X[n_train:], y[n_train:]
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
  test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

  # Transform train and test datasets.
  input_transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset)
  ]
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  transformers = input_transformers + output_transformers
  for transformer in transformers:
    transformer.transform(train_dataset)
  for transformer in transformers:
    transformer.transform(test_dataset)

  tasks = train_dataset.get_task_names()
  task_types = {task: "regression" for task in tasks}
  model_params = {
      "batch_size": None,
      "data_shape": train_dataset.get_data_shape()
  }
  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = SklearnModel(tasks, task_types, model_params, self.model_dir,
                       mode="regression",
                       model_instance=LinearRegression())

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [regression_metric])
  print("train_scores")
  print(train_scores)
  assert train_scores[regression_metric.name] > .5

  # Eval model on test
  transformers = []
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  print("scores")
  print(scores)
  assert scores[regression_metric.name] > .5
def load_nci(base_dir, reload=True, force_transform=False,
             shard_size=1000, num_shards_per_batch=4):
  """Load NCI datasets. Does not do train/test split."""
  # Set some global variables up top
  verbosity = "high"
  model = "logistic"
  regen = False

  # Create some directories for analysis
  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      print("Deleting dir in nci_datasets.py")
      print(base_dir)
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file1_path = os.path.join(current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(current_dir, "../../datasets/nci_2.csv.gz")
  dataset_paths = [dataset_file1_path, dataset_file2_path]
  dataset = load_sharded_csv(dataset_paths)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize nci dataset
  print("About to featurize nci dataset.")
  featurizer = CircularFingerprint(size=1024)
  # This was a sorted list originally in muv_datasets.py, but the csv is
  # ordered, so the sort was removed.
  all_nci_tasks = ([
      'CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226', 'SR',
      'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226', 'NCI-H23',
      'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205', 'HCC-2998', 'HCT-116',
      'HCT-15', 'HT29', 'KM12', 'SW-620', 'SF-268', 'SF-295', 'SF-539',
      'SNB-19', 'SNB-75', 'U251', 'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435',
      'SK-MEL-2', 'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
      'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES', 'SK-OV-3',
      '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393', 'SN12C', 'TK-10', 'UO-31',
      'PC-3', 'DU-145', 'MCF7', 'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T',
      'BT-549', 'T-47D'
  ])

  loader = DataLoader(tasks=all_nci_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_paths, data_dir,
                               shard_size=shard_size,
                               num_shards_per_batch=num_shards_per_batch)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)

  # Initialize transformers
  transformers = []
  if regen or force_transform:
    print("About to transform data")
    transformers = [
        NormalizationTransformer(transform_y=True, dataset=dataset)
    ]
    for transformer in transformers:
      transformer.transform(dataset)

  return all_nci_tasks, dataset, transformers
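# Hypothetical usage sketch (not part of the original module): load_nci builds
# the featurized multitask NCI dataset and its y-normalization transformers in
# one call; the temporary scratch directory below is an assumption.
import tempfile

nci_base_dir = tempfile.mkdtemp()
nci_tasks, nci_dataset, nci_transformers = load_nci(
    nci_base_dir, reload=False, shard_size=1000)
print("Loaded %d NCI tasks" % len(nci_tasks))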
def test_singletask_tf_mlp_ECFP_classification_API(self):
  """Straightforward test of Tensorflow singletask deepchem classification API."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  tasks = ["outcome"]
  task_type = "classification"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example_classification.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, test_dataset = splitter.train_test_split(
      dataset, self.train_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  model_params = {
      "batch_size": 2,
      "num_classification_tasks": 1,
      "num_features": 1024,
      "layer_sizes": [1024],
      "weight_init_stddevs": [1.],
      "bias_init_consts": [0.],
      "dropouts": [.5],
      "num_classes": 2,
      "nb_epoch": 1,
      "penalty": 0.0,
      "optimizer": "adam",
      "learning_rate": .001,
      "data_shape": train_dataset.get_data_shape()
  }

  classification_metrics = [
      Metric(metrics.roc_auc_score),
      Metric(metrics.matthews_corrcoef),
      Metric(metrics.recall_score),
      Metric(metrics.accuracy_score)
  ]

  model = TensorflowModel(tasks, task_types, model_params, self.model_dir,
                          tf_class=TensorflowMultiTaskClassifier)

  # Fit trained model
  model.fit(train_dataset)
  model.save()

  # Eval model on train
  evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
  _ = evaluator.compute_model_performance(classification_metrics)
def load_bace(mode="regression", transform=True, split="20-80"):
  """Load BACE-1 dataset as regression/classification problem."""
  reload = True
  verbosity = "high"
  regen = False
  assert split in ["20-80", "80-20"]

  current_dir = os.path.dirname(os.path.realpath(__file__))
  if split == "20-80":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/desc_canvas_aug30.csv")
  elif split == "80-20":
    dataset_file = os.path.join(current_dir,
                                "../../datasets/rev8020split_desc.csv")
  dataset = load_from_disk(dataset_file)
  num_display = 10
  pretty_columns = ("[" + ",".join(
      ["'%s'" % column for column in dataset.columns.values[:num_display]]) +
      ",...]")

  crystal_dataset_file = os.path.join(
      current_dir, "../../datasets/crystal_desc_canvas_aug30.csv")
  crystal_dataset = load_from_disk(crystal_dataset_file)

  print("Columns of dataset: %s" % pretty_columns)
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))
  print("Number of examples in crystal dataset: %s" %
        str(crystal_dataset.shape[0]))

  # Make directories to store the raw and featurized datasets.
  base_dir = tempfile.mkdtemp()
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")
  test_dir = os.path.join(base_dir, "test_dataset")
  model_dir = os.path.join(base_dir, "model")
  crystal_dir = os.path.join(base_dir, "crystal")

  if mode == "regression":
    bace_tasks = ["pIC50"]
  elif mode == "classification":
    bace_tasks = ["Class"]
  else:
    raise ValueError("Unknown mode %s" % mode)

  featurizer = UserDefinedFeaturizer(user_specified_features)
  loader = DataLoader(tasks=bace_tasks,
                      smiles_field="mol",
                      id_field="CID",
                      featurizer=featurizer)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = Dataset(data_dir, reload=True)
  if not reload or not os.path.exists(crystal_dir):
    crystal_dataset = loader.featurize(crystal_dataset_file, crystal_dir)
  else:
    crystal_dataset = Dataset(crystal_dir, reload=True)

  if (not reload or not os.path.exists(train_dir)
      or not os.path.exists(valid_dir) or not os.path.exists(test_dir)):
    regen = True
    splitter = SpecifiedSplitter(dataset_file, "Model", verbosity=verbosity)
    train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        dataset, train_dir, valid_dir, test_dir)
  else:
    train_dataset = Dataset(train_dir, reload=True)
    valid_dataset = Dataset(valid_dir, reload=True)
    test_dataset = Dataset(test_dir, reload=True)

  # NOTE THE RENAMING:
  if split == "20-80":
    valid_dataset, test_dataset = test_dataset, valid_dataset

  print("Number of compounds in train set")
  print(len(train_dataset))
  print("Number of compounds in validation set")
  print(len(valid_dataset))
  print("Number of compounds in test set")
  print(len(test_dataset))
  print("Number of compounds in crystal set")
  print(len(crystal_dataset))

  if transform and regen:
    input_transformers = [
        NormalizationTransformer(transform_X=True, dataset=train_dataset),
        ClippingTransformer(transform_X=True, dataset=train_dataset)
    ]
    output_transformers = []
    if mode == "regression":
      output_transformers = [
          NormalizationTransformer(transform_y=True, dataset=train_dataset)
      ]
    else:
      output_transformers = []
  else:
    input_transformers, output_transformers = [], []

  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, valid_dataset, test_dataset,
                  crystal_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  return (bace_tasks, train_dataset, valid_dataset, test_dataset,
          crystal_dataset, output_transformers)
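# Hypothetical usage sketch (not part of the original module): unpack the tuple
# returned by load_bace for the regression setting defined above; the local
# variable names here are illustrative.
(bace_tasks, bace_train, bace_valid, bace_test,
 bace_crystal, bace_output_transformers) = load_bace(
     mode="regression", transform=True, split="20-80")
print("BACE tasks: %s" % str(bace_tasks))
print("Train/valid/test sizes: %d/%d/%d" %
      (len(bace_train), len(bace_valid), len(bace_test)))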