def test_sklearn_multitask_regression_overfit(self):
    """Check that per-task random forests memorise a tiny regression set.

    A random dataset with 10 samples, 3 features and 2 tasks is written to
    ``self.train_dir``. One ``RandomForestRegressor`` per task is fitted
    through ``SingletaskToMultitask``, and the R^2 metric (configured with
    ``np.mean`` as its task averager) computed on the training data must be
    above 0.7.
    """
    n_samples, n_features, n_tasks = 10, 3, 2
    tasks = ["task%d" % index for index in range(n_tasks)]
    verbosity = "high"

    # Seed before drawing so the features and targets are reproducible.
    np.random.seed(123)
    sample_ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    targets = np.random.rand(n_samples, n_tasks)
    weights = np.ones((n_samples, n_tasks))
    dataset = DiskDataset.from_numpy(
        self.train_dir, features, targets, weights, sample_ids)

    regression_metric = Metric(
        metrics.r2_score, verbosity=verbosity, task_averager=np.mean)

    def model_builder(model_dir):
        # Factory handed to SingletaskToMultitask below.
        return SklearnModel(RandomForestRegressor(), model_dir)

    model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

    # Fit and persist the model.
    model.fit(dataset)
    model.save()

    # Evaluate on the very data the model was fitted on (no transformers).
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
def test_sklearn_multitask_classification_overfit(self):
    """Check that per-task random forests memorise a tiny classification set.

    Ten samples with three random features and ten random binary tasks are
    fitted through ``SingletaskToMultitask``. The ROC-AUC score computed on
    the training data must be above 0.9.
    """
    n_samples, n_features, n_tasks = 10, 3, 10
    tasks = ["task%d" % index for index in range(n_tasks)]
    task_types = dict.fromkeys(tasks, "classification")
    verbosity = "high"

    # Seed before drawing so the features and labels are reproducible.
    np.random.seed(123)
    sample_ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    labels = np.random.randint(2, size=(n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(
        self.train_dir, features, labels, weights, sample_ids, tasks)

    model_params = {
        "batch_size": None,
        "data_shape": dataset.get_data_shape(),
    }
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # Factory handed to SingletaskToMultitask below.
        forest = RandomForestClassifier()
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            mode="classification", model_instance=forest,
                            verbosity=verbosity)

    model = SingletaskToMultitask(tasks, task_types, model_params,
                                  self.model_dir, model_builder,
                                  verbosity=verbosity)

    # Fit and persist the model.
    model.fit(dataset)
    model.save()

    # Evaluate on the very data the model was fitted on (no transformers).
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
def test_singletask_to_multitask_classification(self):
    """Smoke-test SingletaskToMultitask on random 17-task binary data.

    One LogisticRegression-backed ``SklearnModel`` per task is fitted on 100
    random training samples, saved, and then scored with ROC-AUC on the
    training set and on a 10-sample test set. The scores are discarded: the
    test only verifies that fitting, saving and evaluation run without
    raising.
    """
    # NOTE(review): the random data is drawn without seeding, so every run
    # sees different data -- confirm whether a fixed seed is wanted here.
    n_features = 10
    tasks = ["task%d" % index for index in range(17)]
    n_tasks = len(tasks)
    task_types = {task: "classification" for task in tasks}
    output_transformers = []

    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                       w_train, ids_train, tasks)

    # Define test dataset
    n_test = 10
    X_test = np.random.rand(n_test, n_features)
    y_test = np.random.randint(2, size=(n_test, n_tasks))
    w_test = np.ones_like(y_test)
    ids_test = ["C"] * n_test
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test,
                                      w_test, ids_test, tasks)

    params_dict = {
        "batch_size": 32,
        "data_shape": train_dataset.get_data_shape(),
    }
    classification_metrics = [Metric(metrics.roc_auc_score)]

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # The fourth argument is the per-task model directory; it used to be
        # named ``model_builder``, which shadowed this function's own name.
        # ``verbosity`` is accepted but, as before, not forwarded.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            model_instance=LogisticRegression())

    multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                            self.model_dir, model_builder)

    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Eval multitask_model on train, then on test; results are not checked.
    for dataset in (train_dataset, test_dataset):
        evaluator = Evaluator(multitask_model, dataset, output_transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
def test_sklearn_multitask_classification(self):
    """Test that sklearn models can learn on simple multitask classification.

    The two-class digits dataset is loaded, its label column is replicated
    into ``n_tasks`` identical tasks, and the samples are split 70/30 into
    train and test sets in their original order. One LogisticRegression per
    task is fitted via ``SingletaskToMultitask``; every per-task ROC-AUC on
    the test set must exceed 0.5.
    """
    np.random.seed(123)
    n_tasks = 4
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target
    # Turn the label vector into a column and replicate it once per task.
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)

    frac_train = .7
    n_samples = len(X)
    # Slice bounds must be integers: ``frac_train * n_samples`` is a float,
    # which NumPy rejects as an index.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}

    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape(),
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # Factory handed to SingletaskToMultitask below.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            mode="classification",
                            model_instance=LogisticRegression(),
                            verbosity=verbosity)

    model = SingletaskToMultitask(tasks, task_types, model_params,
                                  self.model_dir, model_builder,
                                  verbosity=verbosity)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)

    for score in scores[classification_metric.name]:
        assert score > .5
def test_sklearn_multitask_classification(self):
    """Verify sklearn models learn a simple multitask classification problem.

    The two-class digits data is loaded, its label column is copied into
    four identical tasks, and the first 70% of the samples are used for
    training with the remainder held out. One LogisticRegression per task is
    fitted through ``SingletaskToMultitask``; each per-task ROC-AUC on the
    held-out data must be greater than 0.5.
    """
    np.random.seed(123)
    n_tasks = 4
    tasks = range(n_tasks)
    verbosity = "high"

    digits = sklearn.datasets.load_digits(n_class=2)
    features = digits.data
    # Reshape the labels to a column, then tile it once per task.
    label_column = np.reshape(digits.target, (len(digits.target), 1))
    labels = np.hstack([label_column] * n_tasks)

    # Split in the original sample order: leading 70% train, the rest test.
    n_train = int(.7 * len(features))
    train_dataset = DiskDataset.from_numpy(
        self.train_dir, features[:n_train], labels[:n_train])
    test_dataset = DiskDataset.from_numpy(
        self.test_dir, features[n_train:], labels[n_train:])

    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(model_dir):
        # Factory handed to SingletaskToMultitask below.
        return SklearnModel(LogisticRegression(), model_dir)

    model = SingletaskToMultitask(tasks, model_builder, self.model_dir)

    # Fit and persist the model.
    model.fit(train_dataset)
    model.save()

    # Score the training data; the result is computed but not checked.
    train_evaluator = Evaluator(model, train_dataset, [],
                                verbosity=verbosity)
    train_evaluator.compute_model_performance([classification_metric])

    # Score the held-out data and check every per-task value.
    test_evaluator = Evaluator(model, test_dataset, [], verbosity=verbosity)
    scores = test_evaluator.compute_model_performance([classification_metric])
    for task_score in scores[classification_metric.name]:
        assert task_score > .5
def test_singletask_to_multitask_classification(self):
    """Smoke-test SingletaskToMultitask on random 17-task binary data.

    One LogisticRegression-backed ``SklearnModel`` per task is fitted on 100
    random samples, saved, and then scored with ROC-AUC on the training set
    and on a 10-sample test set. The scores are discarded, so the test only
    verifies that the whole pipeline runs without raising.
    """
    n_features = 10
    n_tasks = 17
    tasks = range(n_tasks)

    # Training data: random features, random binary labels, unit weights.
    n_train = 100
    train_features = np.random.rand(n_train, n_features)
    train_labels = np.random.randint(2, size=(n_train, n_tasks))
    train_dataset = DiskDataset.from_numpy(
        self.train_dir, train_features, train_labels,
        np.ones_like(train_labels), ["C"] * n_train)

    # Test data, generated the same way but with fewer samples.
    n_test = 10
    test_features = np.random.rand(n_test, n_features)
    test_labels = np.random.randint(2, size=(n_test, n_tasks))
    test_dataset = DiskDataset.from_numpy(
        self.test_dir, test_features, test_labels,
        np.ones_like(test_labels), ["C"] * n_test)

    transformers = []
    classification_metrics = [Metric(metrics.roc_auc_score)]

    def model_builder(model_dir):
        # Factory handed to SingletaskToMultitask below.
        return SklearnModel(LogisticRegression(), model_dir)

    multitask_model = SingletaskToMultitask(tasks, model_builder,
                                            self.model_dir)

    # Fit and persist the model.
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Evaluate on train first, then on test; results are not inspected.
    for split in (train_dataset, test_dataset):
        evaluator = Evaluator(multitask_model, split, transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
def test_sklearn_multitask_regression_overfit(self):
    """Check that per-task random forests memorise a tiny regression set.

    Ten samples with three random features and two random real-valued tasks
    are fitted through ``SingletaskToMultitask``. The R^2 score computed on
    the training data must be above 0.7.
    """
    n_samples, n_features, n_tasks = 10, 3, 2
    tasks = ["task%d" % index for index in range(n_tasks)]
    task_types = dict.fromkeys(tasks, "regression")
    verbosity = "high"

    # Seed before drawing so the features and targets are reproducible.
    np.random.seed(123)
    sample_ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    targets = np.random.rand(n_samples, n_tasks)
    weights = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(
        self.train_dir, features, targets, weights, sample_ids, tasks)

    model_params = {
        "batch_size": None,
        "data_shape": dataset.get_data_shape(),
    }
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity)

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # Factory handed to SingletaskToMultitask below.
        forest = RandomForestRegressor()
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            mode="regression", model_instance=forest,
                            verbosity=verbosity)

    model = SingletaskToMultitask(tasks, task_types, model_params,
                                  self.model_dir, model_builder,
                                  verbosity=verbosity)

    # Fit and persist the model.
    model.fit(dataset)
    model.save()

    # Evaluate on the very data the model was fitted on (no transformers).
    evaluator = Evaluator(model, dataset, [], verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .7
# Randomly split nci_dataset into train/valid/test subsets, writing each to
# its own directory.
print("About to perform train/valid/test split.")
splitter = RandomSplitter(verbosity=verbosity)
print("Performing new split.")
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    nci_dataset, train_dir, valid_dir, test_dir)

# NOTE(review): a ROC-AUC metric in "classification" mode is paired with a
# RandomForestRegressor below -- confirm that this combination is intended.
classification_metric = Metric(
    metrics.roc_auc_score, np.mean, verbosity=verbosity,
    mode="classification")


def model_builder(model_dir):
    """Return a SklearnModel wrapping a 500-tree random forest regressor."""
    return SklearnModel(RandomForestRegressor(n_estimators=500), model_dir)


model = SingletaskToMultitask(nci_tasks, model_builder, model_dir)

# Fit the per-task models and persist them.
model.fit(train_dataset)
model.save()

# Report performance on the training split.
train_evaluator = Evaluator(
    model, train_dataset, transformers, verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
print(train_scores)

# Report performance on the validation split.
valid_evaluator = Evaluator(
    model, valid_dataset, transformers, verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance(
    [classification_metric])
print("Validation scores")
print(valid_scores)
model_params, model_dir, model_instance=RandomForestClassifier( class_weight="balanced", n_estimators=500), verbosity=verbosity) model = SingletaskToMultitask(tox21_tasks, tox21_task_types, params_dict, model_dir, model_builder, verbosity=verbosity) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance( [classification_metric]) print("Train scores") print(train_scores) valid_evaluator = Evaluator(model, valid_dataset, transformers,
def test_sklearn_multitask_classification(self):
    """Test that sklearn models can learn on simple multitask classification.

    The two-class digits dataset is loaded, its label column is replicated
    into ``n_tasks`` identical tasks, and the samples are split 70/30 into
    train and test sets in their original order. One LogisticRegression per
    task is fitted via ``SingletaskToMultitask``; every per-task ROC-AUC on
    the test set must exceed 0.5.
    """
    np.random.seed(123)
    n_tasks = 4
    dataset = sklearn.datasets.load_digits(n_class=2)
    X, y = dataset.data, dataset.target
    # Turn the label vector into a column and replicate it once per task.
    y = np.reshape(y, (len(y), 1))
    y = np.hstack([y] * n_tasks)

    frac_train = .7
    n_samples = len(X)
    # Slice bounds must be integers: ``frac_train * n_samples`` is a float,
    # which NumPy rejects as an index.
    n_train = int(frac_train * n_samples)
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]

    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train)
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test)

    tasks = train_dataset.get_task_names()
    task_types = {task: "classification" for task in tasks}

    model_params = {
        "batch_size": None,
        "data_shape": train_dataset.get_data_shape(),
    }

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # Factory handed to SingletaskToMultitask below.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            mode="classification",
                            model_instance=LogisticRegression(),
                            verbosity=verbosity)

    model = SingletaskToMultitask(tasks, task_types, model_params,
                                  self.model_dir, model_builder,
                                  verbosity=verbosity)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train
    transformers = []
    train_evaluator = Evaluator(model, train_dataset, transformers,
                                verbosity=verbosity)
    train_scores = train_evaluator.compute_model_performance(
        [classification_metric])
    print("train_scores")
    print(train_scores)

    # Eval model on test
    transformers = []
    evaluator = Evaluator(model, test_dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    print("scores")
    print(scores)

    for score in scores[classification_metric.name]:
        assert score > .5
def test_singletask_to_multitask_classification(self):
    """Smoke-test SingletaskToMultitask on random 17-task binary data.

    One LogisticRegression-backed ``SklearnModel`` per task is fitted on 100
    random training samples, saved, and then scored with ROC-AUC on the
    training set and on a 10-sample test set. The scores are discarded: the
    test only verifies that fitting, saving and evaluation run without
    raising.
    """
    # NOTE(review): the random data is drawn without seeding, so every run
    # sees different data -- confirm whether a fixed seed is wanted here.
    n_features = 10
    tasks = ["task%d" % index for index in range(17)]
    n_tasks = len(tasks)
    task_types = {task: "classification" for task in tasks}
    output_transformers = []

    # Define train dataset
    n_train = 100
    X_train = np.random.rand(n_train, n_features)
    y_train = np.random.randint(2, size=(n_train, n_tasks))
    w_train = np.ones_like(y_train)
    ids_train = ["C"] * n_train
    train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                       w_train, ids_train, tasks)

    # Define test dataset
    n_test = 10
    X_test = np.random.rand(n_test, n_features)
    y_test = np.random.randint(2, size=(n_test, n_tasks))
    w_test = np.ones_like(y_test)
    ids_test = ["C"] * n_test
    test_dataset = Dataset.from_numpy(self.test_dir, X_test, y_test,
                                      w_test, ids_test, tasks)

    params_dict = {
        "batch_size": 32,
        "data_shape": train_dataset.get_data_shape(),
    }
    classification_metrics = [Metric(metrics.roc_auc_score)]

    def model_builder(tasks, task_types, model_params, model_dir,
                      verbosity=None):
        # The fourth argument is the per-task model directory; it used to be
        # named ``model_builder``, which shadowed this function's own name.
        # ``verbosity`` is accepted but, as before, not forwarded.
        return SklearnModel(tasks, task_types, model_params, model_dir,
                            model_instance=LogisticRegression())

    multitask_model = SingletaskToMultitask(tasks, task_types, params_dict,
                                            self.model_dir, model_builder)

    # Fit trained model
    multitask_model.fit(train_dataset)
    multitask_model.save()

    # Eval multitask_model on train, then on test; results are not checked.
    for dataset in (train_dataset, test_dataset):
        evaluator = Evaluator(multitask_model, dataset, output_transformers,
                              verbosity=True)
        _ = evaluator.compute_model_performance(classification_metrics)
tox_test), tox_transformers = dc.molnet.load_tox21() classification_metric = Metric( metrics.roc_auc_score, np.mean, mode="classification") def model_builder(model_dir): sklearn_model = RandomForestClassifier( class_weight="balanced", n_estimators=500, n_jobs=-1) return dc.models.SklearnModel(sklearn_model, model_dir) print(tox_train.get_task_names()) print(tox_tasks) tox_model = SingletaskToMultitask(tox_tasks, model_builder) tox_model.fit(tox_train) # Load sider models now sider_tasks, ( sider_train, sider_valid, sider_test), sider_transformers = dc.molnet.load_sider(split="random") sider_model = SingletaskToMultitask(sider_tasks, model_builder) sider_model.fit(sider_train) # Load sweetlead dataset now. Pass in dataset object and appropriate # transformers to predict functions sweet_tasks, (sweet_dataset, _, _), sweet_transformers = dc.molnet.load_sweet()