def _hyperparam_opt(self, model_builder, params_dict, train_dataset,
                    valid_dataset, output_transformers, task_types, metric,
                    logdir=None):
  """Shared helper: run a hyperparameter search with the given model builder."""
  optimizer = HyperparamOpt(model_builder, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      metric, logdir=logdir)
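# A minimal sketch of a builder compatible with the helper above, following
# the (tasks, task_types, params_dict, logdir) signature that the legacy tests
# below use. Illustrative only, not part of the original suite; SklearnModel
# and LogisticRegression are assumed to be imported by the surrounding module.
def _example_model_builder(tasks, task_types, params_dict, logdir,
                           verbosity=None):
  # Wrap a scikit-learn estimator in the deepchem SklearnModel adapter.
  return SklearnModel(tasks, task_types, params_dict, logdir,
                      model_instance=LogisticRegression())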
def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Tensorflow multitask deepchem classification API."""
  splittype = "scaffold"
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)
  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"activation": ["relu"],
                 "momentum": [.9],
                 "batch_size": [50],
                 "init": ["glorot_uniform"],
                 "data_shape": [train_dataset.get_data_shape()],
                 "learning_rate": [1e-3],
                 "decay": [1e-6],
                 "nb_hidden": [1000],
                 "nb_epoch": [1],
                 "nesterov": [False],
                 "dropouts": [(.5,)],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "layer_sizes": [(1000,)],
                 "weight_init_stddevs": [(.1,)],
                 "bias_init_consts": [(1.,)],
                 "num_classes": [2],
                 "penalty": [0.],
                 "optimizer": ["sgd"],
                 "num_classification_tasks": [len(task_types)]}

  def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
    return TensorflowModel(tasks, task_types, params_dict, logdir,
                           tf_class=TensorflowMultiTaskClassifier,
                           verbosity=verbosity)

  optimizer = HyperparamOpt(model_builder, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask_to_multitask."""
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  input_file = "multitask_example.csv"
  n_features = 10
  n_tasks = len(tasks)
  # Define train dataset
  n_train = 100
  X_train = np.random.rand(n_train, n_features)
  y_train = np.random.randint(2, size=(n_train, n_tasks))
  w_train = np.ones_like(y_train)
  ids_train = ["C"] * n_train
  train_dataset = DiskDataset.from_numpy(self.train_dir, X_train, y_train,
                                         w_train, ids_train, tasks)
  # Define validation dataset
  n_valid = 10
  X_valid = np.random.rand(n_valid, n_features)
  y_valid = np.random.randint(2, size=(n_valid, n_tasks))
  w_valid = np.ones_like(y_valid)
  ids_valid = ["C"] * n_valid
  valid_dataset = DiskDataset.from_numpy(self.valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, tasks)
  transformers = []
  classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                 mode="classification")
  params_dict = {"n_estimators": [1, 10]}

  def multitask_model_builder(model_params, model_dir):

    def model_builder(model_dir):
      sklearn_model = RandomForestClassifier(**model_params)
      return SklearnModel(sklearn_model, model_dir)

    return SingletaskToMultitask(tasks, model_builder, model_dir)

  optimizer = HyperparamOpt(multitask_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      classification_metric, logdir=None)
def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask_to_multitask."""
  splittype = "scaffold"
  output_transformers = []
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: "classification" for task in tasks}
  input_file = "multitask_example.csv"
  n_features = 10
  n_tasks = len(tasks)
  # Define train dataset
  n_train = 100
  X_train = np.random.rand(n_train, n_features)
  y_train = np.random.randint(2, size=(n_train, n_tasks))
  w_train = np.ones_like(y_train)
  ids_train = ["C"] * n_train
  train_dataset = Dataset.from_numpy(self.train_dir, X_train, y_train,
                                     w_train, ids_train, tasks)
  # Define validation dataset
  n_valid = 10
  X_valid = np.random.rand(n_valid, n_features)
  y_valid = np.random.randint(2, size=(n_valid, n_tasks))
  w_valid = np.ones_like(y_valid)
  ids_valid = ["C"] * n_valid
  valid_dataset = Dataset.from_numpy(self.valid_dir, X_valid, y_valid,
                                     w_valid, ids_valid, tasks)
  params_dict = {
      "batch_size": [32],
      "data_shape": [train_dataset.get_data_shape()],
  }
  classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                 mode="classification")

  def model_builder(tasks, task_types, model_params, task_model_dir,
                    verbosity=None):
    return SklearnModel(tasks, task_types, model_params, task_model_dir,
                        model_instance=LogisticRegression())

  def multitask_model_builder(tasks, task_types, params_dict, logdir=None,
                              verbosity=None):
    return SingletaskToMultitask(tasks, task_types, params_dict,
                                 self.model_dir, model_builder)

  optimizer = HyperparamOpt(multitask_model_builder, tasks, task_types,
                            verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      classification_metric, logdir=None)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)
  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"n_hidden": [5, 10]}

  def model_builder(model_params, model_dir):
    keras_model = MultiTaskDNN(len(tasks), n_features, task_type, dropout=0.,
                               **model_params)
    return KerasModel(keras_model, model_dir)

  optimizer = HyperparamOpt(model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12", "task13",
           "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)
  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"nb_hidden": [5, 10],
                 "activation": ["relu"],
                 "dropout": [.5],
                 "learning_rate": [.01],
                 "momentum": [.9],
                 "nesterov": [False],
                 "decay": [1e-4],
                 "batch_size": [5],
                 "nb_epoch": [2],
                 "init": ["glorot_uniform"],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "data_shape": [train_dataset.get_data_shape()]}
  optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)
  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)
  params_dict = {"n_estimators": [10, 100]}
  metric = Metric(metrics.r2_score)

  def rf_model_builder(model_params, model_dir):
    sklearn_model = RandomForestRegressor(**model_params)
    return SklearnModel(sklearn_model, model_dir)

  optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
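# With the builder signature above, hyperparam_search constructs, fits, and
# scores one model per grid point on valid_dataset. A hedged hand-rolled
# equivalent for a single setting might look like this (the model_dir path is
# hypothetical; Evaluator usage follows the pattern in the BACE scripts below):
#
#   model = rf_model_builder({"n_estimators": 10}, "/tmp/rf_hyperparam_0")
#   model.fit(train_dataset)
#   evaluator = Evaluator(model, valid_dataset, transformers, verbosity="low")
#   scores = evaluator.compute_model_performance([metric])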
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  splittype = "scaffold"
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)
  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)
  params_dict = {
      "n_estimators": [10, 100],
      "max_features": ["auto"],
      "data_shape": train_dataset.get_data_shape()
  }
  metric = Metric(metrics.r2_score)
  # rf_model_builder is assumed to be defined at module scope in the original
  # file; compare the builder defined inline in the newer version of this test.
  optimizer = HyperparamOpt(rf_model_builder, tasks, task_types,
                            verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers, metric,
      logdir=None)
def bace_rf_model(mode="classification", verbosity="high", split="20-80"):
  """Train random forests on BACE dataset."""
  (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=False, split=split)
  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    metric = r2_metric
    model_class = RandomForestRegressor
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    model_class = RandomForestClassifier
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    metric = roc_auc_metric
  else:
    raise ValueError("Invalid mode %s" % mode)

  def model_builder(tasks, task_types, params_dict, model_dir,
                    verbosity=verbosity):
    n_estimators = params_dict["n_estimators"]
    max_features = params_dict["max_features"]
    return SklearnModel(tasks, task_types, params_dict, model_dir,
                        model_instance=model_class(n_estimators=n_estimators,
                                                   max_features=max_features))

  params_dict = {
      "n_estimators": [10, 100],
      "batch_size": [None],
      "data_shape": [train_dataset.get_data_shape()],
      "max_features": ["auto", "sqrt", "log2", None],
  }
  optimizer = HyperparamOpt(model_builder, bace_tasks,
                            {task: mode for task in bace_tasks})
  best_rf, best_rf_hyperparams, all_rf_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric=metric)

  if len(train_dataset) > 0:
    rf_train_evaluator = Evaluator(best_rf, train_dataset, transformers,
                                   verbosity=verbosity)
    csv_out = "rf_%s_%s_train.csv" % (mode, split)
    stats_out = "rf_%s_%s_train_stats.txt" % (mode, split)
    rf_train_score = rf_train_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Train set scores: %s" % (str(rf_train_score)))
  if len(valid_dataset) > 0:
    rf_valid_evaluator = Evaluator(best_rf, valid_dataset, transformers,
                                   verbosity=verbosity)
    csv_out = "rf_%s_%s_valid.csv" % (mode, split)
    stats_out = "rf_%s_%s_valid_stats.txt" % (mode, split)
    rf_valid_score = rf_valid_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Valid set scores: %s" % (str(rf_valid_score)))
  if len(test_dataset) > 0:
    rf_test_evaluator = Evaluator(best_rf, test_dataset, transformers,
                                  verbosity=verbosity)
    csv_out = "rf_%s_%s_test.csv" % (mode, split)
    stats_out = "rf_%s_%s_test_stats.txt" % (mode, split)
    rf_test_score = rf_test_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Test set: %s" % (str(rf_test_score)))
  if len(crystal_dataset) > 0:
    rf_crystal_evaluator = Evaluator(best_rf, crystal_dataset, transformers,
                                     verbosity)
    csv_out = "rf_%s_%s_crystal.csv" % (mode, split)
    stats_out = "rf_%s_%s_crystal_stats.txt" % (mode, split)
    rf_crystal_score = rf_crystal_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("RF Crystal set: %s" % (str(rf_crystal_score)))
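# The four evaluation blocks above repeat one pattern per dataset split. A
# hedged refactoring sketch (hypothetical helper, not part of the original
# script), assuming the Evaluator API used above:
def _evaluate_rf_split(model, dataset, transformers, all_metrics, mode, split,
                       name, verbosity="high"):
  # Skip empty splits, mirroring the len(...) > 0 guards above.
  if len(dataset) == 0:
    return None
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  csv_out = "rf_%s_%s_%s.csv" % (mode, split, name)
  stats_out = "rf_%s_%s_%s_stats.txt" % (mode, split, name)
  scores = evaluator.compute_model_performance(
      all_metrics, csv_out=csv_out, stats_out=stats_out)
  print("RF %s set scores: %s" % (name, str(scores)))
  return scores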
def bace_dnn_model(mode="classification", verbosity="high", split="20-80"):
  """Train fully-connected DNNs on BACE dataset."""
  (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=True, split=split)
  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    metric = r2_metric
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    metric = roc_auc_metric
  else:
    raise ValueError("Invalid mode %s" % mode)
  params_dict = {"activation": ["relu"],
                 "momentum": [.9],
                 "batch_size": [50],
                 "init": ["glorot_uniform"],
                 "data_shape": [train_dataset.get_data_shape()],
                 "learning_rate": np.power(10., np.random.uniform(-5, -3, size=5)),
                 "decay": np.power(10, np.random.uniform(-6, -4, size=5)),
                 "nb_hidden": [1000],
                 "nb_epoch": [40],
                 "nesterov": [False],
                 "dropout": [.5],
                 "nb_layers": [1],
                 "batchnorm": [False]}
  optimizer = HyperparamOpt(SingleTaskDNN, bace_tasks,
                            {task: mode for task in bace_tasks},
                            verbosity=verbosity)
  best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric=metric)

  if len(train_dataset) > 0:
    dnn_train_evaluator = Evaluator(best_dnn, train_dataset, transformers)
    csv_out = "dnn_%s_%s_train.csv" % (mode, split)
    stats_out = "dnn_%s_%s_train_stats.txt" % (mode, split)
    dnn_train_score = dnn_train_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Train set %s: %s" % (metric.name, str(dnn_train_score)))
  if len(valid_dataset) > 0:
    dnn_valid_evaluator = Evaluator(best_dnn, valid_dataset, transformers)
    csv_out = "dnn_%s_%s_valid.csv" % (mode, split)
    stats_out = "dnn_%s_%s_valid_stats.txt" % (mode, split)
    dnn_valid_score = dnn_valid_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Valid set %s: %s" % (metric.name, str(dnn_valid_score)))
  if len(test_dataset) > 0:
    dnn_test_evaluator = Evaluator(best_dnn, test_dataset, transformers)
    csv_out = "dnn_%s_%s_test.csv" % (mode, split)
    stats_out = "dnn_%s_%s_test_stats.txt" % (mode, split)
    dnn_test_score = dnn_test_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Test set %s: %s" % (metric.name, str(dnn_test_score)))
  if len(crystal_dataset) > 0:
    dnn_crystal_evaluator = Evaluator(best_dnn, crystal_dataset, transformers)
    csv_out = "dnn_%s_%s_crystal.csv" % (mode, split)
    stats_out = "dnn_%s_%s_crystal_stats.txt" % (mode, split)
    dnn_crystal_score = dnn_crystal_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Crystal set %s: %s" % (metric.name, str(dnn_crystal_score)))
def bace_dnn_model(mode="classification", verbosity="high", split="20-80"):
  """Train fully-connected DNNs on BACE dataset."""
  (bace_tasks, train_dataset, valid_dataset, test_dataset, crystal_dataset,
   transformers) = load_bace(mode=mode, transform=True, split=split)
  if mode == "regression":
    r2_metric = Metric(metrics.r2_score, verbosity=verbosity)
    rms_metric = Metric(metrics.rms_score, verbosity=verbosity)
    mae_metric = Metric(metrics.mae_score, verbosity=verbosity)
    all_metrics = [r2_metric, rms_metric, mae_metric]
    metric = r2_metric
  elif mode == "classification":
    roc_auc_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    accuracy_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
    mcc_metric = Metric(metrics.matthews_corrcoef, verbosity=verbosity)
    # Note sensitivity = recall
    recall_metric = Metric(metrics.recall_score, verbosity=verbosity)
    all_metrics = [accuracy_metric, mcc_metric, recall_metric, roc_auc_metric]
    metric = roc_auc_metric
  else:
    raise ValueError("Invalid mode %s" % mode)
  params_dict = {
      "learning_rate": np.power(10., np.random.uniform(-5, -3, size=5)),
      "decay": np.power(10, np.random.uniform(-6, -4, size=5)),
      "nb_epoch": [40]
  }
  n_features = train_dataset.get_data_shape()[0]

  def model_builder(model_params, model_dir):
    keras_model = MultiTaskDNN(len(bace_tasks), n_features, "classification",
                               dropout=.5, **model_params)
    return KerasModel(keras_model, model_dir)

  optimizer = HyperparamOpt(model_builder, verbosity="low")
  best_dnn, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric=metric)

  if len(train_dataset) > 0:
    dnn_train_evaluator = Evaluator(best_dnn, train_dataset, transformers)
    csv_out = "dnn_%s_%s_train.csv" % (mode, split)
    stats_out = "dnn_%s_%s_train_stats.txt" % (mode, split)
    dnn_train_score = dnn_train_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Train set %s: %s" % (metric.name, str(dnn_train_score)))
  if len(valid_dataset) > 0:
    dnn_valid_evaluator = Evaluator(best_dnn, valid_dataset, transformers)
    csv_out = "dnn_%s_%s_valid.csv" % (mode, split)
    stats_out = "dnn_%s_%s_valid_stats.txt" % (mode, split)
    dnn_valid_score = dnn_valid_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Valid set %s: %s" % (metric.name, str(dnn_valid_score)))
  if len(test_dataset) > 0:
    dnn_test_evaluator = Evaluator(best_dnn, test_dataset, transformers)
    csv_out = "dnn_%s_%s_test.csv" % (mode, split)
    stats_out = "dnn_%s_%s_test_stats.txt" % (mode, split)
    dnn_test_score = dnn_test_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Test set %s: %s" % (metric.name, str(dnn_test_score)))
  if len(crystal_dataset) > 0:
    dnn_crystal_evaluator = Evaluator(best_dnn, crystal_dataset, transformers)
    csv_out = "dnn_%s_%s_crystal.csv" % (mode, split)
    stats_out = "dnn_%s_%s_crystal_stats.txt" % (mode, split)
    dnn_crystal_score = dnn_crystal_evaluator.compute_model_performance(
        all_metrics, csv_out=csv_out, stats_out=stats_out)
    print("DNN Crystal set %s: %s" % (metric.name, str(dnn_crystal_score)))
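# A small follow-up utility one might add after any of the searches above; a
# minimal sketch assuming all_results maps each hyperparameter setting to a
# scalar validation score, which is the shape hyperparam_search reports in
# this codebase (the helper name is hypothetical):
def summarize_hyperparam_results(all_results, higher_is_better=True):
  """Print hyperparameter settings ranked by validation score."""
  # Sort by score so the best setting prints first.
  ranked = sorted(all_results.items(), key=lambda item: item[1],
                  reverse=higher_is_better)
  for hyperparams, score in ranked:
    print("%s: %f" % (hyperparams, score))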