def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Smoke-test the dict-configured Keras multitask classification workflow.

    Featurizes ``multitask_example.csv`` with size-1024 circular fingerprints,
    scaffold-splits it into train and test sets, fits a ``MultiTaskDNN`` and
    computes four classification metrics on each split. The scores are
    discarded, so the test passes as long as nothing raises.
    """
    graph = tf.Graph()
    K.set_session(tf.Session(graph=graph))
    with graph.as_default():
        task_type = "classification"
        tasks = ["task%d" % index for index in range(17)]
        task_types = dict.fromkeys(tasks, task_type)

        # TODO(rbharath): There should be some automatic check to ensure that
        # all required model_params are specified.
        # TODO(rbharath): Turning off dropout to make tests behave.
        model_params = dict(
            nb_hidden=10,
            activation="relu",
            dropout=.0,
            learning_rate=.01,
            momentum=.9,
            nesterov=False,
            decay=1e-4,
            batch_size=5,
            nb_epoch=2,
            init="glorot_uniform",
            nb_layers=1,
            batchnorm=False,
        )

        # Featurize the example CSV and split it by scaffold.
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=CircularFingerprint(size=1024),
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)
        train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
            dataset, self.train_dir, self.test_dir)

        transformers = []
        # The input shape is only known once the training split exists.
        model_params["data_shape"] = train_dataset.get_data_shape()
        classification_metrics = [
            Metric(score_fn)
            for score_fn in (metrics.roc_auc_score,
                             metrics.matthews_corrcoef,
                             metrics.recall_score,
                             metrics.accuracy_score)
        ]

        # Fit the model and persist it.
        model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)
        model.fit(train_dataset)
        model.save()

        # Evaluate on the train split first, then on the test split.
        for split in (train_dataset, test_dataset):
            evaluator = Evaluator(model, split, transformers, verbosity=True)
            evaluator.compute_model_performance(classification_metrics)
def test_keras_reload(self):
    """Test that trained keras models can be reloaded correctly.

    Fits a single-task classification ``MultiTaskDNN`` (wrapped in a
    ``KerasModel``) on a small seeded random dataset and saves it. A second,
    freshly constructed wrapper pointing at the same ``self.model_dir`` is
    then reloaded, and the test asserts that its ROC-AUC on the training
    data exceeds 0.6.
    """
    g = tf.Graph()
    sess = tf.Session(graph=g)
    K.set_session(sess)
    with g.as_default():
        n_samples = 10
        n_features = 3
        # Single task; the per-task name list and type dict of the older
        # dict-based API were never read here, so they are not built.
        n_tasks = 1

        # Generate dummy dataset
        np.random.seed(123)
        ids = np.arange(n_samples)
        X = np.random.rand(n_samples, n_features)
        y = np.random.randint(2, size=(n_samples, n_tasks))
        w = np.ones((n_samples, n_tasks))
        dataset = NumpyDataset(X, y, w, ids)

        verbosity = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=verbosity)
        keras_model = MultiTaskDNN(n_tasks, n_features, "classification",
                                   dropout=0.)
        model = KerasModel(keras_model, self.model_dir)

        # Fit trained model
        model.fit(dataset)
        model.save()

        # Load trained model into a new wrapper built with the same settings.
        reloaded_keras_model = MultiTaskDNN(n_tasks, n_features,
                                            "classification", dropout=0.)
        reloaded_model = KerasModel(reloaded_keras_model, self.model_dir)
        reloaded_model.reload(custom_objects={"MultiTaskDNN": MultiTaskDNN})

        # Eval the reloaded model on train
        transformers = []
        evaluator = Evaluator(reloaded_model, dataset, transformers,
                              verbosity=verbosity)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .6
def test_keras_skewed_classification_overfit(self):
    """Check that a Keras model can overfit a 0/1 dataset with few actives.

    Builds 100 seeded random samples whose single-task labels are Bernoulli
    draws with p = 0.05, trains a dict-configured ``MultiTaskDNN`` inside a
    dedicated TensorFlow graph, and asserts that the training-set ROC-AUC
    exceeds 0.9.
    """
    graph = tf.Graph()
    K.set_session(tf.Session(graph=graph))
    with graph.as_default():
        tasks = ["task0"]
        task_types = dict.fromkeys(tasks, "classification")
        n_samples, n_features, n_tasks = 100, 3, len(tasks)

        # Seeded dummy data: uniform features, sparse positives, unit weights.
        np.random.seed(123)
        active_fraction = .05
        ids = np.arange(n_samples)
        features = np.random.rand(n_samples, n_features)
        labels = np.random.binomial(1, active_fraction,
                                    size=(n_samples, n_tasks))
        weights = np.ones((n_samples, n_tasks))
        dataset = Dataset.from_numpy(self.train_dir, features, labels,
                                     weights, ids, tasks)

        model_params = dict(
            nb_hidden=1000,
            activation="relu",
            dropout=.0,
            learning_rate=.15,
            momentum=.9,
            nesterov=False,
            decay=1e-4,
            batch_size=n_samples,
            nb_epoch=200,
            init="glorot_uniform",
            nb_layers=1,
            batchnorm=False,
            data_shape=dataset.get_data_shape(),
        )

        log_level = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=log_level)
        model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                             verbosity=log_level)

        # Train, then persist the fitted model.
        model.fit(dataset)
        model.save()

        # Score on the same data the model was trained on.
        evaluator = Evaluator(model, dataset, [], verbosity=log_level)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
def test_keras_multitask_regression_overfit(self):
    """Check that a multitask Keras regressor can overfit a tiny dataset.

    Trains a 10-task ``MultiTaskDNN`` in regression mode for 100 epochs on
    10 seeded random samples (targets are random 0/1 integers) and asserts
    that the task-averaged R^2 on the training data exceeds 0.75.
    """
    graph = tf.Graph()
    K.set_session(tf.Session(graph=graph))
    with graph.as_default():
        n_tasks, n_samples, n_features = 10, 10, 3

        # Seeded dummy data: uniform features, 0/1 targets, unit weights.
        np.random.seed(123)
        ids = np.arange(n_samples)
        features = np.random.rand(n_samples, n_features)
        targets = np.random.randint(2, size=(n_samples, n_tasks))
        weights = np.ones((n_samples, n_tasks))
        dataset = NumpyDataset(features, targets, weights, ids)

        log_level = "high"
        regression_metric = Metric(metrics.r2_score,
                                   verbosity=log_level,
                                   task_averager=np.mean,
                                   mode="regression")
        network = MultiTaskDNN(n_tasks, n_features, "regression",
                               dropout=0., learning_rate=.1, decay=1e-4)
        model = KerasModel(network, self.model_dir, verbosity=log_level)

        # Train, then persist the fitted model.
        model.fit(dataset, nb_epoch=100)
        model.save()

        # Score on the same data the model was trained on.
        evaluator = Evaluator(model, dataset, [], verbosity=log_level)
        scores = evaluator.compute_model_performance([regression_metric])

        assert scores[regression_metric.name] > .75
def model_builder(model_params, model_dir):
    """Build a ``KerasModel`` wrapping a ``MultiTaskDNN`` from the given params.

    ``tasks``, ``n_features`` and ``task_type`` are resolved from the
    enclosing scope, which is not visible from this block.

    Args:
        model_params: Mapping of keyword arguments forwarded to
            ``MultiTaskDNN``. Dropout defaults to 0 unless the mapping
            supplies its own ``"dropout"`` entry.
        model_dir: Passed to ``KerasModel`` as its second argument.

    Returns:
        The ``KerasModel`` wrapping the newly constructed network.
    """
    # Merge the default instead of writing ``dropout=0., **model_params``:
    # that form raises TypeError (multiple values for a keyword argument)
    # as soon as ``model_params`` also contains "dropout".
    dnn_kwargs = {"dropout": 0.}
    dnn_kwargs.update(model_params)
    keras_model = MultiTaskDNN(len(tasks), n_features, task_type,
                               **dnn_kwargs)
    return KerasModel(keras_model, model_dir)
def model_builder(model_params, model_dir):
    """Build a ``KerasModel`` wrapping a classification ``MultiTaskDNN``.

    ``bace_tasks`` and ``n_features`` are resolved from the enclosing scope,
    which is not visible from this block.

    Args:
        model_params: Mapping of keyword arguments forwarded to
            ``MultiTaskDNN``. Dropout defaults to 0.5 unless the mapping
            supplies its own ``"dropout"`` entry.
        model_dir: Passed to ``KerasModel`` as its second argument.

    Returns:
        The ``KerasModel`` wrapping the newly constructed network.
    """
    # Merge the default instead of writing ``dropout=.5, **model_params``:
    # that form raises TypeError (multiple values for a keyword argument)
    # as soon as ``model_params`` also contains "dropout".
    dnn_kwargs = {"dropout": .5}
    dnn_kwargs.update(model_params)
    keras_model = MultiTaskDNN(len(bace_tasks), n_features, "classification",
                               **dnn_kwargs)
    return KerasModel(keras_model, model_dir)
def test_keras_skewed_classification_overfit(self):
    """Check that a Keras model can overfit a 0/1 dataset with few actives.

    Builds 100 seeded random samples whose single-task labels are Bernoulli
    draws with p = 0.05, trains a dict-configured ``MultiTaskDNN`` on them,
    and asserts that the training-set ROC-AUC exceeds 0.9.
    """
    tasks = ["task0"]
    task_types = dict.fromkeys(tasks, "classification")
    n_samples, n_features, n_tasks = 100, 3, len(tasks)

    # Seeded dummy data: uniform features, sparse positives, unit weights.
    np.random.seed(123)
    active_fraction = .05
    ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    labels = np.random.binomial(1, active_fraction, size=(n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, features, labels, weights,
                                 ids, tasks)

    model_params = dict(
        nb_hidden=1000,
        activation="relu",
        dropout=.0,
        learning_rate=.15,
        momentum=.9,
        nesterov=False,
        decay=1e-4,
        batch_size=n_samples,
        nb_epoch=200,
        init="glorot_uniform",
        nb_layers=1,
        batchnorm=False,
        data_shape=dataset.get_data_shape(),
    )

    log_level = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=log_level)
    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                         verbosity=log_level)

    # Train, then persist the fitted model.
    model.fit(dataset)
    model.save()

    # Score on the same data the model was trained on.
    evaluator = Evaluator(model, dataset, [], verbosity=log_level)
    scores = evaluator.compute_model_performance([classification_metric])

    assert scores[classification_metric.name] > .9
def test_keras_multitask_regression_overfit(self):
    """Check that a multitask Keras regressor can overfit a tiny dataset.

    Trains a dict-configured, 10-task ``MultiTaskDNN`` in regression mode on
    10 seeded random samples (targets are random 0/1 integers) and asserts
    that the R^2 score on the training data exceeds 0.9.
    """
    n_tasks, n_samples, n_features = 10, 10, 3
    tasks = ["task%d" % index for index in range(n_tasks)]
    task_types = dict.fromkeys(tasks, "regression")

    # Seeded dummy data: uniform features, 0/1 targets, unit weights.
    np.random.seed(123)
    ids = np.arange(n_samples)
    features = np.random.rand(n_samples, n_features)
    targets = np.random.randint(2, size=(n_samples, n_tasks))
    weights = np.ones((n_samples, n_tasks))
    dataset = Dataset.from_numpy(self.train_dir, features, targets, weights,
                                 ids, tasks)

    model_params = dict(
        nb_hidden=1000,
        activation="relu",
        dropout=.0,
        learning_rate=.15,
        momentum=.9,
        nesterov=False,
        decay=1e-4,
        batch_size=n_samples,
        nb_epoch=200,
        init="glorot_uniform",
        nb_layers=1,
        batchnorm=False,
        data_shape=dataset.get_data_shape(),
    )

    log_level = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=log_level)
    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir,
                         verbosity=log_level)

    # Train, then persist the fitted model.
    model.fit(dataset)
    model.save()

    # Score on the same data the model was trained on.
    evaluator = Evaluator(model, dataset, [], verbosity=log_level)
    scores = evaluator.compute_model_performance([regression_metric])

    assert scores[regression_metric.name] > .9
def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Smoke-test the ``KerasModel``-wrapped multitask classification workflow.

    Featurizes ``multitask_example.csv`` with size-1024 circular fingerprints,
    scaffold-splits it into train and test sets, fits a ``MultiTaskDNN``
    wrapped in a ``KerasModel`` and computes four classification metrics on
    each split. The scores are discarded, so the test passes as long as
    nothing raises.
    """
    graph = tf.Graph()
    K.set_session(tf.Session(graph=graph))
    with graph.as_default():
        task_type = "classification"
        tasks = ["task%d" % index for index in range(17)]
        n_features = 1024

        # Featurize the example CSV and split it by scaffold.
        input_file = os.path.join(self.current_dir, "multitask_example.csv")
        loader = DataLoader(tasks=tasks,
                            smiles_field=self.smiles_field,
                            featurizer=CircularFingerprint(size=n_features),
                            verbosity="low")
        dataset = loader.featurize(input_file, self.data_dir)
        train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
            dataset, self.train_dir, self.test_dir)

        transformers = []
        classification_metrics = [
            Metric(score_fn)
            for score_fn in (metrics.roc_auc_score,
                             metrics.matthews_corrcoef,
                             metrics.recall_score,
                             metrics.accuracy_score)
        ]

        # Build, fit and persist the model.
        network = MultiTaskDNN(len(tasks), n_features, task_type, dropout=0.)
        model = KerasModel(network, self.model_dir)
        model.fit(train_dataset)
        model.save()

        # Evaluate on the train split first, then on the test split.
        for split in (train_dataset, test_dataset):
            evaluator = Evaluator(model, split, transformers, verbosity=True)
            evaluator.compute_model_performance(classification_metrics)
def test_keras_skewed_classification_overfit(self):
    """Check that a Keras model can overfit a 0/1 dataset with few actives.

    Builds 100 seeded random samples whose single-task labels are Bernoulli
    draws with p = 0.05, trains a ``KerasModel``-wrapped ``MultiTaskDNN`` for
    200 epochs using one batch holding every sample, and asserts that the
    training-set ROC-AUC exceeds 0.9.
    """
    graph = tf.Graph()
    K.set_session(tf.Session(graph=graph))
    with graph.as_default():
        n_samples, n_features, n_tasks = 100, 3, 1

        # Seeded dummy data: uniform features, sparse positives, unit weights.
        np.random.seed(123)
        active_fraction = .05
        ids = np.arange(n_samples)
        features = np.random.rand(n_samples, n_features)
        labels = np.random.binomial(1, active_fraction,
                                    size=(n_samples, n_tasks))
        weights = np.ones((n_samples, n_tasks))
        dataset = NumpyDataset(features, labels, weights, ids)

        log_level = "high"
        classification_metric = Metric(metrics.roc_auc_score,
                                       verbosity=log_level)
        network = MultiTaskDNN(n_tasks, n_features, "classification",
                               dropout=0., learning_rate=.15, decay=1e-4)
        model = KerasModel(network, self.model_dir)

        # Train, then persist the fitted model.
        model.fit(dataset, batch_size=n_samples, nb_epoch=200)
        model.save()

        # Score on the same data the model was trained on.
        evaluator = Evaluator(model, dataset, [], verbosity=log_level)
        scores = evaluator.compute_model_performance([classification_metric])

        assert scores[classification_metric.name] > .9
os.makedirs(base_dir) # Load MUV data muv_tasks, muv_datasets, transformers = load_muv(base_dir, reload=reload) train_dataset, valid_dataset = muv_datasets n_features = 1024 # Build model classification_metric = Metric(metrics.roc_auc_score, np.mean, verbosity=verbosity, mode="classification") keras_model = MultiTaskDNN(len(muv_tasks), n_features, "classification", dropout=.25, learning_rate=.001, decay=1e-4) model = KerasModel(keras_model, self.model_dir, verbosity=verbosity) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance( [classification_metric])
"momentum": .9, "nesterov": False, "decay": 1e-4, "batch_size": 64, "nb_epoch": 10, "init": "glorot_uniform", "nb_layers": 1, "batchnorm": False, "data_shape": train_dataset.get_data_shape() } if os.path.exists(model_dir): shutil.rmtree(model_dir) os.makedirs(model_dir) model = MultiTaskDNN(muv_tasks, muv_task_types, params_dict, model_dir, verbosity=verbosity) # Fit trained model model.fit(train_dataset) model.save() train_evaluator = Evaluator(model, train_dataset, transformers, verbosity=verbosity) train_scores = train_evaluator.compute_model_performance([classification_metric]) print("Train scores") print(train_scores) valid_evaluator = Evaluator(model, valid_dataset, transformers, verbosity=verbosity) valid_scores = valid_evaluator.compute_model_performance([classification_metric]) print("Validation scores")
def test_multitask_keras_mlp_ECFP_classification_API(self):
    """Smoke-test the dict-configured Keras multitask classification workflow.

    Featurizes ``multitask_example.csv`` with size-1024 circular fingerprints,
    scaffold-splits it into train and test sets, fits a ``MultiTaskDNN``
    (imported locally from ``deepchem.models.keras_models.fcnet``) and
    computes four classification metrics on each split. The scores are
    discarded, so the test passes as long as nothing raises.
    """
    from deepchem.models.keras_models.fcnet import MultiTaskDNN

    task_type = "classification"
    tasks = ["task%d" % index for index in range(17)]
    task_types = dict.fromkeys(tasks, task_type)

    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = dict(
        nb_hidden=10,
        activation="relu",
        dropout=.5,
        learning_rate=.01,
        momentum=.9,
        nesterov=False,
        decay=1e-4,
        batch_size=5,
        nb_epoch=2,
        init="glorot_uniform",
        nb_layers=1,
        batchnorm=False,
    )

    # Featurize the example CSV and split it by scaffold.
    input_file = os.path.join(self.current_dir, "multitask_example.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=CircularFingerprint(size=1024),
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)
    train_dataset, test_dataset = ScaffoldSplitter().train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = []
    # The input shape is only known once the training split exists.
    model_params["data_shape"] = train_dataset.get_data_shape()
    classification_metrics = [
        Metric(score_fn)
        for score_fn in (metrics.roc_auc_score,
                         metrics.matthews_corrcoef,
                         metrics.recall_score,
                         metrics.accuracy_score)
    ]

    # Fit the model and persist it.
    model = MultiTaskDNN(tasks, task_types, model_params, self.model_dir)
    model.fit(train_dataset)
    model.save()

    # Evaluate on the train split first, then on the test split.
    for split in (train_dataset, test_dataset):
        evaluator = Evaluator(model, split, transformers, verbosity=True)
        evaluator.compute_model_performance(classification_metrics)