def test_tf_reload(self):
    """Test that a trained TensorFlow model can be saved and reloaded.

    Trains a classifier on a tiny random dataset, saves it, rebuilds an
    identically-configured model, restores the weights, and checks that the
    reloaded model still scores well on the training data.
    """
    n_samples = 10
    n_features = 3
    n_tasks = 1
    n_classes = 2

    # Build a small random classification dataset.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.accuracy_score,
                                   verbosity=verbosity)
    tf_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        verbosity=verbosity)
    model = TensorflowModel(tf_model, self.model_dir)

    # Train and persist the model to self.model_dir.
    model.fit(dataset)
    model.save()

    # Rebuild an identically-configured model and restore the saved weights.
    reloaded_tf_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        verbosity=verbosity)
    reloaded_model = TensorflowModel(reloaded_tf_model, self.model_dir)
    reloaded_model.reload()

    # The reloaded model must still perform well on its own training data.
    transformers = []
    evaluator = Evaluator(reloaded_model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .6
def test_tf_multitask_classification_overfit(self):
    """Test tf multitask overfits tiny data."""
    n_tasks = 10
    n_samples = 10
    n_features = 3
    n_classes = 2

    # Tiny dataset with all-zero labels -- trivial to overfit.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.zeros((n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.accuracy_score,
                                   verbosity=verbosity,
                                   task_averager=np.mean)
    tf_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.0003, weight_init_stddevs=[.1],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tf_model, self.model_dir)

    # Train and persist.
    model.fit(dataset)
    model.save()

    # Mean per-task accuracy on the training data should be near perfect.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .9
def test_tf_skewed_missing_classification_overfit(self):
    """TF, skewed data, few actives

    Test tensorflow models overfit 0/1 datasets with missing data and few
    actives. This is intended to be as close to singletask MUV datasets as
    possible.
    """
    n_samples = 5120
    n_features = 6
    n_tasks = 1
    n_classes = 2

    # Heavily imbalanced labels: roughly 0.2% positives.
    np.random.seed(123)
    p = .002
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))

    # Upweight the rare actives so each positive example counts as much as
    # the whole population of labeled examples divided by the active count.
    y_flat, w_flat = np.squeeze(y), np.squeeze(w)
    y_nonzero = y_flat[w_flat != 0]
    num_nonzero = np.count_nonzero(y_nonzero)
    weight_nonzero = len(y_nonzero)/num_nonzero
    w_flat[y_flat != 0] = weight_nonzero
    w = np.reshape(w_flat, (n_samples, n_tasks))

    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score,
                                   verbosity=verbosity)
    tf_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[1.],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tf_model, self.model_dir)

    # Train long enough to overfit, then persist.
    model.fit(dataset, nb_epoch=50)
    model.save()

    # Training-set ROC-AUC should be high after overfitting.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .8
def test_singletask_tf_mlp_ECFP_classification_API(self):
    """Straightforward test of Tensorflow singletask deepchem classification API."""
    n_features = 1024
    featurizer = CircularFingerprint(size=n_features)

    # Featurize the example CSV into a dataset.
    tasks = ["outcome"]
    input_file = os.path.join(self.current_dir, "example_classification.csv")
    loader = DataLoader(tasks=tasks,
                        smiles_field=self.smiles_field,
                        featurizer=featurizer,
                        verbosity="low")
    dataset = loader.featurize(input_file, self.data_dir)

    # Scaffold split into train/test.
    splitter = ScaffoldSplitter()
    train_dataset, test_dataset = splitter.train_test_split(
        dataset, self.train_dir, self.test_dir)

    transformers = [
        NormalizationTransformer(transform_y=True, dataset=train_dataset)]
    # NOTE(review): the return value of transform() is deliberately ignored
    # here -- presumably it mutates the dataset in place; confirm against the
    # Transformer API in use.
    for split in [train_dataset, test_dataset]:
        for transformer in transformers:
            transformer.transform(split)

    classification_metrics = [Metric(metrics.roc_auc_score),
                              Metric(metrics.matthews_corrcoef),
                              Metric(metrics.recall_score),
                              Metric(metrics.accuracy_score)]

    tf_model = TensorflowMultiTaskClassifier(
        len(tasks), n_features, self.model_dir)
    model = TensorflowModel(tf_model, self.model_dir)

    # Fit trained model
    model.fit(train_dataset)
    model.save()

    # Eval model on train (scores discarded -- this is an API smoke test).
    evaluator = Evaluator(model, train_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, transformers, verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def test_tf_skewed_classification_overfit(self):
    """Test tensorflow models can overfit 0/1 datasets with few actives.

    Builds a tiny binary-classification dataset with ~5% positives, trains
    for 100 epochs, and asserts the training-set ROC-AUC exceeds 0.75.
    """
    # Removed a stale commented-out duplicate of n_samples and the unused
    # n_classes local (labels are binary via np.random.binomial below).
    n_samples = 100
    n_features = 3
    n_tasks = 1

    # Generate dummy dataset with ~5% positive labels.
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score,
                                   verbosity=verbosity)
    tensorflow_model = TensorflowMultiTaskClassifier(
        n_tasks, n_features, self.model_dir, dropouts=[0.],
        learning_rate=0.003, weight_init_stddevs=[.1],
        batch_size=n_samples, verbosity=verbosity)
    model = TensorflowModel(tensorflow_model, self.model_dir)

    # Fit trained model
    model.fit(dataset, nb_epoch=100)
    model.save()

    # Eval model on train; ROC-AUC should be high after overfitting.
    transformers = []
    evaluator = Evaluator(model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .75
# Script fragment: train a multitask classifier on the PCBA dataset and
# report training-set ROC-AUC.
from deepchem.utils.evaluate import Evaluator
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier

# Fix the random seed so the run is reproducible.
# NOTE(review): `np`, `load_pcba`, `Metric` and `metrics` are not defined in
# this fragment -- presumably imported earlier in the file; confirm.
np.random.seed(123)

# Load PCBA and unpack the train/valid/test splits.
pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

# Mean ROC-AUC across all PCBA tasks.
metric = Metric(metrics.roc_auc_score, np.mean, mode="classification")

# One task column per feature vector entry of the training set.
n_features = train_dataset.get_data_shape()[0]
# NOTE(review): model_dir is None -- presumably the model falls back to a
# temporary directory; confirm against TensorflowMultiTaskClassifier.
model_dir = None
model = TensorflowMultiTaskClassifier(len(pcba_tasks), n_features, model_dir,
                                      dropouts=[.25], learning_rate=0.001,
                                      weight_init_stddevs=[.1],
                                      batch_size=64, verbosity="high")

# Fit trained model
model.fit(train_dataset)
model.save()

# Evaluate on the training split.
# NOTE(review): `verbosity` is not defined in this fragment -- either it is
# set earlier in the file or this line raises NameError; confirm.
train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([metric])
print("Train scores")
# Script fragment: train a multitask classifier on Tox21 and report
# training-set ROC-AUC.
# NOTE(review): `shutil`, `os`, `base_dir`, `data_dir`, `model_dir`,
# `verbosity`, `load_tox21`, `Metric`, `metrics` and the model classes are
# not defined in this fragment -- presumably set up earlier in the file.

# Start from a clean scratch directory.
shutil.rmtree(base_dir)
os.makedirs(base_dir)

# Load Tox21 dataset
n_features = 1024
tox21_tasks, tox21_datasets, transformers = load_tox21(data_dir,
                                                       reload=False)

# Do train/valid split.
train_dataset, valid_dataset = tox21_datasets

# Fit models
# Mean ROC-AUC across the Tox21 tasks.
classification_metric = Metric(metrics.roc_auc_score, np.mean,
                               verbosity=verbosity,
                               mode="classification")
tensorflow_model = TensorflowMultiTaskClassifier(
    len(tox21_tasks), n_features, model_dir,
    dropouts=[.25], learning_rate=0.0003, weight_init_stddevs=[1.],
    batch_size=32, verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset)
model.save()

# Evaluate on the training split and print the scores.
train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance(
    [classification_metric])
print("Train scores")
print(train_scores)
def model_builder(model_params, model_dir):
    """Build a TensorflowModel wrapping a fresh multitask classifier.

    `tasks` and `n_features` are captured from the enclosing scope;
    `model_params` are forwarded as keyword arguments to the classifier.
    """
    classifier = TensorflowMultiTaskClassifier(
        len(tasks), n_features, model_dir, **model_params)
    return TensorflowModel(classifier, model_dir)
# Script fragment: train a multitask classifier on PCBA and report
# train/valid ROC-AUC.
from deepchem.metrics import to_one_hot
from deepchem.utils.evaluate import Evaluator
from deepchem.models.tensorflow_models.fcnet import TensorflowMultiTaskClassifier
from deepchem.models.tensorflow_models import TensorflowModel

# Fix the random seed so the run is reproducible.
# NOTE(review): `np`, `load_pcba`, `Metric`, `metrics`, `n_features`,
# `model_dir` and `verbosity` are not defined in this fragment -- they must
# come from earlier in the file; confirm.
np.random.seed(123)

# Load PCBA and unpack the train/valid splits.
pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset) = pcba_datasets

# Mean ROC-AUC across all PCBA tasks.
metric = Metric(metrics.roc_auc_score, np.mean, mode="classification")

model = TensorflowMultiTaskClassifier(
    len(pcba_tasks), n_features, model_dir,
    dropouts=[.25], learning_rate=0.001, weight_init_stddevs=[.1],
    batch_size=64, verbosity="high")

# Fit trained model
model.fit(train_dataset)
model.save()

# Evaluate on the training split and print the scores.
train_evaluator = Evaluator(model, train_dataset, transformers,
                            verbosity=verbosity)
train_scores = train_evaluator.compute_model_performance([metric])
print("Train scores")
print(train_scores)

# Evaluate on the validation split.
valid_evaluator = Evaluator(model, valid_dataset, transformers,
                            verbosity=verbosity)
valid_scores = valid_evaluator.compute_model_performance([metric])