def split(self, dataset, split_dirs, frac_split):
  """Method that does the bulk of the dataset splitting."""
  assert len(split_dirs) == 2
  # Handle edge case where frac_split is 1
  if frac_split == 1:
    dataset_1 = NumpyDataset(dataset.X, dataset.y, dataset.w, dataset.ids)
    dataset_2 = None
    return dataset_1, dataset_2
  X, y, w, ids = randomize_arrays(
      (dataset.X, dataset.y, dataset.w, dataset.ids))
  split_indices = self.get_task_split_indices(y, w, frac_split)

  # Create weight matrices for the two halves.
  w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
  for task, split_index in enumerate(split_indices):
    # Copy weights up to the split index into the first half and the
    # remainder into the second half.
    w_1[:split_index, task] = w[:split_index, task]
    w_2[split_index:, task] = w[split_index:, task]

  # Drop any rows of w_1 or w_2 that are all zeros.
  rows_1 = w_1.any(axis=1)
  X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
  dataset_1 = NumpyDataset(X_1, y_1, w_1, ids_1)

  rows_2 = w_2.any(axis=1)
  X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
  dataset_2 = NumpyDataset(X_2, y_2, w_2, ids_2)
  return dataset_1, dataset_2
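# A minimal, self-contained sketch (not part of the class above) of the
# per-task weight-splitting idea that split() relies on: each task keeps its
# leading rows in the first half and its trailing rows in the second, and
# rows whose weights are all zero in a half are dropped from that half. The
# toy arrays and the split index of 8 (i.e. frac_split=.8) are illustrative.
import numpy as np

w = np.ones((10, 3))                       # 10 samples, 3 tasks, all weighted
split_indices = [8, 8, 8]                  # one split index per task
w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
for task, split_index in enumerate(split_indices):
  w_1[:split_index, task] = w[:split_index, task]
  w_2[split_index:, task] = w[split_index:, task]
rows_1, rows_2 = w_1.any(axis=1), w_2.any(axis=1)
assert rows_1.sum() == 8 and rows_2.sum() == 2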
def test_sklearn_classification_overfit(self):
  """Test that sklearn models can overfit simple classification datasets."""
  n_samples = 10
  n_features = 3
  n_tasks = 1

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  sklearn_model = RandomForestClassifier()
  model = SklearnModel(sklearn_model, self.model_dir)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def test_keras_multitask_regression_overfit(self):
  """Test keras multitask overfits tiny data."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    n_tasks = 10
    n_samples = 10
    n_features = 3

    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    regression_metric = Metric(metrics.r2_score, verbosity=verbosity,
                               task_averager=np.mean, mode="regression")
    keras_model = MultiTaskDNN(n_tasks, n_features, "regression",
                               dropout=0., learning_rate=.1, decay=1e-4)
    model = KerasModel(keras_model, self.model_dir, verbosity=verbosity)

    # Fit trained model
    model.fit(dataset, nb_epoch=100)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([regression_metric])
    assert scores[regression_metric.name] > .75
def test_tf_classification_overfit(self):
  """Test that tensorflow models can overfit simple classification datasets."""
  n_samples = 10
  n_features = 3
  n_tasks = 1
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.0003, weight_init_stddevs=[.1],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=100)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def test_tf_multitask_regression_overfit(self):
  """Test tf multitask overfits tiny data."""
  n_tasks = 10
  n_samples = 10
  n_features = 3
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity,
                             task_averager=np.mean, mode="regression")
  tensorflow_model = TensorflowMultiTaskRegressor(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.0003, weight_init_stddevs=[.1],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=50)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] < .1
def test_tf_regression_overfit(self):
  """Test that TensorFlow models can overfit simple regression datasets."""
  n_samples = 10
  n_features = 3
  n_tasks = 1

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  regression_metric = Metric(metrics.mean_squared_error, verbosity=verbosity)
  # TODO(rbharath): This breaks with optimizer="momentum". Why?
  tensorflow_model = TensorflowMultiTaskRegressor(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.003, weight_init_stddevs=[np.sqrt(6)/np.sqrt(1000)],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=100)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] < .1
def test_itersamples_numpy(self):
  """Test that iterating over samples in a NumpyDataset works."""
  num_datapoints = 100
  num_features = 10
  num_tasks = 10

  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = NumpyDataset(X, y, w, ids)

  for i, (sx, sy, sw, sid) in enumerate(dataset.itersamples()):
    np.testing.assert_array_equal(sx, X[i])
    np.testing.assert_array_equal(sy, y[i])
    np.testing.assert_array_equal(sw, w[i])
    np.testing.assert_array_equal(sid, ids[i])
def test_get_shape(self):
  """Test that get_shape works."""
  num_datapoints = 100
  num_features = 10
  num_tasks = 10

  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = NumpyDataset(X, y, w, ids)

  X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
  assert X_shape == X.shape
  assert y_shape == y.shape
  assert w_shape == w.shape
  assert ids_shape == ids.shape
def test_keras_reload(self):
  """Test that trained keras models can be reloaded correctly."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    tasks = ["task0"]
    task_types = {task: "classification" for task in tasks}
    n_samples = 10
    n_features = 3
    n_tasks = len(tasks)

    # Generate dummy dataset
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(2, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    keras_model = MultiTaskDNN(n_tasks, n_features, "classification",
                               dropout=0.)
    model = KerasModel(keras_model, self.model_dir)

    # Fit trained model
    model.fit(dataset)
    model.save()

    # Load trained model
    reloaded_keras_model = MultiTaskDNN(n_tasks, n_features,
                                        "classification", dropout=0.)
    reloaded_model = KerasModel(reloaded_keras_model, self.model_dir)
    reloaded_model.reload(custom_objects={"MultiTaskDNN": MultiTaskDNN})

    # Eval model on train
    transformers = []
    evaluator = Evaluator(reloaded_model, dataset, transformers,
                          verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .6
def predict_proba_on_batch(self, support, test_batch):
  """Make predictions on batch of data."""
  n_samples = len(test_batch)
  padded_test_batch = NumpyDataset(*pad_batch(
      self.test_batch_size, test_batch.X, test_batch.y, test_batch.w,
      test_batch.ids))
  feed_dict = self.construct_feed_dict(padded_test_batch, support)
  # Get scores
  pred, scores = self.sess.run([self.pred_op, self.scores_op],
                               feed_dict=feed_dict)
  y_pred_batch = to_one_hot(np.round(pred))
  # Discard predictions for padding elements so the output matches the
  # original (unpadded) batch size.
  return y_pred_batch[:n_samples]
def get_task_dataset(dataset, task):
  """Selects out entries for a particular task."""
  X, y, w, ids = dataset.X, dataset.y, dataset.w, dataset.ids
  # Get task-specific entries
  w_task = w[:, task]
  X_task = X[w_task != 0]
  y_task = y[w_task != 0, task]
  ids_task = ids[w_task != 0]
  # Now just get weights for this task
  w_task = w[w_task != 0, task]
  return NumpyDataset(X_task, y_task, w_task, ids_task)
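# Hedged usage sketch for get_task_dataset (the toy shapes below are
# illustrative, not part of the API): entries lacking a task-0 measurement
# are dropped, and the returned dataset carries only task 0's labels/weights.
import numpy as np

X = np.random.rand(4, 3)
y = np.random.randint(2, size=(4, 2))
w = np.ones((4, 2))
w[1, 0] = 0                                # sample 1 unmeasured for task 0
ids = np.arange(4)
dataset = NumpyDataset(X, y, w, ids)
task0 = get_task_dataset(dataset, task=0)
assert len(task0) == 3                     # the unmeasured sample is dropped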
def test_tf_reload(self):
  """Test that trained tensorflow models can be reloaded correctly."""
  n_samples = 10
  n_features = 3
  n_tasks = 1
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.accuracy_score, verbosity=verbosity)
  tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Load trained model
  reloaded_tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      verbosity=verbosity)
  reloaded_model = TensorflowModel(reloaded_tensorflow_model, self.model_dir)
  reloaded_model.reload()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(reloaded_model, dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .6
def predict_proba(self, support, test):
  """Makes predictions on test given support.

  TODO(rbharath): Does not currently support any transforms.
  TODO(rbharath): Only for 1 task at a time currently. Is there a better way?
  """
  y_preds = []
  for (X_batch, y_batch, w_batch, ids_batch) in test.iterbatches(
      self.test_batch_size, deterministic=True):
    test_batch = NumpyDataset(X_batch, y_batch, w_batch, ids_batch)
    y_pred_batch = self.predict_proba_on_batch(support, test_batch)
    y_preds.append(y_pred_batch)
  y_pred = np.concatenate(y_preds)
  return y_pred
def test_tf_skewed_missing_classification_overfit(self):
  """TF, skewed data, few actives.

  Test tensorflow models overfit 0/1 datasets with missing data and few
  actives. This is intended to be as close to singletask MUV datasets as
  possible.
  """
  n_samples = 5120
  n_features = 6
  n_tasks = 1
  n_classes = 2

  # Generate dummy dataset
  np.random.seed(123)
  p = .002
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.binomial(1, p, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))

  # Upweight the rare actives so that total positive and negative weight
  # roughly balance.
  y_flat, w_flat = np.squeeze(y), np.squeeze(w)
  y_nonzero = y_flat[w_flat != 0]
  num_nonzero = np.count_nonzero(y_nonzero)
  weight_nonzero = len(y_nonzero)/num_nonzero
  w_flat[y_flat != 0] = weight_nonzero
  w = np.reshape(w_flat, (n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  tensorflow_model = TensorflowMultiTaskClassifier(
      n_tasks, n_features, self.model_dir, dropouts=[0.],
      learning_rate=0.003, weight_init_stddevs=[1.],
      batch_size=n_samples, verbosity=verbosity)
  model = TensorflowModel(tensorflow_model, self.model_dir)

  # Fit trained model
  model.fit(dataset, nb_epoch=50)
  model.save()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .8
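# Worked check (standalone, illustrative numbers) of the reweighting step in
# the test above: upweighting each active by len(y)/num_actives makes the
# total positive weight equal the dataset size, roughly balancing classes.
import numpy as np

np.random.seed(123)
y = np.random.binomial(1, .002, size=(5120, 1))
num_actives = np.count_nonzero(y)          # around 10 actives expected
weight = y.size / float(num_actives)       # mirrors len(y_nonzero)/num_nonzero
assert np.isclose(num_actives * weight, y.size)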
def get_task_support(dataset, n_pos, n_neg, task, replace=True):
  """Generates a support set purely for specified task.

  Parameters
  ----------
  dataset: deepchem.datasets.Dataset
    Dataset from which supports are sampled.
  n_pos: int
    Number of positive samples in support.
  n_neg: int
    Number of negative samples in support.
  task: int
    Index of current task.
  replace: bool, optional
    Whether or not to use replacement when sampling supports.

  Returns
  -------
  NumpyDataset
    Support set of n_pos positive and n_neg negative examples for the task.
  """
  y_task = dataset.y[:, task]

  # Split data into pos and neg lists.
  pos_mols = np.where(y_task == 1)[0]
  neg_mols = np.where(y_task == 0)[0]

  # Get randomly sampled pos/neg indices (with replacement)
  pos_inds = pos_mols[np.random.choice(len(pos_mols), (n_pos),
                                       replace=replace)]
  neg_inds = neg_mols[np.random.choice(len(neg_mols), (n_neg),
                                       replace=replace)]

  # Handle one-d vs. non one-d feature matrices
  one_dimensional_features = (len(dataset.X.shape) == 1)
  if not one_dimensional_features:
    X_trial = np.vstack([dataset.X[pos_inds], dataset.X[neg_inds]])
  else:
    X_trial = np.concatenate([dataset.X[pos_inds], dataset.X[neg_inds]])
  y_trial = np.concatenate(
      [dataset.y[pos_inds, task], dataset.y[neg_inds, task]])
  w_trial = np.concatenate(
      [dataset.w[pos_inds, task], dataset.w[neg_inds, task]])
  ids_trial = np.concatenate([dataset.ids[pos_inds], dataset.ids[neg_inds]])
  return NumpyDataset(X_trial, y_trial, w_trial, ids_trial)
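# Hedged usage sketch for get_task_support (toy shapes and counts are
# illustrative): sample a 3+/3- support for task 0 from a small dataset.
import numpy as np

np.random.seed(0)
X = np.random.rand(20, 4)
y = np.random.randint(2, size=(20, 2))
w = np.ones((20, 2))
ids = np.arange(20)
dataset = NumpyDataset(X, y, w, ids)
support = get_task_support(dataset, n_pos=3, n_neg=3, task=0, replace=True)
assert len(support) == 6
assert int(support.y.sum()) == 3           # exactly n_pos positives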
def test_sklearn_reload(self):
  """Test that trained model can be reloaded correctly."""
  tasks = ["task0"]
  task_types = {task: "classification" for task in tasks}
  n_samples = 10
  n_features = 3
  n_tasks = len(tasks)

  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = NumpyDataset(X, y, w, ids)

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  sklearn_model = RandomForestClassifier()
  model = SklearnModel(sklearn_model, self.model_dir)

  # Fit trained model
  model.fit(dataset)
  model.save()

  # Load trained model
  reloaded_model = SklearnModel(None, self.model_dir)
  reloaded_model.reload()

  # Eval model on train
  transformers = []
  evaluator = Evaluator(reloaded_model, dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .9
def get_task_test(dataset, batch_size, task, replace=True):
  """Gets test set from specified task.

  Samples random subset of size batch_size from specified task of dataset.
  Ensures that sampled points have measurements for this task.
  """
  w_task = dataset.w[:, task]
  X_task = dataset.X[w_task != 0]
  y_task = dataset.y[w_task != 0]
  ids_task = dataset.ids[w_task != 0]
  # Now just get weights for this task
  w_task = dataset.w[w_task != 0]

  inds = np.random.choice(np.arange(len(X_task)), batch_size, replace=replace)
  X_batch = X_task[inds]
  y_batch = np.squeeze(y_task[inds, task])
  w_batch = np.squeeze(w_task[inds, task])
  ids_batch = ids_task[inds]
  return NumpyDataset(X_batch, y_batch, w_batch, ids_batch)
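# Hedged usage sketch for get_task_test (toy shapes are illustrative): draw a
# batch of 8 compounds, all of which have a task-0 measurement.
import numpy as np

np.random.seed(0)
X = np.random.rand(20, 4)
y = np.random.randint(2, size=(20, 2))
w = np.random.randint(2, size=(20, 2))     # some task-0 measurements missing
ids = np.arange(20)
dataset = NumpyDataset(X, y, w, ids)
test_batch = get_task_test(dataset, batch_size=8, task=0, replace=True)
assert len(test_batch) == 8
assert np.all(test_batch.w != 0)           # every sampled point is measured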
def get_task_dataset_minus_support(dataset, support, task):
  """Gets data for specified task, minus support points.

  Useful for evaluating model performance once trained (so that test
  compounds are guaranteed distinct from the support).

  Parameters
  ----------
  dataset: deepchem.datasets.Dataset
    Source dataset.
  support: deepchem.datasets.Dataset
    The support dataset.
  task: int
    Task number of task to select.
  """
  support_ids = set(support.ids)
  non_support_inds = [
      ind for ind in range(len(dataset))
      if dataset.ids[ind] not in support_ids
  ]

  # Remove support indices
  X = dataset.X[non_support_inds]
  y = dataset.y[non_support_inds]
  w = dataset.w[non_support_inds]
  ids = dataset.ids[non_support_inds]

  # Get task-specific entries
  w_task = w[:, task]
  X_task = X[w_task != 0]
  y_task = y[w_task != 0, task]
  ids_task = ids[w_task != 0]
  # Now just get weights for this task
  w_task = w[w_task != 0, task]
  return NumpyDataset(X_task, y_task, w_task, ids_task)
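# Hedged sketch combining the helpers above (reusing the toy dataset from the
# get_task_test sketch; all names illustrative): hold out a small support for
# task 0, then build the evaluation set from the remaining compounds. The two
# sets are disjoint by construction, since supports are removed by id.
support = get_task_support(dataset, n_pos=1, n_neg=1, task=0, replace=False)
test = get_task_dataset_minus_support(dataset, support, task=0)
assert set(test.ids).isdisjoint(set(support.ids))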
def test_keras_skewed_classification_overfit(self):
  """Test keras models can overfit 0/1 datasets with few actives."""
  g = tf.Graph()
  sess = tf.Session(graph=g)
  K.set_session(sess)
  with g.as_default():
    n_samples = 100
    n_features = 3
    n_tasks = 1

    # Generate dummy dataset
    np.random.seed(123)
    p = .05
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.binomial(1, p, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    dataset = NumpyDataset(X, y, w, ids)

    verbosity = "high"
    classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
    keras_model = MultiTaskDNN(n_tasks, n_features, "classification",
                               dropout=0., learning_rate=.15, decay=1e-4)
    model = KerasModel(keras_model, self.model_dir)

    # Fit trained model
    model.fit(dataset, batch_size=n_samples, nb_epoch=200)
    model.save()

    # Eval model on train
    transformers = []
    evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
    scores = evaluator.compute_model_performance([classification_metric])
    assert scores[classification_metric.name] > .9