def test_singletask_to_multitask_sklearn_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask_to_multitask."""
  tasks = ["task%d" % i for i in range(17)]
  input_file = "multitask_example.csv"
  n_features = 10
  n_tasks = len(tasks)
  # Build a random binary training set.
  n_train = 100
  X_train = np.random.rand(n_train, n_features)
  y_train = np.random.randint(2, size=(n_train, n_tasks))
  w_train = np.ones_like(y_train)
  ids_train = ["C"] * n_train
  train_dataset = DiskDataset.from_numpy(self.train_dir, X_train, y_train,
                                         w_train, ids_train, tasks)
  # Build a random binary validation set.
  n_valid = 10
  X_valid = np.random.rand(n_valid, n_features)
  y_valid = np.random.randint(2, size=(n_valid, n_tasks))
  w_valid = np.ones_like(y_valid)
  ids_valid = ["C"] * n_valid
  valid_dataset = DiskDataset.from_numpy(self.valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, tasks)
  transformers = []
  classification_metric = Metric(metrics.matthews_corrcoef, np.mean,
                                 mode="classification")
  params_dict = {"n_estimators": [1, 10]}

  def multitask_model_builder(model_params, model_dir):

    def model_builder(model_dir):
      sklearn_model = RandomForestClassifier(**model_params)
      return SklearnModel(sklearn_model, model_dir)

    return SingletaskToMultitask(tasks, model_builder, model_dir)

  # Sweep the hyperparameter grid and keep the best model.
  optimizer = HyperparamOpt(multitask_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers,
      classification_metric, logdir=None)
def test_sklearn_multitask_classification(self):
  """Test that sklearn models can learn on simple multitask classification."""
  np.random.seed(123)
  n_tasks = 4
  tasks = range(n_tasks)
  digits = sklearn.datasets.load_digits(n_class=2)
  X, y = digits.data, digits.target
  # Replicate the single label column across all tasks.
  y = np.hstack([np.reshape(y, (len(y), 1))] * n_tasks)
  split = int(.7 * len(X))
  train_dataset = DiskDataset.from_numpy(self.train_dir, X[:split], y[:split])
  test_dataset = DiskDataset.from_numpy(self.test_dir, X[split:], y[split:])
  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)

  def model_builder(model_dir):
    return SklearnModel(LogisticRegression(), model_dir)

  model = SingletaskToMultitask(tasks, model_builder, self.model_dir)
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train (scores unused; mirrors original behavior).
  transformers = []
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [classification_metric])
  # Eval model on test: every task should beat random.
  transformers = []
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  for score in scores[classification_metric.name]:
    assert score > .5
def test_sklearn_transformed_regression(self):
  """Test that sklearn models can learn on simple transformed regression datasets."""
  np.random.seed(123)
  diabetes = sklearn.datasets.load_diabetes()
  X, y = diabetes.data, diabetes.target
  split = int(.7 * len(X))
  train_dataset = DiskDataset.from_numpy(self.train_dir, X[:split], y[:split])
  test_dataset = DiskDataset.from_numpy(self.test_dir, X[split:], y[split:])

  # Normalize and clip features, normalize targets; statistics come from
  # the training split only.
  transformers = [
      NormalizationTransformer(transform_X=True, dataset=train_dataset),
      ClippingTransformer(transform_X=True, dataset=train_dataset),
      NormalizationTransformer(transform_y=True, dataset=train_dataset)
  ]
  for data in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(data)

  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity)
  model = SklearnModel(LinearRegression(), self.model_dir)
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [regression_metric])
  assert train_scores[regression_metric.name] > .5
  # Eval model on test
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .5
def test_singletask_to_multitask_classification(self):
  """Check SingletaskToMultitask fits and evaluates on random multitask data."""
  n_features = 10
  n_tasks = 17
  tasks = range(n_tasks)

  def random_split(data_dir, n_rows):
    # Random features, binary labels, unit weights, constant ids.
    X = np.random.rand(n_rows, n_features)
    y = np.random.randint(2, size=(n_rows, n_tasks))
    w = np.ones_like(y)
    return DiskDataset.from_numpy(data_dir, X, y, w, ["C"] * n_rows)

  train_dataset = random_split(self.train_dir, 100)
  test_dataset = random_split(self.test_dir, 10)

  transformers = []
  classification_metrics = [Metric(metrics.roc_auc_score)]

  def model_builder(model_dir):
    return SklearnModel(LogisticRegression(), model_dir)

  multitask_model = SingletaskToMultitask(tasks, model_builder,
                                          self.model_dir)
  # Fit trained model
  multitask_model.fit(train_dataset)
  multitask_model.save()
  # Evaluate on both splits (results intentionally discarded; this is a
  # smoke test that evaluation runs end to end).
  for split in [train_dataset, test_dataset]:
    evaluator = Evaluator(multitask_model, split, transformers,
                          verbosity=True)
    _ = evaluator.compute_model_performance(classification_metrics)
def test_shuffle_shards(self):
  """Test that shuffle_shards works."""
  n_samples, n_tasks, n_features = 100, 10, 10
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.random.randint(2, size=(n_samples, n_tasks))
  ids = np.arange(n_samples)
  dataset = DiskDataset.from_numpy(self.data_dir, X, y, w, ids)
  dataset.reshard(shard_size=10)
  dataset.shuffle_shards()

  X_s, y_s, w_s, ids_s = dataset.X, dataset.y, dataset.w, dataset.ids
  # Shuffling preserves every array's shape.
  for shuffled, original in [(X_s, X), (y_s, y), (w_s, w), (ids_s, ids)]:
    assert shuffled.shape == original.shape
  # The ids should now store the performed permutation. Check that the
  # original dataset is recoverable.
  for i in range(n_samples):
    perm = ids_s[i]
    np.testing.assert_array_equal(X_s[i], X[perm])
    np.testing.assert_array_equal(y_s[i], y[perm])
    np.testing.assert_array_equal(w_s[i], w[perm])
    np.testing.assert_array_equal(ids_s[i], ids[perm])
def test_singletask_stratified_split(self):
  """Test RandomStratifiedSplitter on a singletask split."""
  np.random.seed(2314)
  # Singletask case: half the rows are positive.
  n_samples, n_positives = 20, 10
  n_features, n_tasks = 10, 1
  X = np.random.rand(n_samples, n_features)
  y = np.zeros((n_samples, n_tasks))
  y[:n_positives] = 1
  w = np.ones((n_samples, n_tasks))
  ids = np.arange(n_samples)
  data_dir = tempfile.mkdtemp()
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)

  stratified_splitter = RandomStratifiedSplitter()
  split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]
  dataset_1, dataset_2 = stratified_splitter.split(dataset, split_dirs,
                                                   frac_split=.5)

  # Should have split cleanly in half (picked random seed to ensure this)
  assert len(dataset_1) == 10
  assert len(dataset_2) == 10
  # Positives must be divided evenly between the halves.
  for half in (dataset_1, dataset_2):
    assert np.count_nonzero(half.y) == n_positives / 2
def test_sklearn_multitask_regression_overfit(self):
  """Test SKLearn singletask-to-multitask overfits tiny regression data."""
  n_tasks = 2
  tasks = ["task%d" % task for task in range(n_tasks)]
  n_samples, n_features = 10, 3
  # Generate dummy dataset
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.rand(n_samples, n_tasks)
  w = np.ones((n_samples, n_tasks))
  dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids)

  verbosity = "high"
  regression_metric = Metric(metrics.r2_score, verbosity=verbosity,
                             task_averager=np.mean)

  def model_builder(model_dir):
    return SklearnModel(RandomForestRegressor(), model_dir)

  model = SingletaskToMultitask(tasks, model_builder, self.model_dir)
  # Fit trained model
  model.fit(dataset)
  model.save()
  # A random forest should essentially memorize ten points.
  transformers = []
  evaluator = Evaluator(model, dataset, transformers, verbosity=verbosity)
  scores = evaluator.compute_model_performance([regression_metric])
  assert scores[regression_metric.name] > .7
def transform(self, dataset, bins):
  """Performs CDF transform on data.

  Overwrites ``dataset`` on disk with the transformed arrays.

  Parameters
  ----------
  dataset: DiskDataset
    Dataset to transform in place.
  bins: int
    Unused; kept for backward compatibility with existing callers. The
    transform reads ``self.bins`` instead.
  """
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  # Default to pass-through so from_numpy below always receives defined
  # values. (Previously X_t/y_t were unbound — a NameError — whenever
  # transform_X was False.)
  X_t, y_t, w_t, ids_t = X, y, w, ids
  if self.transform_X:
    X_t = get_cdf_values(X, self.bins)
  if self.transform_y:
    # y transform is not implemented yet; warn and leave y untouched.
    print("y will not be transformed by CDFTransformer, for now.")
  # TODO (rbharath): Find a more elegant solution to saving the data?
  shutil.rmtree(dataset.data_dir)
  os.makedirs(dataset.data_dir)
  DiskDataset.from_numpy(dataset.data_dir, X_t, y_t, w_t, ids_t)
def test_sklearn_classification(self):
  """Test that sklearn models can learn on simple classification datasets."""
  np.random.seed(123)
  digits = sklearn.datasets.load_digits(n_class=2)
  X, y = digits.data, digits.target
  split = int(.7 * len(X))
  train_dataset = DiskDataset.from_numpy(self.train_dir, X[:split], y[:split])
  test_dataset = DiskDataset.from_numpy(self.test_dir, X[split:], y[split:])

  verbosity = "high"
  classification_metric = Metric(metrics.roc_auc_score, verbosity=verbosity)
  model = SklearnModel(LogisticRegression(), self.model_dir)
  # Fit trained model
  model.fit(train_dataset)
  model.save()
  # Eval model on train (scores unused; mirrors original behavior).
  transformers = []
  train_evaluator = Evaluator(model, train_dataset, transformers,
                              verbosity=verbosity)
  train_scores = train_evaluator.compute_model_performance(
      [classification_metric])
  # Eval model on test
  transformers = []
  evaluator = Evaluator(model, test_dataset, transformers,
                        verbosity=verbosity)
  scores = evaluator.compute_model_performance([classification_metric])
  assert scores[classification_metric.name] > .5
def load_core_pdbbind_coordinates(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split.

  Parameters
  ----------
  pdbbind_dir: str
    Directory containing the PDBBind index file and complex subdirectories.
  base_dir: str
    Directory to hold the featurized dataset.
  reload: bool
    NOTE(review): overridden to True below, so the caller's value is ignored.

  Returns
  -------
  (tasks, dataset, transformers) tuple.
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False
  neighbor_cutoff = 4
  max_num_neighbors = 10

  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  featurizer = NeighborListComplexAtomicCoordinates(max_num_neighbors,
                                                    neighbor_cutoff)

  # Featurize Dataset
  features = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_coordinate_features(
        featurizer, pdb_subdir, pdb_code)
    features.append(computed_feature)
  # Fixed typo: was ``dtype - object`` (a NameError at runtime); the intent
  # is the keyword argument dtype=object for ragged per-complex features.
  X = np.array(features, dtype=object)
  w = np.ones_like(y)
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  return tasks, dataset, transformers
def transform(self, dataset):
  """Performs power transform on data.

  Concatenates ``X**p`` for each power ``p`` in ``self.powers`` along the
  feature axis, then overwrites ``dataset`` on disk with the result.

  Parameters
  ----------
  dataset: DiskDataset
    Dataset to transform in place.
  """
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  # Default to pass-through so from_numpy below always receives defined
  # values. (Previously X_t/y_t were unbound — a NameError — whenever
  # transform_X was False.)
  X_t, y_t, w_t, ids_t = X, y, w, ids
  n_powers = len(self.powers)
  if self.transform_X:
    X_t = np.power(X, self.powers[0])
    for i in range(1, n_powers):
      X_t = np.hstack((X_t, np.power(X, self.powers[i])))
  if self.transform_y:
    # y transform is not implemented yet; warn and leave y untouched.
    print("y will not be transformed by PowerTransformer, for now.")
  # TODO (rbharath): Find a more elegant solution to saving the data?
  shutil.rmtree(dataset.data_dir)
  os.makedirs(dataset.data_dir)
  DiskDataset.from_numpy(dataset.data_dir, X_t, y_t, w_t, ids_t)
def test_singletask_stratified_k_fold_split(self):
  """Test RandomStratifiedSplitter k-fold class."""
  n_samples, n_positives = 100, 20
  n_features, n_tasks = 10, 1
  X = np.random.rand(n_samples, n_features)
  y = np.zeros(n_samples)
  y[:n_positives] = 1
  w = np.ones(n_samples)
  ids = np.arange(n_samples)
  data_dir = tempfile.mkdtemp()
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)

  stratified_splitter = RandomStratifiedSplitter()
  ids_set = set(dataset.ids)
  K = 5
  fold_dirs = [tempfile.mkdtemp() for _ in range(K)]
  fold_datasets = stratified_splitter.k_fold_split(dataset, fold_dirs)

  for fold, fold_dataset in enumerate(fold_datasets):
    # Verify lengths is 100/k == 20
    # Note: This wouldn't work for multitask str
    # assert len(fold_dataset) == n_samples/K
    # Each fold carries n_positives/K = 4 positive examples.
    assert np.count_nonzero(fold_dataset.y == 1) == n_positives / K
    # Compounds in this fold are a subset of the original compounds...
    fold_ids_set = set(fold_dataset.ids)
    assert fold_ids_set.issubset(ids_set)
    # ...and no compound appears in two different folds.
    for other_fold, other_fold_dataset in enumerate(fold_datasets):
      if fold == other_fold:
        continue
      assert fold_ids_set.isdisjoint(set(other_fold_dataset.ids))

  merge_dir = tempfile.mkdtemp()
  merged_dataset = DiskDataset.merge(merge_dir, fold_datasets)
  assert len(merged_dataset) == len(dataset)
  assert sorted(merged_dataset.ids) == (sorted(dataset.ids))
def test_multitask_data(self):
  """Test that data associated with a tasks stays associated with it."""
  tasks = ["task0", "task1"]
  n_samples, n_features = 100, 3
  n_tasks = len(tasks)
  # Generate dummy dataset
  ids = np.array(["C"] * n_samples, dtype=object)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))
  dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids, tasks)
  # Round-tripping through disk must preserve every array.
  for original, stored in [(X, dataset.X), (y, dataset.y), (w, dataset.w)]:
    np.testing.assert_allclose(original, stored)
def test_select(self):
  """Test that dataset select works."""
  num_datapoints, num_features, num_tasks = 10, 10, 1
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.ones((num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = DiskDataset.from_numpy(self.data_dir, X, y, w, ids)

  select_dir = tempfile.mkdtemp()
  indices = [0, 4, 5, 8]
  select_dataset = dataset.select(select_dir, indices)
  # Selection must keep exactly the requested rows, in order.
  np.testing.assert_array_equal(X[indices], select_dataset.X)
  np.testing.assert_array_equal(y[indices], select_dataset.y)
  np.testing.assert_array_equal(w[indices], select_dataset.w)
  np.testing.assert_array_equal(ids[indices], select_dataset.ids)
  shutil.rmtree(select_dir)
def test_to_singletask(self):
  """Test that to_singletask works."""
  num_datapoints, num_features, num_tasks = 100, 10, 10
  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)
  dataset = DiskDataset.from_numpy(self.train_dir, X, y, w, ids)
  task_dirs = []
  try:
    for _ in range(num_tasks):
      task_dirs.append(tempfile.mkdtemp())
    singletask_datasets = SingletaskToMultitask._to_singletask(
        dataset, task_dirs)
    for task in range(num_tasks):
      st = singletask_datasets[task]
      # Rows with zero weight for this task must be dropped.
      keep = w[:, task] != 0
      np.testing.assert_array_equal(st.X, X[keep])
      np.testing.assert_array_equal(st.y.flatten(), y[:, task][keep])
      np.testing.assert_array_equal(st.w.flatten(), w[:, task][keep])
      np.testing.assert_array_equal(st.ids, ids[keep])
  finally:
    # Cleanup
    for task_dir in task_dirs:
      shutil.rmtree(task_dir)
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load sweet datasets. Does not do train/test split.

  Parameters
  ----------
  base_dir: str
    Directory to hold the featurized dataset and splits.
  reload: bool
    NOTE(review): overridden to True below, so the caller's value is ignored.
  frac_train: float
    Fraction of rows assigned to the training split.

  Returns
  -------
  (SWEET_tasks, (train_dataset, valid_dataset), transformers) tuple.
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()
  loader = DataLoader(tasks=SWEET_tasks, smiles_field="smiles",
                      featurizer=featurizer, verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  # Fix: slice indices must be integers; frac_train * len(dataset) is a
  # float and raised TypeError on the slicing below.
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]
  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, SWEET_tasks)
  return SWEET_tasks, (train_dataset, valid_dataset), transformers
def load_tox21(base_dir, reload=True, num_train=7200):
  """Load Tox21 datasets. Does not do train/test split"""
  # Set some global variables up top.
  # NOTE(review): reload is forced on here, overriding the caller's value.
  reload = True
  verbosity = "high"

  # The base_dir holds the results of all analysis.
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Directories for the featurized dataset and the two splits.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train")
  valid_dir = os.path.join(base_dir, "valid")

  # Load Tox21 dataset
  print("About to load Tox21 dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/tox21.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  featurizer = CircularFingerprint(size=1024)
  tox21_tasks = [
      'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
      'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53'
  ]
  if not reload or not os.path.exists(data_dir):
    loader = DataLoader(tasks=tox21_tasks, smiles_field="smiles",
                        featurizer=featurizer, verbosity=verbosity)
    dataset = loader.featurize(dataset_file, data_dir, shard_size=8192)
  else:
    dataset = DiskDataset(data_dir, tox21_tasks, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if not reload:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  # Deterministic head/tail split at num_train rows.
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train], y[num_train:]
  w_train, w_valid = w[:num_train], w[num_train:]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]
  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, tox21_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, tox21_tasks)
  return tox21_tasks, (train_dataset, valid_dataset), transformers
def load_pcba(base_dir, reload=True, frac_train=.8):
  """Load PCBA datasets. Does not do train/test split.

  Parameters
  ----------
  base_dir: str
    Directory to hold the featurized dataset and splits.
  reload: bool
    NOTE(review): overridden to True below, so the caller's value is ignored.
  frac_train: float
    Fraction of rows assigned to the training split.

  Returns
  -------
  (PCBA_tasks, dataset, transformers) tuple.
  NOTE(review): the full dataset is returned even though train/valid
  splits are built; kept as-is to avoid changing the caller contract.
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  regen = False

  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load PCBA dataset
  print("About to load PCBA dataset.")
  dataset_file = os.path.join(current_dir, "../../datasets/pcba.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize PCBA dataset
  print("About to featurize PCBA dataset.")
  featurizer = CircularFingerprint(size=1024)
  PCBA_tasks = [
      'PCBA-1030', 'PCBA-1379', 'PCBA-1452', 'PCBA-1454', 'PCBA-1457',
      'PCBA-1458', 'PCBA-1460', 'PCBA-1461', 'PCBA-1468', 'PCBA-1469',
      'PCBA-1471', 'PCBA-1479', 'PCBA-1631', 'PCBA-1634', 'PCBA-1688',
      'PCBA-1721', 'PCBA-2100', 'PCBA-2101', 'PCBA-2147', 'PCBA-2242',
      'PCBA-2326', 'PCBA-2451', 'PCBA-2517', 'PCBA-2528', 'PCBA-2546',
      'PCBA-2549', 'PCBA-2551', 'PCBA-2662', 'PCBA-2675', 'PCBA-2676',
      'PCBA-411', 'PCBA-463254', 'PCBA-485281', 'PCBA-485290', 'PCBA-485294',
      'PCBA-485297', 'PCBA-485313', 'PCBA-485314', 'PCBA-485341',
      'PCBA-485349', 'PCBA-485353', 'PCBA-485360', 'PCBA-485364',
      'PCBA-485367', 'PCBA-492947', 'PCBA-493208', 'PCBA-504327',
      'PCBA-504332', 'PCBA-504333', 'PCBA-504339', 'PCBA-504444',
      'PCBA-504466', 'PCBA-504467', 'PCBA-504706', 'PCBA-504842',
      'PCBA-504845', 'PCBA-504847', 'PCBA-504891', 'PCBA-540276',
      'PCBA-540317', 'PCBA-588342', 'PCBA-588453', 'PCBA-588456',
      'PCBA-588579', 'PCBA-588590', 'PCBA-588591', 'PCBA-588795',
      'PCBA-588855', 'PCBA-602179', 'PCBA-602233', 'PCBA-602310',
      'PCBA-602313', 'PCBA-602332', 'PCBA-624170', 'PCBA-624171',
      'PCBA-624173', 'PCBA-624202', 'PCBA-624246', 'PCBA-624287',
      'PCBA-624288', 'PCBA-624291', 'PCBA-624296', 'PCBA-624297',
      'PCBA-624417', 'PCBA-651635', 'PCBA-651644', 'PCBA-651768',
      'PCBA-651965', 'PCBA-652025', 'PCBA-652104', 'PCBA-652105',
      'PCBA-652106', 'PCBA-686970', 'PCBA-686978', 'PCBA-686979',
      'PCBA-720504', 'PCBA-720532', 'PCBA-720542', 'PCBA-720551',
      'PCBA-720553', 'PCBA-720579', 'PCBA-720580', 'PCBA-720707',
      'PCBA-720708', 'PCBA-720709', 'PCBA-720711', 'PCBA-743255',
      'PCBA-743266', 'PCBA-875', 'PCBA-881', 'PCBA-883', 'PCBA-884',
      'PCBA-885', 'PCBA-887', 'PCBA-891', 'PCBA-899', 'PCBA-902',
      'PCBA-903', 'PCBA-904', 'PCBA-912', 'PCBA-914', 'PCBA-915',
      'PCBA-924', 'PCBA-925', 'PCBA-926', 'PCBA-927', 'PCBA-938',
      'PCBA-995']
  loader = DataLoader(tasks=PCBA_tasks, smiles_field="smiles",
                      featurizer=featurizer, verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      transformer.transform(dataset)

  print("About to perform train/valid/test split.")
  # Fix: slice indices must be integers; frac_train * len(dataset) is a
  # float and raised TypeError on the slicing below.
  num_train = int(frac_train * len(dataset))
  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 120
  PCBA_tasks = PCBA_tasks[:num_tasks]
  print("Using following tasks")
  print(PCBA_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]
  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, PCBA_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, PCBA_tasks)
  return PCBA_tasks, dataset, transformers
# REPLACE WITH DOWNLOADED PDBBIND EXAMPLE
pdbbind_dir = "/tmp/deep-docking/datasets/pdbbind"
pdbbind_tasks, dataset, transformers = load_core_pdbbind_grid(
    pdbbind_dir, base_dir)

print("About to perform train/valid/test split.")
# Fix: slice indices must be integers; .8 * len(dataset) is a float and
# raised TypeError on the slicing below.
num_train = int(.8 * len(dataset))
X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
X_train, X_valid = X[:num_train], X[num_train:]
y_train, y_valid = y[:num_train], y[num_train:]
w_train, w_valid = w[:num_train], w[num_train:]
ids_train, ids_valid = ids[:num_train], ids[num_train:]
train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                       w_train, ids_train, pdbbind_tasks)
valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                       w_valid, ids_valid, pdbbind_tasks)

# Pearson R^2 in regression mode scores the fitted model.
classification_metric = Metric(metrics.pearson_r2_score, verbosity=verbosity,
                               mode="regression")
n_features = dataset.get_data_shape()[0]
tensorflow_model = TensorflowMultiTaskRegressor(
    len(pdbbind_tasks), n_features, model_dir, dropouts=[.25],
    learning_rate=0.0003, weight_init_stddevs=[.1], batch_size=64,
    verbosity=verbosity)
model = TensorflowModel(tensorflow_model, model_dir)

# Fit trained model
model.fit(train_dataset, nb_epoch=20)
def load_core_pdbbind_grid(pdbbind_dir, base_dir, reload=True):
  """Load PDBBind datasets. Does not do train/test split.

  Parameters
  ----------
  pdbbind_dir: str
    Directory containing the PDBBind index file and complex subdirectories.
  base_dir: str
    Directory to hold the featurized dataset.
  reload: bool
    NOTE(review): overridden to True below, so the caller's value is ignored.

  Returns
  -------
  (tasks, dataset, transformers) tuple.
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")

  # Load PDBBind dataset
  labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  pdb_subdirs = os.path.join(pdbbind_dir, "website-core-set")
  tasks = ["-logKd/Ki"]
  print("About to load contents.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  grid_featurizer = GridFeaturizer(
      voxel_width=16.0,
      feature_types="voxel_combined",
      # TODO(rbharath, enf): Figure out why pi_stack is slow and cation_pi
      # causes segfaults.
      #voxel_feature_types=["ecfp", "splif", "hbond", "pi_stack", "cation_pi",
      #"salt_bridge"],
      voxel_feature_types=["ecfp", "splif", "hbond", "salt_bridge"],
      # Fix: ecfp_power/splif_power were each passed twice, which is a
      # SyntaxError (duplicate keyword argument). Pass each once.
      ecfp_power=9,
      splif_power=9,
      parallel=True,
      flatten=True,
      verbosity=verbosity)
  compound_featurizers = [CircularFingerprint(size=1024)]
  complex_featurizers = [grid_featurizer]

  # Featurize Dataset
  features = []
  feature_len = None
  y_inds = []
  for ind, pdb_code in enumerate(ids):
    print("Processing %s" % str(pdb_code))
    pdb_subdir = os.path.join(pdb_subdirs, pdb_code)
    computed_feature = compute_pdbbind_grid_feature(
        compound_featurizers, complex_featurizers, pdb_subdir, pdb_code)
    if feature_len is None:
      feature_len = len(computed_feature)
    if len(computed_feature) != feature_len:
      print("Featurization failed for %s!" % pdb_code)
      continue
    y_inds.append(ind)
    features.append(computed_feature)
  y = y[y_inds]
  # Fix: keep ids aligned with the successfully featurized rows; previously
  # the full id list was stored, misaligning ids with X/y/w whenever any
  # complex failed featurization.
  ids = ids[y_inds]
  X = np.vstack(features)
  w = np.ones_like(y)
  dataset = DiskDataset.from_numpy(data_dir, X, y, w, ids)
  transformers = []
  return tasks, dataset, transformers
def load_sweet(base_dir, reload=True, frac_train=.8):
  """Load sweet datasets. Does not do train/test split.

  Parameters
  ----------
  base_dir: str
    Directory to hold the featurized dataset and splits.
  reload: bool
    NOTE(review): overridden to True below, so the caller's value is ignored.
  frac_train: float
    Fraction of rows assigned to the training split.

  Returns
  -------
  (SWEET_tasks, (train_dataset, valid_dataset), transformers) tuple.
  """
  # Set some global variables up top
  reload = True
  verbosity = "high"
  model = "logistic"
  regen = False

  # The base_dir holds the results of all analysis
  if not reload:
    if os.path.exists(base_dir):
      shutil.rmtree(base_dir)
  if not os.path.exists(base_dir):
    os.makedirs(base_dir)
  current_dir = os.path.dirname(os.path.realpath(__file__))
  # Make directories to store the raw and featurized datasets.
  data_dir = os.path.join(base_dir, "dataset")
  train_dir = os.path.join(base_dir, "train_dataset")
  valid_dir = os.path.join(base_dir, "valid_dataset")

  # Load SWEET dataset
  print("About to load SWEET dataset.")
  dataset_file = os.path.join(current_dir, "./sweet.csv.gz")
  dataset = load_from_disk(dataset_file)
  print("Columns of dataset: %s" % str(dataset.columns.values))
  print("Number of examples in dataset: %s" % str(dataset.shape[0]))

  # Featurize SWEET dataset
  print("About to featurize SWEET dataset.")
  featurizer = CircularFingerprint(size=1024)
  SWEET_tasks = dataset.columns.values[1:].tolist()
  loader = DataLoader(tasks=SWEET_tasks, smiles_field="smiles",
                      featurizer=featurizer, verbosity=verbosity)
  if not reload or not os.path.exists(data_dir):
    dataset = loader.featurize(dataset_file, data_dir)
    regen = True
  else:
    dataset = DiskDataset(data_dir, reload=True)

  # Initialize transformers
  transformers = [BalancingTransformer(transform_w=True, dataset=dataset)]
  if regen:
    print("About to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

  X, y, w, ids = (dataset.X, dataset.y, dataset.w, dataset.ids)
  num_tasks = 17
  # Fix: slice indices must be integers; frac_train * len(dataset) is a
  # float and raised TypeError on the slicing below.
  num_train = int(frac_train * len(dataset))
  SWEET_tasks = SWEET_tasks[:num_tasks]
  print("Using following tasks")
  print(SWEET_tasks)
  X_train, X_valid = X[:num_train], X[num_train:]
  y_train, y_valid = y[:num_train, :num_tasks], y[num_train:, :num_tasks]
  w_train, w_valid = w[:num_train, :num_tasks], w[num_train:, :num_tasks]
  ids_train, ids_valid = ids[:num_train], ids[num_train:]
  train_dataset = DiskDataset.from_numpy(train_dir, X_train, y_train,
                                         w_train, ids_train, SWEET_tasks)
  valid_dataset = DiskDataset.from_numpy(valid_dir, X_valid, y_valid,
                                         w_valid, ids_valid, SWEET_tasks)
  return SWEET_tasks, (train_dataset, valid_dataset), transformers