def scaffold_test_train_valid_test_split(self):
  """Test of scaffold train/valid/test split."""
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  # Splits featurized samples into train/valid/test
  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)
  assert len(train_dataset) == 8
  assert len(valid_dataset) == 1
  assert len(test_dataset) == 1
def test_multitask_tf_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of TensorFlow multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12",
           "task13", "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}

  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"activation": ["relu"],
                 "momentum": [.9],
                 "batch_size": [50],
                 "init": ["glorot_uniform"],
                 "data_shape": [train_dataset.get_data_shape()],
                 "learning_rate": [1e-3],
                 "decay": [1e-6],
                 "nb_hidden": [1000],
                 "nb_epoch": [1],
                 "nesterov": [False],
                 "dropouts": [(.5,)],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "layer_sizes": [(1000,)],
                 "weight_init_stddevs": [(.1,)],
                 "bias_init_consts": [(1.,)],
                 "num_classes": [2],
                 "penalty": [0.],
                 "optimizer": ["sgd"],
                 "num_classification_tasks": [len(task_types)]}

  def model_builder(tasks, task_types, params_dict, logdir, verbosity=None):
    return TensorflowModel(tasks, task_types, params_dict, logdir,
                           tf_class=TensorflowMultiTaskClassifier,
                           verbosity=verbosity)

  optimizer = HyperparamOpt(model_builder, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
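# For reference: hyperparam_search conceptually enumerates the Cartesian
# product of the value lists in params_dict, builds one model per
# combination, scores each on the validation set, and keeps the best. A
# minimal, self-contained sketch of that loop (illustrative only, not
# DeepChem's implementation; `build_and_score` is a hypothetical stand-in
# for model construction plus validation-set evaluation):
import itertools

def grid_search_sketch(params_dict, build_and_score):
  """Return (best_params, best_score) over the grid in params_dict."""
  best_params, best_score = None, None
  keys = sorted(params_dict.keys())
  for values in itertools.product(*(params_dict[key] for key in keys)):
    params = dict(zip(keys, values))
    score = build_and_score(params)  # higher is assumed better
    if best_score is None or score > best_score:
      best_params, best_score = params, score
  return best_params, best_score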
def test_singletask_scaffold_split(self):
  """Test singletask ScaffoldSplitter class."""
  solubility_dataset = self.load_solubility_data()
  scaffold_splitter = ScaffoldSplitter()
  train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
          solubility_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1
def test_multitask_scaffold_split(self):
  """Test multitask ScaffoldSplitter class."""
  multitask_dataset = self.load_multitask_data()
  scaffold_splitter = ScaffoldSplitter()
  train_data, valid_data, test_data = \
      scaffold_splitter.train_valid_test_split(
          multitask_dataset,
          self.train_dir, self.valid_dir, self.test_dir,
          frac_train=0.8, frac_valid=0.1, frac_test=0.1)
  assert len(train_data) == 8
  assert len(valid_data) == 1
  assert len(test_data) == 1
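# For reference: ScaffoldSplitter partitions compounds by Bemis-Murcko
# scaffold, so structurally related molecules fall into the same split. A
# minimal sketch of the grouping step, assuming RDKit is installed
# (illustrative only, not DeepChem's exact code):
from collections import defaultdict

from rdkit.Chem.Scaffolds import MurckoScaffold

def group_by_scaffold(smiles_list):
  """Map each Murcko scaffold SMILES to the indices of its compounds."""
  scaffolds = defaultdict(list)
  for idx, smiles in enumerate(smiles_list):
    scaffold = MurckoScaffold.MurckoScaffoldSmiles(
        smiles=smiles, includeChirality=False)
    scaffolds[scaffold].append(idx)
  return scaffolds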
def _run_muv_experiment(self, dataset_file, reload=False, verbosity=None):
  """Loads or reloads a small version of the MUV dataset."""
  # Load MUV dataset
  raw_dataset = load_from_disk(dataset_file)
  print("Number of examples in dataset: %s" % str(raw_dataset.shape[0]))

  print("About to featurize compounds")
  featurizer = CircularFingerprint(size=1024)
  MUV_tasks = ['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
               'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
               'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
               'MUV-466', 'MUV-832']
  loader = DataLoader(tasks=MUV_tasks,
                      smiles_field="smiles",
                      featurizer=featurizer,
                      verbosity=verbosity)
  dataset = loader.featurize(dataset_file, self.data_dir)
  assert len(dataset) == len(raw_dataset)

  print("About to split compounds into train/valid/test")
  splitter = ScaffoldSplitter(verbosity=verbosity)
  frac_train, frac_valid, frac_test = .8, .1, .1
  train_dataset, valid_dataset, test_dataset = \
      splitter.train_valid_test_split(
          dataset, self.train_dir, self.valid_dir, self.test_dir,
          log_every_n=1000, frac_train=frac_train,
          frac_test=frac_test, frac_valid=frac_valid)
  # Do an approximate comparison since splits are sometimes slightly off from
  # the exact fraction.
  assert relative_difference(
      len(train_dataset), frac_train * len(dataset)) < 1e-3
  assert relative_difference(
      len(valid_dataset), frac_valid * len(dataset)) < 1e-3
  assert relative_difference(
      len(test_dataset), frac_test * len(dataset)) < 1e-3

  # TODO(rbharath): Transformers don't play nicely with reload! Namely,
  # reloading will cause the transform to be reapplied. This is undesirable
  # in almost all cases. Need to find a method to fix this.
  transformers = [
      BalancingTransformer(transform_w=True, dataset=train_dataset)]
  print("Transforming datasets")
  for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  return (len(train_dataset), len(valid_dataset), len(test_dataset))
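# For reference: BalancingTransformer rescales the example weights w so that
# positive and negative examples carry equal total weight per task, which
# matters for highly imbalanced assays like MUV. A rough single-task sketch
# of the idea, assuming binary labels y in {0, 1} (illustrative only, not
# DeepChem's exact code):
import numpy as np

def balance_weights(y, w):
  """Rescale w so positives and negatives have equal total weight."""
  y = np.asarray(y)
  w = np.asarray(w, dtype=float).copy()
  pos = (y == 1) & (w != 0)
  neg = (y == 0) & (w != 0)
  n_pos, n_neg = pos.sum(), neg.sum()
  if n_pos and n_neg:
    total = n_pos + n_neg
    w[pos] *= total / (2.0 * n_pos)  # upweight if positives are rare
    w[neg] *= total / (2.0 * n_neg)
  return w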
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12",
           "task13", "task14", "task15", "task16"]

  n_features = 1024
  featurizer = CircularFingerprint(size=n_features)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"n_hidden": [5, 10]}

  def model_builder(model_params, model_dir):
    keras_model = MultiTaskDNN(len(tasks), n_features, task_type,
                               dropout=0., **model_params)
    return KerasModel(keras_model, model_dir)

  optimizer = HyperparamOpt(model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_multitask_keras_mlp_ECFP_classification_hyperparam_opt(self):
  """Straightforward test of Keras multitask deepchem classification API."""
  # Variant of the test above exercising the older task-based HyperparamOpt
  # signature.
  task_type = "classification"
  input_file = os.path.join(self.current_dir, "multitask_example.csv")
  tasks = ["task0", "task1", "task2", "task3", "task4", "task5", "task6",
           "task7", "task8", "task9", "task10", "task11", "task12",
           "task13", "task14", "task15", "task16"]
  task_types = {task: task_type for task in tasks}

  featurizer = CircularFingerprint(size=1024)
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = []
  metric = Metric(metrics.matthews_corrcoef, np.mean, mode="classification")
  params_dict = {"nb_hidden": [5, 10],
                 "activation": ["relu"],
                 "dropout": [.5],
                 "learning_rate": [.01],
                 "momentum": [.9],
                 "nesterov": [False],
                 "decay": [1e-4],
                 "batch_size": [5],
                 "nb_epoch": [2],
                 "init": ["glorot_uniform"],
                 "nb_layers": [1],
                 "batchnorm": [False],
                 "data_shape": [train_dataset.get_data_shape()]}

  optimizer = HyperparamOpt(MultiTaskDNN, tasks, task_types, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
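# For reference: the Metric above wraps sklearn's matthews_corrcoef. From a
# binary confusion matrix (tp, tn, fp, fn), the Matthews correlation
# coefficient is
#   mcc = (tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)),
# as in this small sketch:
import math

def mcc_sketch(tp, tn, fp, fn):
  """Matthews correlation coefficient from confusion-matrix counts."""
  denom = math.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
  return (tp * tn - fp * fn) / denom if denom else 0.0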
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  params_dict = {"n_estimators": [10, 100]}
  metric = Metric(metrics.r2_score)

  def rf_model_builder(model_params, model_dir):
    sklearn_model = RandomForestRegressor(**model_params)
    return SklearnModel(sklearn_model, model_dir)

  optimizer = HyperparamOpt(rf_model_builder, verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, transformers, metric,
      logdir=None)
def test_singletask_sklearn_rf_ECFP_regression_hyperparam_opt(self):
  """Test of hyperparam_opt with singletask RF ECFP regression API."""
  # Variant of the test above exercising the older task-based HyperparamOpt
  # signature; `rf_model_builder` is assumed to be defined at module scope.
  featurizer = CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  task_type = "regression"
  task_types = {task: task_type for task in tasks}
  input_file = os.path.join(self.current_dir, "example.csv")
  loader = DataLoader(tasks=tasks,
                      smiles_field=self.smiles_field,
                      featurizer=featurizer,
                      verbosity="low")
  dataset = loader.featurize(input_file, self.data_dir)

  splitter = ScaffoldSplitter()
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset, self.train_dir, self.valid_dir, self.test_dir)

  input_transformers = []
  output_transformers = [
      NormalizationTransformer(transform_y=True, dataset=train_dataset)]
  transformers = input_transformers + output_transformers
  for dataset in [train_dataset, test_dataset]:
    for transformer in transformers:
      transformer.transform(dataset)

  params_dict = {"n_estimators": [10, 100],
                 "max_features": ["auto"],
                 "data_shape": train_dataset.get_data_shape()}
  metric = Metric(metrics.r2_score)

  optimizer = HyperparamOpt(rf_model_builder, tasks, task_types,
                            verbosity="low")
  best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
      params_dict, train_dataset, valid_dataset, output_transformers,
      metric, logdir=None)
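# For reference: the r2_score metric used above is the coefficient of
# determination, R^2 = 1 - SS_res / SS_tot. A minimal NumPy sketch:
import numpy as np

def r2_score_sketch(y_true, y_pred):
  """Coefficient of determination for 1-D arrays of targets/predictions."""
  y_true = np.asarray(y_true, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  ss_res = np.sum((y_true - y_pred) ** 2)
  ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
  return 1.0 - ss_res / ss_tot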