def train_valid_test_split(self,
                           dataset,
                           train_dir=None,
                           valid_dir=None,
                           test_dir=None,
                           frac_train=.8,
                           frac_valid=.1,
                           frac_test=.1,
                           seed=None,
                           log_every_n=1000,
                           verbose=True):
  """
  Splits the given dataset into train/validation/test sets.

  Returns Dataset objects.
  """
  log("Computing train/valid/test indices", self.verbose)
  train_inds, valid_inds, test_inds = self.split(dataset,
                                                 frac_train=frac_train,
                                                 frac_test=frac_test,
                                                 frac_valid=frac_valid,
                                                 log_every_n=log_every_n)
  if train_dir is None:
    train_dir = tempfile.mkdtemp()
  if valid_dir is None:
    valid_dir = tempfile.mkdtemp()
  if test_dir is None:
    test_dir = tempfile.mkdtemp()
  train_dataset = dataset.select(train_inds, train_dir)
  if frac_valid != 0:
    valid_dataset = dataset.select(valid_inds, valid_dir)
  else:
    valid_dataset = None
  test_dataset = dataset.select(test_inds, test_dir)

  return train_dataset, valid_dataset, test_dataset
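
# A minimal usage sketch for train_valid_test_split. `splitter` and `dataset`
# are illustrative names (any splitter instance and featurized Dataset):
#
#   splitter = ScaffoldSplitter()
#   train, valid, test = splitter.train_valid_test_split(
#       dataset, frac_train=.8, frac_valid=.1, frac_test=.1)
#   # With frac_valid=0, valid is returned as None.
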
def featurize(self,
              protein_file,
              pockets,
              pocket_atoms_map,
              pocket_coords,
              verbose=False):
  """
  Compute a residue-type count vector for each binding pocket.
  """
  import mdtraj
  protein = mdtraj.load(protein_file)
  n_pockets = len(pockets)
  n_residues = len(BindingPocketFeaturizer.residues)
  res_map = dict(zip(BindingPocketFeaturizer.residues, range(n_residues)))
  all_features = torch.zeros((n_pockets, n_residues))
  for pocket_num, (pocket, coords) in enumerate(zip(pockets, pocket_coords)):
    pocket_atoms = pocket_atoms_map[pocket]
    for ind, atom in enumerate(pocket_atoms):
      atom_name = str(protein.top.atom(atom))
      # atom_name is of format RESX-ATOMTYPE,
      # where X is a 1 to 4 digit number
      residue = atom_name[:3]
      if residue not in res_map:
        log("Warning: Non-standard residue in PDB file", verbose)
        continue
      # atomtype is parsed but not currently used in the residue-count
      # features.
      atomtype = atom_name.split("-")[1]
      all_features[pocket_num, res_map[residue]] += 1
  return all_features
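
# The returned tensor has shape (n_pockets, n_residues): each row counts
# pocket atoms per standard residue type. A hedged usage sketch; the
# pocket-finding step that produces `pockets`, `pocket_atoms_map`, and
# `pocket_coords` is assumed to happen elsewhere:
#
#   featurizer = BindingPocketFeaturizer()
#   features = featurizer.featurize("protein.pdb", pockets,
#                                   pocket_atoms_map, pocket_coords)
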
def compute_model_performance(self,
                              metrics,
                              csv_out=None,
                              stats_out=None,
                              per_task_metrics=False):
  """
  Computes statistics of model on test data and saves results to csv.

  Parameters
  ----------
  metrics: list
    List of Pytorch.Chemistry.metrics.Metric objects.
  csv_out: str, optional
    Filename to write CSV of model predictions.
  stats_out: str, optional
    Filename to write computed statistics.
  per_task_metrics: bool, optional
    If true, return computed metric for each task on multitask dataset.
  """
  y = self.dataset.y
  y = undo_transforms(y, self.output_transformers)
  w = self.dataset.w

  if not len(metrics):
    return {}
  else:
    mode = metrics[0].mode
  y_pred = self.model.predict(self.dataset, self.output_transformers)
  if mode == "classification":
    y_pred_print = np.argmax(y_pred, -1)
  else:
    y_pred_print = y_pred
  multitask_scores = {}
  all_task_scores = {}

  if csv_out is not None:
    log("Saving predictions to %s" % csv_out, self.verbose)
    self.output_predictions(y_pred_print, csv_out)

  # Compute multitask metrics
  for metric in metrics:
    if per_task_metrics:
      multitask_scores[metric.name], computed_metrics = metric.compute_metric(
          y, y_pred, w, per_task_metrics=True)
      all_task_scores[metric.name] = computed_metrics
    else:
      multitask_scores[metric.name] = metric.compute_metric(
          y, y_pred, w, per_task_metrics=False)

  if stats_out is not None:
    log("Saving stats to %s" % stats_out, self.verbose)
    self.output_statistics(multitask_scores, stats_out)

  if not per_task_metrics:
    return multitask_scores
  else:
    return multitask_scores, all_task_scores
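
# Sketch of how compute_model_performance might be invoked. The Evaluator
# construction mirrors the one in hyperparam_search below; the Metric
# construction (here wrapping a ROC-AUC function) is an assumption:
#
#   metric = Metric(roc_auc_score)
#   evaluator = Evaluator(model, valid_dataset, output_transformers)
#   scores = evaluator.compute_model_performance([metric])
#   # With per_task_metrics=True, a (multitask_scores, all_task_scores)
#   # tuple is returned instead.
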
def fit(self, dataset, nb_epoch=10, batch_size=50, **kwargs):
  """
  Fits a model on data in a Dataset object.
  """
  # TODO(rbharath/enf): We need a structured way to deal with potential GPU
  # memory overflows.
  for epoch in range(nb_epoch):
    log("Starting epoch %s" % str(epoch + 1), self.verbose)
    losses = []
    for (X_batch, y_batch, w_batch, ids_batch) in dataset.iterbatches(
        batch_size):
      losses.append(self.fit_on_batch(X_batch, y_batch, w_batch))
    log("Avg loss for epoch %d: %f" % (epoch + 1, np.array(losses).mean()),
        self.verbose)
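
# Usage sketch; assumes `model` is an instance of the class defining fit
# above and `train_dataset` supports iterbatches:
#
#   model.fit(train_dataset, nb_epoch=20, batch_size=128)
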
def __init__(self, tasks, model_builder, model_dir=None, verbose=True):
  # Do not pass `self` explicitly to super().__init__; Python binds it
  # automatically, and passing it again shifts the remaining arguments.
  super(SingletaskToMultitask, self).__init__(model_dir=model_dir,
                                              verbose=verbose)
  self.tasks = tasks
  self.task_model_dirs = {}
  self.model_builder = model_builder
  log("About to initialize singletask to multitask model", self.verbose)
  for task in self.tasks:
    task_model_dir = os.path.join(self.model_dir, str(task))
    if not os.path.exists(task_model_dir):
      os.makedirs(task_model_dir)
    log("Initializing directory for task %s" % task, self.verbose)
    self.task_model_dirs[task] = task_model_dir
def fit(self, dataset, **kwargs):
  """
  Updates all singletask models with new information.

  Warning: This current implementation is only functional for sklearn models.
  """
  if not isinstance(dataset, DiskDataset):
    raise ValueError('SingletaskToMultitask only works with DiskDatasets')
  log("About to create task-specific datasets", self.verbose)
  task_datasets = self._create_task_datasets(dataset)
  for ind, task in enumerate(self.tasks):
    log("Fitting model for task %s" % task, self.verbose)
    task_model = self.model_builder(self.task_model_dirs[task])
    task_model.fit(task_datasets[ind], **kwargs)
    task_model.save()
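
# Hypothetical usage of SingletaskToMultitask. `SklearnModel` and
# `RandomForestRegressor` stand in for any sklearn-style per-task model
# builder (illustrative names, per the warning above):
#
#   def model_builder(model_dir):
#     return SklearnModel(RandomForestRegressor(), model_dir)
#
#   model = SingletaskToMultitask(tasks, model_builder, model_dir)
#   model.fit(disk_dataset)
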
def _create_task_datasets(self, dataset):
  """Make directories to hold data for tasks."""
  task_data_dirs = []
  for task in self.tasks:
    task_data_dir = os.path.join(self.model_dir, str(task) + "_data")
    if os.path.exists(task_data_dir):
      shutil.rmtree(task_data_dir)
    os.makedirs(task_data_dir)
    task_data_dirs.append(task_data_dir)
  task_datasets = self._to_singletask(dataset, task_data_dirs)
  for task, task_dataset in zip(self.tasks, task_datasets):
    log("Dataset for task %s has shape %s" %
        (task, str(task_dataset.get_shape())), self.verbose)
  return task_datasets
def k_fold_split(self,
                 dataset,
                 k,
                 directories=None,
                 seed=None,
                 log_every_n=None,
                 **kwargs):
  """
  Splits compounds into k folds using stratified sampling.

  Overrides base class k_fold_split.

  Parameters
  ----------
  dataset: dc.data.Dataset object
    Dataset.
  k: int
    Number of folds.
  seed: int (Optional, Default None)
    Random seed.
  log_every_n: int (Optional, Default None)
    Log every n examples (not currently used).

  Returns
  -------
  fold_datasets: List
    List containing dc.data.Dataset objects.
  """
  log("Computing K-fold split", self.verbose)
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(k)]
  else:
    assert len(directories) == k

  y_s = dataset.y[:, self.task_number]
  sortidx = np.argsort(y_s)
  sortidx_list = np.array_split(sortidx, k)

  fold_datasets = []
  for fold in range(k):
    fold_dir = directories[fold]
    fold_ind = sortidx_list[fold]
    fold_dataset = dataset.select(fold_ind, fold_dir)
    fold_datasets.append(fold_dataset)
  return fold_datasets
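
# Because the folds are np.array_split chunks of the argsort over y, each
# fold covers a contiguous range of target values. A quick worked check of
# the fold bookkeeping on toy data (standalone, numpy only):
#
#   import numpy as np
#   y_s = np.array([3., 1., 2., 5., 4., 0.])
#   sortidx = np.argsort(y_s)            # [5, 1, 2, 0, 4, 3]
#   folds = np.array_split(sortidx, 3)   # [5, 1], [2, 0], [4, 3]
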
def k_fold_split(self, dataset, k, directories=None, **kwargs):
  """Needs custom implementation due to ragged splits for stratification."""
  log("Computing K-fold split", self.verbose)
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(k)]
  else:
    assert len(directories) == k
  fold_datasets = []
  # rem_dataset is the remaining portion of dataset
  rem_dataset = dataset
  for fold in range(k):
    # Note frac_fold starts as 1/k since fold starts at 0. Ends at 1 since
    # fold goes up to k-1.
    frac_fold = 1. / (k - fold)
    fold_dir = directories[fold]
    rem_dir = tempfile.mkdtemp()
    fold_dataset, rem_dataset = self.split(rem_dataset, frac_fold,
                                           [fold_dir, rem_dir])
    fold_datasets.append(fold_dataset)
  return fold_datasets
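
# Worked check of the shrinking fraction: with k=4, frac_fold takes the
# values 1/4, 1/3, 1/2, 1 across folds 0..3, so each fold peels off an equal
# 1/4 share of the original N datapoints:
#
#   1/4 of N; then 1/3 of the remaining 3N/4 = N/4; then 1/2 of N/2 = N/4;
#   then all of the final N/4.
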
@staticmethod
def _to_singletask(dataset, task_dirs):
  """Transforms a multitask dataset to a collection of singletask datasets."""
  tasks = dataset.get_task_names()
  assert len(tasks) == len(task_dirs)
  log("Splitting multitask dataset into singletask datasets", dataset.verbose)
  task_datasets = [
      DiskDataset.create_dataset([], task_dirs[task_num], [task])
      for (task_num, task) in enumerate(tasks)
  ]
  for shard_num, (X, y, w, ids) in enumerate(dataset.itershards()):
    log("Processing shard %d" % shard_num, dataset.verbose)
    basename = "dataset-%d" % shard_num
    for task_num, task in enumerate(tasks):
      log("\tTask %s" % task, dataset.verbose)
      w_task = w[:, task_num]
      y_task = y[:, task_num]
      # Extract those datapoints which are present for this task
      X_nonzero = X[w_task != 0]
      num_datapoints = X_nonzero.shape[0]
      y_nonzero = np.reshape(y_task[w_task != 0], (num_datapoints, 1))
      w_nonzero = np.reshape(w_task[w_task != 0], (num_datapoints, 1))
      ids_nonzero = ids[w_task != 0]
      task_datasets[task_num].add_shard(X_nonzero, y_nonzero, w_nonzero,
                                        ids_nonzero)
  return task_datasets
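
# The weight-mask logic above keeps only datapoints with nonzero weight for a
# given task. A toy demonstration of the masking (standalone, numpy only):
#
#   import numpy as np
#   w_task = np.array([1., 0., 1.])
#   y_task = np.array([0.2, 0.5, 0.9])
#   y_nonzero = np.reshape(y_task[w_task != 0], (-1, 1))  # [[0.2], [0.9]]
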
def featurize_complexes(self,
                        mol_pdbs,
                        protein_pdbs,
                        verbose=True,
                        log_every_n=1000):
  """
  Calculate features for mol/protein complexes.

  Parameters
  ----------
  mol_pdbs: list
    List of PDBs for molecules. Each PDB should be a list of lines of the
    PDB file.
  protein_pdbs: list
    List of PDBs for proteins. Each PDB should be a list of lines of the
    PDB file.
  """
  features = []
  for i, (mol_pdb, protein_pdb) in enumerate(zip(mol_pdbs, protein_pdbs)):
    if verbose and i % log_every_n == 0:
      log("Featurizing %d / %d" % (i, len(mol_pdbs)))
    features.append(self._featurize_complex(mol_pdb, protein_pdb))
  features = np.asarray(features)
  return features
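
# Usage sketch; per the docstring, each entry is the list of lines of a PDB
# file. File names and the `featurizer` instance are illustrative:
#
#   with open("ligand.pdb") as f:
#     mol_pdb = f.readlines()
#   with open("protein.pdb") as f:
#     protein_pdb = f.readlines()
#   features = featurizer.featurize_complexes([mol_pdb], [protein_pdb])
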
def split(self,
          dataset,
          frac_train=.8,
          frac_valid=.1,
          frac_test=.1,
          log_every_n=1000):
  """
  Splits internal compounds into train/validation/test by scaffold.
  """
  np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
  scaffolds = {}
  log("About to generate scaffolds", self.verbose)
  data_len = len(dataset)
  for ind, smiles in enumerate(dataset.ids):
    if ind % log_every_n == 0:
      log("Generating scaffold %d/%d" % (ind, data_len), self.verbose)
    scaffold = generate_scaffold(smiles)
    if scaffold not in scaffolds:
      scaffolds[scaffold] = [ind]
    else:
      scaffolds[scaffold].append(ind)
  # Sort from largest to smallest scaffold sets
  scaffolds = {key: sorted(value) for key, value in scaffolds.items()}
  scaffold_sets = [
      scaffold_set for (scaffold, scaffold_set) in sorted(
          scaffolds.items(), key=lambda x: (len(x[1]), x[1][0]), reverse=True)
  ]
  train_cutoff = frac_train * len(dataset)
  valid_cutoff = (frac_train + frac_valid) * len(dataset)
  train_inds, valid_inds, test_inds = [], [], []
  log("About to sort in scaffold sets", self.verbose)
  for scaffold_set in scaffold_sets:
    if len(train_inds) + len(scaffold_set) > train_cutoff:
      if len(train_inds) + len(valid_inds) + len(scaffold_set) > valid_cutoff:
        test_inds += scaffold_set
      else:
        valid_inds += scaffold_set
    else:
      train_inds += scaffold_set
  return train_inds, valid_inds, test_inds
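
# Because whole scaffold sets are assigned to one partition, this split
# measures generalization to structurally unseen molecules. A usage sketch;
# the splitter class name is an assumption:
#
#   splitter = ScaffoldSplitter()
#   train_inds, valid_inds, test_inds = splitter.split(dataset)
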
def k_fold_split(self, dataset, k, directories=None, **kwargs):
  """
  Parameters
  ----------
  dataset: Dataset
    Dataset to do a k-fold split on.
  k: int
    Number of folds.
  directories: list of str
    List of length 2*k filepaths to save the resulting disk-datasets.

  Returns
  -------
  list of length k tuples of (train, cv)
  """
  log("Computing K-fold split", self.verbose)
  if directories is None:
    directories = [tempfile.mkdtemp() for _ in range(2 * k)]
  else:
    assert len(directories) == 2 * k
  cv_datasets = []
  train_ds_base = None
  train_datasets = []
  # rem_dataset is the remaining portion of dataset
  if isinstance(dataset, DiskDataset):
    rem_dataset = dataset
  else:
    rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                         dataset.ids)
  for fold in range(k):
    # Note frac_fold starts as 1/k since fold starts at 0. Ends at 1 since
    # fold goes up to k-1.
    frac_fold = 1. / (k - fold)
    train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
    fold_inds, rem_inds, _ = self.split(rem_dataset,
                                        frac_train=frac_fold,
                                        frac_valid=1 - frac_fold,
                                        frac_test=0)
    cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
    cv_datasets.append(cv_dataset)
    rem_dataset = rem_dataset.select(rem_inds)
    # Build lists rather than filter iterators so DiskDataset.merge can
    # consume them safely.
    train_ds_to_merge = [
        ds for ds in (train_ds_base, rem_dataset)
        if ds is not None and len(ds) > 0
    ]
    train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
    train_datasets.append(train_dataset)
    update_train_base_merge = [
        ds for ds in (train_ds_base, cv_dataset) if ds is not None
    ]
    train_ds_base = DiskDataset.merge(update_train_base_merge)
  return list(zip(train_datasets, cv_datasets))
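
# Usage sketch for the (train, cv) variant; `build_model` and the evaluate
# call are hypothetical stand-ins for whatever model API is in use:
#
#   for fold_num, (train, cv) in enumerate(splitter.k_fold_split(dataset, 5)):
#     model = build_model()   # hypothetical builder
#     model.fit(train)
#     score = model.evaluate(cv)   # hypothetical evaluation call
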
def hyperparam_search(self,
                      params_dict,
                      train_dataset,
                      valid_dataset,
                      output_transformers,
                      metric,
                      use_max=True,
                      logdir=None):
  """Perform hyperparams search according to params_dict.

  Each key to hyperparams_dict is a model_param. The values should be a list
  of potential values for that hyperparam.

  TODO(rbharath): This shouldn't be stored in a temporary directory.
  """
  hyperparams = params_dict.keys()
  hyperparam_vals = params_dict.values()
  for hyperparam_list in params_dict.values():
    # collections.Iterable was removed in Python 3.10; use collections.abc.
    assert isinstance(hyperparam_list, collections.abc.Iterable)

  number_combinations = reduce(mul, [len(vals) for vals in hyperparam_vals])

  if use_max:
    best_validation_score = -np.inf
  else:
    best_validation_score = np.inf
  best_hyperparams = None
  best_model, best_model_dir = None, None
  all_scores = {}
  for ind, hyperparameter_tuple in enumerate(
      itertools.product(*hyperparam_vals)):
    model_params = {}
    log("Fitting model %d/%d" % (ind + 1, number_combinations), self.verbose)
    for hyperparam, hyperparam_val in zip(hyperparams, hyperparameter_tuple):
      model_params[hyperparam] = hyperparam_val
    log("hyperparameters: %s" % str(model_params), self.verbose)

    if logdir is not None:
      model_dir = os.path.join(logdir, str(ind))
      log("model_dir is %s" % model_dir, self.verbose)
      try:
        os.makedirs(model_dir)
      except OSError:
        if not os.path.isdir(model_dir):
          log("Error creating model_dir, using tempfile directory",
              self.verbose)
          model_dir = tempfile.mkdtemp()
    else:
      model_dir = tempfile.mkdtemp()

    model = self.model_class(model_params, model_dir)
    model.fit(train_dataset, **model_params)
    model.save()

    evaluator = Evaluator(model, valid_dataset, output_transformers)
    multitask_scores = evaluator.compute_model_performance([metric])
    valid_score = multitask_scores[metric.name]
    all_scores[str(hyperparameter_tuple)] = valid_score

    if (use_max and valid_score >= best_validation_score) or (
        not use_max and valid_score <= best_validation_score):
      best_validation_score = valid_score
      best_hyperparams = hyperparameter_tuple
      if best_model_dir is not None:
        shutil.rmtree(best_model_dir)
      best_model_dir = model_dir
      best_model = model
    else:
      shutil.rmtree(model_dir)

    log("Model %d/%d, Metric %s, Validation set %s: %f" %
        (ind + 1, number_combinations, metric.name, ind, valid_score),
        self.verbose)
    log("\tbest_validation_score so far: %f" % best_validation_score,
        self.verbose)

  if best_model is None:
    log("No models trained correctly.", self.verbose)
    # arbitrarily return last model
    best_model, best_hyperparams = model, hyperparameter_tuple
    return best_model, best_hyperparams, all_scores

  train_evaluator = Evaluator(best_model, train_dataset, output_transformers)
  multitask_scores = train_evaluator.compute_model_performance([metric])
  train_score = multitask_scores[metric.name]
  log("Best hyperparameters: %s" % str(best_hyperparams), self.verbose)
  log("train_score: %f" % train_score, self.verbose)
  log("validation_score: %f" % best_validation_score, self.verbose)
  return best_model, best_hyperparams, all_scores
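
# A hedged usage sketch of hyperparam_search. The params_dict keys must match
# keyword arguments accepted by self.model_class; `optimizer` and the
# hyperparameter names are illustrative:
#
#   params_dict = {"learning_rate": [1e-2, 1e-3], "batch_size": [32, 64]}
#   best_model, best_hyperparams, all_scores = optimizer.hyperparam_search(
#       params_dict, train_dataset, valid_dataset, output_transformers,
#       metric, use_max=True)
#   # use_max=False selects the lowest metric value (e.g. for RMSE).
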