def train_test_split(input_transforms, output_transforms, feature_types,
                     splittype, data_dir):
  """Splits featurized samples into train/test datasets and transforms them."""
  samples_dir = os.path.join(data_dir, "samples")
  samples = FeaturizedSamples(samples_dir, reload_data=True)

  print("Split data into train/test")
  train_samples_dir = os.path.join(data_dir, "train-samples")
  test_samples_dir = os.path.join(data_dir, "test-samples")
  train_samples, test_samples = samples.train_test_split(
      splittype, train_samples_dir, test_samples_dir)

  train_data_dir = os.path.join(data_dir, "train-data")
  test_data_dir = os.path.join(data_dir, "test-data")
  print("Generating train dataset.")
  train_dataset = Dataset(train_data_dir, train_samples, feature_types)
  print("Generating test dataset.")
  test_dataset = Dataset(test_data_dir, test_samples, feature_types)

  print("Transforming train data.")
  train_dataset.transform(input_transforms, output_transforms)
  print("Transforming test data.")
  test_dataset.transform(input_transforms, output_transforms)
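# Hypothetical usage sketch (not part of the original module): shows how the
# helper above might be invoked, assuming `data_dir` already contains a
# "samples" directory written by an earlier featurization step. The path,
# split type, and empty transform lists are illustrative assumptions.
def _example_train_test_split(data_dir="/tmp/my-dataset"):
  """Sketch of a typical call to train_test_split."""
  train_test_split(input_transforms=[],
                   output_transforms=[],
                   feature_types=["ECFP"],
                   splittype="scaffold",
                   data_dir=data_dir)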
def test_API(self):
  """Straightforward test of multitask deepchem classification API."""
  splittype = "scaffold"
  feature_types = ["ECFP"]
  output_transforms = []
  input_transforms = []
  task_type = "classification"
  # TODO(rbharath): There should be some automatic check to ensure that all
  # required model_params are specified.
  model_params = {"nb_hidden": 10, "activation": "relu",
                  "dropout": .5, "learning_rate": .01,
                  "momentum": .9, "nesterov": False,
                  "decay": 1e-4, "batch_size": 5,
                  "nb_epoch": 2}
  model_name = "multitask_deep_classifier"

  # Featurize input
  featurizer = DataFeaturizer(tasks=self.tasks,
                              smiles_field=self.smiles_field,
                              verbose=True)
  feature_files = featurizer.featurize(self.input_file, feature_types,
                                       self.feature_dir)

  # Transform data into arrays for ML
  samples = FeaturizedSamples(self.samplesdir, feature_files,
                              reload_data=False)

  # Split into train/test
  train_samples, test_samples = samples.train_test_split(
      splittype, self.train_dir, self.test_dir)

  train_dataset = Dataset(self.train_dir, train_samples, feature_types)
  test_dataset = Dataset(self.test_dir, test_samples, feature_types)

  # Transform train/test data
  train_dataset.transform(input_transforms, output_transforms)
  test_dataset.transform(input_transforms, output_transforms)

  # Fit model
  task_types = {task: task_type for task in self.tasks}
  model_params["data_shape"] = train_dataset.get_data_shape()
  model = Model.model_builder(model_name, task_types, model_params)
  model.fit(train_dataset)
  model.save(self.model_dir)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, verbose=True)
  with tempfile.NamedTemporaryFile() as test_csv_out:
    with tempfile.NamedTemporaryFile() as test_stats_out:
      evaluator.compute_model_performance(test_csv_out, test_stats_out)
def _df_to_numpy(df, feature_types):
  """Transforms a featurized dataset df into a standard set of numpy arrays."""
  if not set(feature_types).issubset(df.keys()):
    raise ValueError(
        "Featurized data does not support requested feature_types.")
  # Extract labels for all tasks in a consistent (sorted) order
  n_samples = df.shape[0]
  sorted_tasks = FeaturizedSamples.get_sorted_task_names(df)
  n_tasks = len(sorted_tasks)
  y = df[sorted_tasks].values
  y = np.reshape(y, (n_samples, n_tasks))
  w = np.ones((n_samples, n_tasks))

  # Concatenate the requested feature columns into one feature vector per row
  tensors = []
  for _, datapoint in df.iterrows():
    feature_list = []
    for feature_type in feature_types:
      feature_list.append(datapoint[feature_type])
    features = np.squeeze(np.concatenate(feature_list))
    tensors.append(features)
  x = np.stack(tensors)
  sorted_ids = df["mol_id"]

  # Set missing data to have weight zero
  missing = (y.astype(object) == "")
  y[missing] = 0.
  w[missing] = 0.
  return sorted_ids, x.astype(float), y.astype(float), w.astype(float)
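# Hypothetical usage sketch (not part of the original module): documents the
# shapes returned by _df_to_numpy, assuming `df` is a featurized DataFrame
# with a "mol_id" column, per-task label columns, and an "ECFP" column of
# fingerprint arrays.
def _example_df_to_numpy(df):
  """Sketch of a typical call to _df_to_numpy."""
  ids, X, y, w = _df_to_numpy(df, feature_types=["ECFP"])
  # ids: (n_samples,) molecule identifiers
  # X:   (n_samples, n_features) float feature matrix
  # y:   (n_samples, n_tasks) float labels; missing entries are zeroed
  # w:   (n_samples, n_tasks) weights; 0. wherever the label was missing
  return ids, X, y, w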
def write_dataset_single(val, data_dir, feature_types):
  """Writes files (X, y, w, ids, X-transformed, ...) for a single shard to disk."""
  (df_file, df) = val
  # TODO(rbharath): This is a hack. clean up.
  if not len(df):
    return None
  task_names = FeaturizedSamples.get_sorted_task_names(df)
  ids, X, y, w = _df_to_numpy(df, feature_types)
  X_sums, X_sum_squares, X_n = compute_sums_and_nb_sample(X)
  y_sums, y_sum_squares, y_n = compute_sums_and_nb_sample(y, w)

  basename = os.path.splitext(os.path.basename(df_file))[0]
  out_X = os.path.join(data_dir, "%s-X.joblib" % basename)
  out_X_transformed = os.path.join(data_dir,
                                   "%s-X-transformed.joblib" % basename)
  out_y = os.path.join(data_dir, "%s-y.joblib" % basename)
  out_y_transformed = os.path.join(data_dir,
                                   "%s-y-transformed.joblib" % basename)
  out_w = os.path.join(data_dir, "%s-w.joblib" % basename)
  out_ids = os.path.join(data_dir, "%s-ids.joblib" % basename)

  save_to_disk(X, out_X)
  save_to_disk(y, out_y)
  save_to_disk(w, out_w)
  save_to_disk(ids, out_ids)
  # TODO(rbharath): Should X be saved to out_X_transformed as well? Since
  # itershards expects to loop over X-transformed? (Ditto for y/w)
  return [df_file, task_names, out_ids, out_X, out_X_transformed,
          out_y, out_y_transformed, out_w,
          X_sums, X_sum_squares, X_n,
          y_sums, y_sum_squares, y_n]
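# Hypothetical usage sketch (not part of the original module): shows the
# metadata row returned by write_dataset_single for one shard, assuming
# `df_file` is the path the shard was read from and `df` the corresponding
# featurized DataFrame.
def _example_write_dataset_single(df_file, df, data_dir):
  """Sketch of a typical call to write_dataset_single."""
  metadata = write_dataset_single((df_file, df), data_dir,
                                  feature_types=["ECFP"])
  # metadata: [df_file, task_names, out_ids, out_X, out_X_transformed,
  #            out_y, out_y_transformed, out_w,
  #            X_sums, X_sum_squares, X_n, y_sums, y_sum_squares, y_n]
  return metadata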
def _create_model(self, splittype, feature_types, input_transforms,
                  output_transforms, task_type, model_params, model_name,
                  input_file, tasks, protein_pdb_field=None,
                  ligand_pdb_field=None):
  """Helper method to create a model for a test."""
  # Featurize input
  input_file = os.path.join(self.current_dir, input_file)
  featurizer = DataFeaturizer(tasks=tasks,
                              smiles_field=self.smiles_field,
                              protein_pdb_field=protein_pdb_field,
                              ligand_pdb_field=ligand_pdb_field,
                              verbose=True)
  feature_files = featurizer.featurize(input_file, feature_types,
                                       self.feature_dir)

  # Transform data into arrays for ML
  samples = FeaturizedSamples(self.samplesdir, feature_files,
                              reload_data=False)

  # Split into train/test
  train_samples, test_samples = samples.train_test_split(
      splittype, self.train_dir, self.test_dir)

  train_dataset = Dataset(self.train_dir, train_samples, feature_types)
  test_dataset = Dataset(self.test_dir, test_samples, feature_types)

  # Transform train/test data
  train_dataset.transform(input_transforms, output_transforms)
  test_dataset.transform(input_transforms, output_transforms)

  # Fit model
  task_types = {task: task_type for task in tasks}
  model_params["data_shape"] = train_dataset.get_data_shape()
  model = Model.model_builder(model_name, task_types, model_params)
  model.fit(train_dataset)
  model.save(self.model_dir)

  # Eval model on test
  evaluator = Evaluator(model, test_dataset, verbose=True)
  with tempfile.NamedTemporaryFile() as test_csv_out:
    with tempfile.NamedTemporaryFile() as test_stats_out:
      _, _ = evaluator.compute_model_performance(
          test_csv_out, test_stats_out)