def train(self, data, verbose=True):
    """
    Train all models and return the best one.

    Models are evaluated and ranked according to their ROC-AUC on a
    validation data set.

    Parameters
    ----------
    data: pysster.Data
        A Data object providing training and validation data sets.

    verbose: bool
        If True, progress information (train/val loss) will be printed
        throughout the training.

    Returns
    -------
    results: tuple(pysster.Model, str)
        The best performing model and an overview table of all models
        are returned.
    """
    # The best model seen so far is checkpointed under a random name in
    # the temp directory and reloaded once all candidates were trained.
    best_model_path = "{}/{}".format(
        gettempdir(),
        ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
    aucs = []
    max_auroc = -1
    try:
        for i, candidate in enumerate(self.candidates):
            model = Model(candidate, data)
            model.train(data, verbose)
            predictions = model.predict(data, "val")
            labels = data.get_labels("val")
            report = utils.performance_report(labels, predictions)
            # Average the report rows, weighted by the last column;
            # entry 3 of the averaged row is taken as the ROC-AUC.
            roc_auc = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis], axis=0)
            roc_auc = (roc_auc / np.sum(report[:, -1]))[3]
            aucs.append(roc_auc)
            if aucs[-1] > max_auroc:
                max_auroc = aucs[-1]
                utils.save_model(model, best_model_path)
            K.clear_session()
            K.reset_uids()
            if verbose:
                print("\n=== Summary ===")
                print("Model {}/{} = {:.5f} weighted avg roc-auc".format(
                    i + 1, len(self.candidates), aucs[i]))
                for param in candidate:
                    if param not in ["input_shape"]:
                        print(" - {}: {}".format(param, candidate[param]))
        # load the best model (it is removed from disc below)
        model = utils.load_model(best_model_path)
    finally:
        # Always clean up the checkpoint files, also when training or
        # loading failed, so aborted runs do not litter the temp directory.
        for path in (best_model_path, "{}.h5".format(best_model_path)):
            try:
                remove(path)
            except FileNotFoundError:
                # nothing was written to this path before the failure
                pass
    # save a formatted summary of all trained models
    table = self._grid_search_table(aucs)
    return model, table
def test_utils_save_load_model(self):
    """Round-trip self.m1 through utils.save_model / utils.load_model.

    Checks that both files appear on disc, that params and the model
    config survive the round trip, and that the first six weight arrays
    are numerically close. The files are removed at the end.
    """
    tmp_dir = gettempdir()
    model_path = tmp_dir + "/model"
    h5_path = tmp_dir + "/model.h5"

    utils.save_model(self.m1, model_path)
    self.assertTrue(isfile(model_path))
    self.assertTrue(isfile(h5_path))

    restored = utils.load_model(model_path)
    self.assertTrue(self.m1.params == restored.params)
    self.assertTrue(
        self.m1.model.get_config() == restored.model.get_config())

    expected_weights = self.m1.model.get_weights()
    restored_weights = restored.model.get_weights()
    for idx in range(6):
        self.assertTrue(
            np.allclose(expected_weights[idx], restored_weights[idx]))

    remove(model_path)
    remove(h5_path)
def train(self, data, pr_auc=False, verbose=True):
    """
    Train all models and return the best one.

    Models are evaluated and ranked according to their ROC-AUC or PR-AUC
    (precision-recall) on a validation data set.

    Parameters
    ----------
    data: pysster.Data
        A Data object providing training and validation data sets.

    pr_auc: bool
        If True, the area under the precision-recall curve will be
        maximized instead of the area under the ROC curve

    verbose: bool
        If True, progress information (train/val loss) will be printed
        throughout the training.

    Returns
    -------
    results: tuple(pysster.Model, str)
        The best performing model and an overview table of all models
        are returned.
    """
    # The best model seen so far is checkpointed under a random name in
    # the temp directory and reloaded once all candidates were trained.
    best_model_path = "{}/{}".format(
        gettempdir(),
        ''.join(random.choice(string.ascii_uppercase) for _ in range(20)))
    # Select which entry of the averaged performance report is maximized.
    # The labels are printed and handed to the summary table unchanged.
    if pr_auc:
        metric_idx = 4
        metric_name = "pre-auc"
    else:
        metric_idx = 3
        metric_name = "roc-auc"
    metric = []
    max_metric = -1
    try:
        for i, candidate in enumerate(self.candidates):
            model = Model(candidate, data)
            model.train(data, verbose)
            predictions = model.predict(data, "val")
            labels = data.get_labels("val")
            report = utils.performance_report(labels, predictions)
            # Average the report rows, weighted by the last column, then
            # pick the selected metric from the averaged row.
            metric_val = np.sum(report[:, 0:-1] * report[:, -1, np.newaxis], axis=0)
            metric_val = (metric_val / np.sum(report[:, -1]))[metric_idx]
            metric.append(metric_val)
            if metric[-1] > max_metric:
                max_metric = metric[-1]
                utils.save_model(model, best_model_path)
            K.clear_session()
            K.reset_uids()
            if verbose:
                print("\n=== Summary ===")
                print("Model {}/{} = {:.5f} weighted avg {}".format(
                    i + 1, len(self.candidates), metric[i], metric_name))
                for param in candidate:
                    if param not in ["input_shape"]:
                        print(" - {}: {}".format(param, candidate[param]))
        # load the best model (it is removed from disc below)
        model = utils.load_model(best_model_path)
    finally:
        # Always clean up the checkpoint files, also when training or
        # loading failed, so aborted runs do not litter the temp directory.
        for path in (best_model_path, "{}.h5".format(best_model_path)):
            try:
                remove(path)
            except FileNotFoundError:
                # nothing was written to this path before the failure
                pass
    # save a formatted summary of all trained models
    table = self._grid_search_table(metric, metric_name)
    return model, table
import os from pysster.Data import Data from pysster import utils from IPython.display import Image DATA = "/mnt/isilon/dbhi_bfx/perry/brian/" #establish output directory output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_10_17_18_2_feats/" if not os.path.isdir(output_folder): os.makedirs(output_folder) #load the pysster prediction model model = utils.load_model( "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/run_10_17_18_2_feats/model.pkl" ) add_cgi_features = [ DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__microsat.out" ] add_both_features = [x.replace('cgi.', 'both.') for x in add_cgi_features] indel_len_feat = [ DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__indel_length.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/both.indel.unsample__indel_length.out" ] #load the dataset as data data = Data([
import os from pysster.Data import Data from pysster import utils from IPython.display import Image DATA = "/mnt/isilon/dbhi_bfx/perry/brian/" #establish output directory output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_1_4_19_kav_8k_each_all_feats/" if not os.path.isdir(output_folder): os.makedirs(output_folder) #load the pysster prediction model model = utils.load_model( "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_1_4_19_kav_8k_each/model.pkl" ) add_cgi_features = [ DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__microsat.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__lowmappabilityall.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__notinlowmappabilityall.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__siren_similarRegions_dist1.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__segdupall.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__notinsegdupall.out", DATA + "explore-cgi/data/interim/cgi_ind_exp/add_feat/cgi.indel.unsample__notinrefseq_union_cds.sort.out", DATA +
import os from pysster.Data import Data from pysster import utils from IPython.display import Image DATA = "/mnt/isilon/dbhi_bfx/perry/brian/" #establish output directory output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_run_12_5_18_tar_cgi_kav_both/" if not os.path.isdir(output_folder): os.makedirs(output_folder) #load the pysster prediction model model = utils.load_model( "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_10_18_18_all_add_feats_back/model.pkl" ) add_cgi_features = [ DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.microsat.out", DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.lowmappabilityall.out", DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.notinlowmappabilityall.out", DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.siren_similarRegions_dist1.out", DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.segdupall.out", DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.notinsegdupall.out", DATA + "explore-cgi/data/interim/target_exp/add_feats/cgi.indel.notinrefseq_union_cds.sort.out", DATA +
import os
from pysster.Data import Data
from pysster import utils
from IPython.display import Image

# Root directory under which the data and output paths below are built.
DATA = "/mnt/isilon/dbhi_bfx/perry/brian/"

# Make sure the output directory for this run exists.
output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_test_run_ref_fa_10_24_18/"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

# Load the previously trained pysster prediction model.
model_file = "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/cgi_ind_exp/pysster_output/train_run_ref_seq_only_sampled_10_22_18/model.pkl"
model = utils.load_model(model_file)

# Build the data set from the two fasta files, using a single alphabet.
fasta_files = [
    DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_ref_fa/cgi.indel.unsample.fa.gz",
    DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_ref_fa/both.indel.unsample.fa.gz",
]
alphabet = "ACGT"
data = Data(fasta_files, alphabet)

# Run the model on the complete data set and fetch the matching labels.
# The bare names are kept from the original; they only echo a value when
# the statements are entered in an interactive session.
predictions = model.predict(data, "all")
predictions
labels = data.get_labels("all")
labels

# Write the ROC plot into the output directory.
roc_file = output_folder + "roc.png"
utils.plot_roc(labels, predictions, roc_file)
else: pass return chroms, starts, ends #establish output directory and take output directory name for csv file of labels and predictions output_folder = DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_output/model_run_12_21_18_kav_cgi_8k_samp_kav_both_8k_samp/" if not os.path.isdir(output_folder): os.makedirs(output_folder) class_file_name = output_folder.split('/')[-2] #load the pysster prediction model model = utils.load_model( "/mnt/isilon/dbhi_bfx/perry/brian/explore_cgi/data/interim/target_exp/pysster_output/target_train_model_no_add_feats_12_21_18/model.pkl" ) #load the dataset as data data = Data([ DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/cgi.indel.sample.fa.gz", DATA + "explore-cgi/data/interim/cgi_ind_exp/pysster_fa/both.indel.sample.fa.gz" ], ("ACGT", "XDI")) #run the model of pysster on all of the data set predictions = model.predict(data, "all") labels = data.get_labels("all")