def run_batch_training_from_tuples():
    # Alternative per-chromosome input directories (commented out):
    # chr_paths = ["/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_40_980920/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_42_138805/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_43_176010/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_44_574894/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_46_366545/",
    #              "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/2018_10_20_13_47_47_822627/"]

    chr_paths = ["output/joint_runlength_base_model/2018_11_12_14_23_56_638745/"]

    trainer = JointClassifierTrainer()

    # Collect every pickled training-tuple file from each input directory
    all_file_paths = list()
    for path in chr_paths:
        file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=path, file_extension=".pkl")
        all_file_paths.extend(file_paths)

    # Accumulate joint (truth, observation) counts from the tuples and train the distribution
    counts = trainer.get_counts_from_tuples(paths=all_file_paths)
    distribution = trainer.train_model(counts)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    print("\nSAVING: ", os.path.join(distribution_output_dir, distribution_filename))
    FileManager.save_object_pickle(object=distribution, filename=distribution_filename, output_dir=distribution_output_dir)
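# The sketch below is editorial illustration, not part of the original pipeline: it shows one
# way to read back a distribution saved by run_batch_training_from_tuples(), assuming
# FileManager.save_object_pickle writes a standard pickle file (an assumption about
# FileManager, not confirmed in this file). The function name and path argument are hypothetical.
def example_load_saved_distribution(distribution_path):
    """
    Load a pickled distribution from disk, e.g. the file written by run_batch_training_from_tuples().
    """
    import pickle

    with open(distribution_path, "rb") as pickle_file:
        distribution = pickle.load(pickle_file)

    return distribution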
def train_joint_model_from_tuples(tuples_path):
    # Load pickled training tuples from the given path, using a cutoff of 16
    training_tuples = load_training_tuples(tuples_path, cutoff=16)

    print("training tuples loaded: ", len(training_tuples))

    # Train the joint distribution directly from the loaded tuples
    distribution = train_model(data=training_tuples)

    distribution_output_dir = "/home/ryan/code/nanopore_assembly/output/joint_runlength_base_model/distribution/"
    distribution_filename = "distribution_" + FileManager.get_datetime_string()

    FileManager.save_object_pickle(object=distribution, filename=distribution_filename, output_dir=distribution_output_dir)
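# Editorial sketch of the counting idea behind training a run-length distribution; this is NOT
# the project's train_model implementation. It assumes each training tuple is a
# (true_runlength, observed_runlength) pair of integers, which is an assumption about the tuple
# layout rather than something confirmed by this file.
def example_build_count_distribution(training_tuples, max_runlength=16):
    """
    Accumulate counts of (truth, observation) run-length pairs into a matrix and
    normalize each row into a conditional distribution over observations given the truth.
    """
    import numpy

    counts = numpy.zeros([max_runlength + 1, max_runlength + 1])

    for truth, observation in training_tuples:
        truth = min(int(truth), max_runlength)
        observation = min(int(observation), max_runlength)
        counts[truth, observation] += 1

    # small pseudocount so rows with no observations still normalize
    counts += 1e-9
    distribution = counts / counts.sum(axis=1, keepdims=True)

    return distribution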
def generate_training_data(data_loader, batch_size, consensus_caller, output_dir, filename_suffix, gap_filterer=None):
    # Iterate windows from the data loader, convert each into joint base/run-length
    # (truth, observation) tuples, and periodically pickle them to output_dir.
    n_files = len(data_loader)

    all_training_tuples = list()

    i = 0
    print("testing n windows: ", n_files)

    for b, batch in enumerate(data_loader):
        paths, x_pileup, y_pileup, x_repeat, y_repeat, reversal = batch

        # Optionally filter gaps from the batch; skip windows the filterer cannot handle
        if gap_filterer is not None:
            try:
                batch = gap_filterer.filter_batch(batch, plot=False)
                x_pileup, y_pileup, x_repeat, y_repeat, reversal = batch
            except ValueError as e:
                print("ERROR:", e)
                print("X PILEUP", x_pileup.shape)
                print("Y PILEUP", y_pileup.shape)
                print("X REPEAT", x_repeat.shape)
                print("Y REPEAT", y_repeat.shape)
                print("REVERSAL", reversal.shape)
                continue

        # (n,h,w) shape; note that this batch_size (taken from the tensor) shadows the argument
        batch_size, n_channels, height, width = x_pileup.shape

        for n in range(batch_size):
            # input shape = (batch_size, n_channels, height, width)
            # example x_pileup_n shape: (5, 44, 24)
            # example y_pileup_n shape: (5, 1, 24)
            # example x_repeat_n shape: (1, 44, 24)
            # example y_repeat_n shape: (1, 1, 24)
            x_pileup_n = x_pileup[n, :, :].reshape([n_channels, height, width])
            y_pileup_n = y_pileup[n, :, :].reshape([5, 1, width])
            x_repeat_n = x_repeat[n, :, :].reshape([1, height, width])
            y_repeat_n = y_repeat[n, :, :].reshape([1, width])
            reversal_n = reversal[n, :, :].reshape([1, height, width])

            truths_vs_observations = get_joint_base_runlength_observations_vs_truth(x_pileup=x_pileup_n,
                                                                                     y_pileup=y_pileup_n,
                                                                                     x_repeat=x_repeat_n,
                                                                                     y_repeat=y_repeat_n,
                                                                                     reversal=reversal_n,
                                                                                     path=paths[0])

            all_training_tuples.extend(truths_vs_observations)

        # i % 1 is always true, so progress is printed for every window
        if i % 1 == 0:
            sys.stdout.write("\r " + str(round(i / n_files * 100, 3)) + "% completed")

        # Periodically flush the accumulated tuples to a pickle and reset the buffer
        if i % 10000 == 0 or i == n_files - 1:
            filename = "training_data_" + filename_suffix + "_" + str(i)
            print("\nSAVING: ", os.path.join(output_dir, filename))
            FileManager.save_object_pickle(output_dir=output_dir, filename=filename, object=all_training_tuples)

            all_training_tuples = list()

        i += 1

    return output_dir
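# Editorial sketch only: a quick sanity check on the output of generate_training_data(),
# counting how many training tuples were pickled into output_dir. It assumes
# FileManager.get_all_file_paths_by_type exists as used above and that the ".pkl" files are
# standard pickles of tuple lists (an assumption about FileManager's pickle format).
def example_count_saved_training_tuples(output_dir):
    """
    Return the total number of training tuples across all pickle files in output_dir.
    """
    import pickle

    total = 0
    file_paths = FileManager.get_all_file_paths_by_type(parent_directory_path=output_dir, file_extension=".pkl")

    for path in file_paths:
        with open(path, "rb") as pickle_file:
            total += len(pickle.load(pickle_file))

    return total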