def test_list_dir(self):
    """Test if list_dir is working"""
    here = os.path.dirname(os.path.abspath(__file__))
    canonical = os.path.join(here, "test_files/minion-reads/canonical")
    read_names = [
        "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read207_strand.fast5",
        "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read214_strand.fast5",
        "miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read280_strand.fast5",
    ]
    # expected listing: the three known reads, as absolute sorted paths
    want = sorted(os.path.join(canonical, name) for name in read_names)
    self.assertEqual(sorted(list_dir(canonical)), want)
    self.assertEqual(sorted(list_dir(canonical, ext="fast5")), want)
def trim_signal_wrapper(dir, outdir):
    """Wrapper for trim signal function used for whole directory of signal
    and label files.

    For each ``*.signal`` file in ``dir`` the sibling ``*.label`` file is
    located and the pair is passed to ``trim_signal``.

    :param dir: directory containing paired .signal / .label files
    :param outdir: directory where trimmed output files are written
    :return: list of output paths returned by trim_signal for each pair
    """
    # NOTE: the original also listed '.label' files into an unused local;
    # that dead directory scan has been removed.
    signal_files = list_dir(dir, ext='signal')
    out_files = []
    for signal_f in signal_files:
        f_label = os.path.splitext(signal_f)[0] + '.label'
        try:
            # skip signal files whose label partner is missing or whose
            # trimming fails with a ValueError
            assert os.path.isfile(f_label)
            outpath = trim_signal(signal_f, f_label, outdir)
            out_files.append(outpath)
        except (AssertionError, ValueError):
            print("cannot find {}".format(f_label), file=sys.stderr)
            continue
    return out_files
def find_accuracy(fasta_dir, label_dir):
    """Print alignment summary statistics for every fasta/label pair.

    Pairs each fasta file in ``fasta_dir`` with the like-named ``.label``
    file in ``label_dir`` (by the filename stem before the first dot).

    :param fasta_dir: directory of fasta files to evaluate
    :param label_dir: directory of matching .label files
    :return: True (always; results are reported via create_summary_stats)
    """
    for fasta in list_dir(fasta_dir):
        name = fasta.split('/')[-1].split('.')[0]
        label_file = os.path.join(label_dir, name + '.label')
        alignment = create_alignment(fasta, label_file)
        # alignment_stats also returns per-base counts; only totals are used
        total_counts, _ = alignment_stats(alignment)
        create_summary_stats(total_counts)
    return True
def match_label_fasta(fasta_dir, label_dir):
    """match up label files with fasta files from chiron output"""
    matched = []
    for fasta_path in list_dir(fasta_dir, ext='fasta'):
        # filename stem (no extension) names the companion label file
        stem = os.path.splitext(fasta_path)[0].split('/')[-1]
        label_path = os.path.join(label_dir, stem + '.label')
        if not os.path.exists(label_path):
            print("file not found: {}".format(label_path))
            continue
        matched.append([fasta_path, label_path])
    return matched
def _event_lengths(label_dir):
    """Collect the event-length column from every .label file in label_dir."""
    lengths = []
    for label_f in list_dir(label_dir, ext='label'):
        events = read_label(label_f)
        lengths.extend(events.length)
    return lengths


def main():
    """Plot a histogram comparing RNA vs DNA event lengths.

    NOTE(review): paths are hard-coded to the author's machine; this script
    only runs there unless the paths are edited.
    """
    start = timer()
    # the original duplicated the collection loop for RNA and DNA;
    # both now share the _event_lengths helper
    rna_event_lengths = _event_lengths(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/nanotensor/visualization/rna_training/training")
    dna_event_lengths = _event_lengths(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/nanotensor/visualization/dna_training/training")
    outpath = "event_hist.png"
    plot_histogram(rna_event_lengths, dna_event_lengths, outpath)
    # densityplot_events([rna_event_lengths, dna_event_lengths], "event_density.png")
    stop = timer()
    print("Running Time = {} seconds".format(stop - start), file=sys.stderr)
def create_label_chiron_data_args(fast5dir, output_dir, output_name, verbose=False):
    """Create arguments for label_chiron_data function"""
    assert os.path.isdir(fast5dir) is True, "fast5 directory does not exist"
    assert os.path.isdir(output_dir) is True, "output directory does not exist"
    # one argument dict per fast5 file; names are suffixed 0, 1, 2, ...
    for index, read_path in enumerate(list_dir(fast5dir, ext="fast5")):
        yield dict(fast5_path=read_path,
                   output_dir=output_dir,
                   name=output_name + str(index),
                   verbose=verbose)
def main():
    """Label every methylated fast5 read in the test directory.

    Builds one argument dict per fast5 file via
    create_label_chiron_data_args and runs label_chiron_data on each.

    NOTE(review): paths are hard-coded to the author's machine. The original
    also contained a large amount of commented-out experimentation code and
    an unused ``files = list_dir(...)`` scan; both have been removed.
    """
    start = timer()
    arg_generator = create_label_chiron_data_args(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/test_files/minion-reads/methylated",
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/test_files/minion-reads/test_methylated",
        "methylated_aligned",
        verbose=True)
    for args in arg_generator:
        label_chiron_data(fast5_path=args["fast5_path"],
                          output_dir=args["output_dir"],
                          name=args["name"])
    stop = timer()
    print("Running Time = {} seconds".format(stop - start), file=sys.stderr)
def main(in_opts=None):
    """Main entry point for creating a training data set.

    :param in_opts: optional pre-parsed options; when None, options are read
        from the command line / config file
    :return: path of the timestamped log directory that was created
    """
    start = timer()
    # allow for a command line to be input into main
    if in_opts is None:
        # get arguments from command line or config file
        command_line = CommandLine()
        args = get_arguments(command_line)
    else:
        command_line = CommandLine(in_opts=in_opts)
        args = command_line.args
    try:
        # make sure arguments are the right format
        args = CommandLine.check_args(args)
        # create directory in the output directory and save the config there
        log_dir_path = create_time_directory(args.output_dir)
        save_config_file(args, log_dir_path)
        # reset output directory to new log directory so files are written
        # to the correct location
        args.output_dir = log_dir_path
        if args.chiron:
            call_nanoraw(args.fast5_dir, args.reference, args.num_cpu,
                         overwrite=args.overwrite)
            arg_generator = create_label_chiron_data_args(
                args.fast5_dir, args.output_dir,
                output_name=args.file_prefix, verbose=args.verbose)
            target = label_chiron_data_multiprocess_wrapper
        else:
            log_file = args.log_file
            print("Using log file {}".format(log_file), file=sys.stderr)
            arg_generator = create_training_data_args(log_file,
                                                      args.file_prefix, args)
            target = create_training_data
        if args.debug:
            # serial execution for easier debugging
            for arg in arg_generator:
                target(arg)
        else:
            num_workers = args.num_cpu
            multiprocess_data(num_workers, target, arg_generator)
        # if tar flag is set, bundle the output files into a tarball
        if args.tar:
            tar_name = get_tar_name("training_data", args.output_dir,
                                    args.nanonet, args.deepnano, args.chiron)
            file_paths = list_dir(args.output_dir)
            print("Creating tarball file\n", file=sys.stderr)
            tar_path = tarball_files(tar_name, file_paths,
                                     output_dir=args.output_dir)
            print("Finished tarball file : {}\n".format(tar_path),
                  file=sys.stderr)
            if args.save2s3:
                print("Uploading {} to s3 bucket {}".format(tar_path, args.bucket),
                      file=sys.stderr)
                upload_file_to_s3(args.bucket, tar_path, tar_name)
        # the original printed this banner twice in a row; once is enough
        print("\n# nanotensor - finished creating data set\n", file=sys.stderr)
        # check how long the whole program took
        stop = timer()
        print("Running Time = {} seconds".format(stop - start), file=sys.stderr)
    except Usage as err:
        command_line.do_usage_and_die(err.msg)
    return log_dir_path
def __init__(self, args):
    """Configure a model run (train, test, or inference) from parsed args.

    Selects the dataset and graph classes named in the config, builds the
    TF global step and optimizer, lists the input files for the chosen
    mode, then loads the data and initializes the model.

    :param args: parsed configuration object with at least ``CreateDataset``
        and ``BuildGraph`` sections plus mode flags (``train``/``test``/
        ``inference``) — presumably a namespace built elsewhere; confirm.
    """
    self.args = args
    # get correct data input pipeline: map config name -> dataset class
    dataset_options = {"FullSignalSequence": FullSignalSequence,
                       "MotifSequence": MotifSequence,
                       "NumpyEventData": NumpyEventData,
                       "PostProcessGlove": PostProcessGlove,
                       "CharacterEmbedding": CharacterEmbedding,
                       "RandomZInput": RandomZInput}
    self.Dataset = dataset_options[self.args.CreateDataset.dataset]
    # pick graph: map config name -> graph/loss class
    graph_options = {"CtcLoss": CtcLoss,
                     "CrossEntropy": CrossEntropy,
                     "LastLSTMOutput": LastLSTMOutput,
                     "Seq2SeqGenerator": Seq2SeqGenerator}
    self.Graph = graph_options[self.args.BuildGraph.graph]
    # non-trainable step counter shared by the optimizer and LR schedule
    self.global_step = tf.get_variable('global_step', [],
                                       initializer=tf.constant_initializer(0),
                                       trainable=False)
    if self.args.train:
        self.training_files = list_dir(self.args.CreateDataset.training_dir)
        self.validation_files = list_dir(self.args.CreateDataset.validation_dir)
        # placeholder attribute names; replaced later during setup —
        # NOTE(review): these string values look like stand-ins, confirm
        self.training = "CreateDataset"
        self.validation = "CreateDataset"
        self.train_op = "train_op"
        self.training_model = "BuildGraph"
        self.validation_model = "BuildGraph"
        self.cost_diff_summary = None
        self.tower_grads = []
        # decay LR by 4% every 100k steps (staircase)
        learning_rate = tf.train.exponential_decay(self.args.learning_rate,
                                                   self.global_step,
                                                   100000, 0.96,
                                                   staircase=True)
        self.opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
    elif self.args.test:
        self.test_files = list_dir(self.args.CreateDataset.test_dir)
        self.testing = "CreateDataset"
        self.testing_model = "BuildGraph"
        self.testing_opts = []
    elif self.args.inference:
        self.inference_files = list_dir(self.args.CreateDataset.inference_dir,
                                        ext=self.args.file_ext)
        self.inference = "CreateDataset"
        self.inference_model = "BuildGraph"
        self.inference_opts = []
    # load data
    log.info("Data Loading Started")
    speed = time_it(self.load_data)
    log.info("Data Loading took {} seconds to complete".format(speed))
    # look for GPU's
    self.gpu_indexes = test_for_nvidia_gpu(self.args.num_gpu)
    # initialize model
    log.info("Initializing Model")
    speed = time_it(self.initialize_model)
    log.info("Model Initialization took {} seconds to complete".format(speed))
    self.summaries = tf.summary.merge_all()
    self.start = datetime.now()
    # either resume from the latest checkpoint or use an explicit model path
    if self.args.use_checkpoint:
        self.model_path = tf.train.latest_checkpoint(self.args.trained_model)
    else:
        self.model_path = self.args.trained_model_path
elif self.mode == 1:
    # validation: zip signal, sequence-length and label datasets together
    dataset = tf.data.Dataset.zip((self.datasetX, self.datasetSeq, self.datasetY))
    dataset = dataset.batch(self.batch_size)
# inference
elif self.mode == 2:
    # inference needs to be done per file
    dataset = tf.data.Dataset.from_generator(
        self.load_data_inference, (tf.float32, tf.int32),
        (tf.TensorShape([self.seq_len]), tf.TensorShape(None)))
    dataset = dataset.batch(self.batch_size)
# NOTE(review): original indentation is ambiguous — prefetch/return are
# assumed to apply to every mode, not just mode 2; confirm
dataset = dataset.prefetch(buffer_size=self.prefetch_buffer_size)
return dataset


if __name__ == "__main__":
    # smoke-test the dataset classes against the author's local data
    file_list = list_dir("/Users/andrewbailey/CLionProjects/nanopore-RNN/chiron/data/raw")
    motif = MotifSequence(file_list, mode=0, batch_size=10, verbose=True,
                          seq_len=100, n_epochs=5)
    full = FullSignalSequence(file_list, mode=0, batch_size=10, verbose=True,
                              seq_len=100, n_epochs=5)
    file_list = list_dir(
        "/Users/andrewbailey/CLionProjects/nanopore-RNN/test_files/create_training_files/07Jul-20-11h-28m")
    full = NumpyEventData(file_list, mode=0, batch_size=10, verbose=True,
                          seq_len=100, n_epochs=5)
    print("This file is just a library", file=sys.stderr)
    raise SystemExit