def test_strand_list_no_read_id_multiread(self):
    """Check read ids are found when the strand list holds filenames only.

    Uses "strand_list_no_read_id.txt" from STRAND_LIST_DIR as the strand
    list while iterating MULTIREAD_DIR, then passes the result to
    _check_found_read_ids.
    """
    strand_list_path = os.path.join(self.STRAND_LIST_DIR,
                                    "strand_list_no_read_id.txt")
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR,
                                      strand_list=strand_list_path)
    self._check_found_read_ids(found_reads)
def test_strand_list_no_filename_single_reads(self):
    """Check read ids are found for single-read files with a strand list.

    Uses "strand_list_no_filename.txt" from STRAND_LIST_DIR as the strand
    list while iterating READ_DIR, then passes the result to
    _check_found_read_ids.
    """
    strand_list_path = os.path.join(self.STRAND_LIST_DIR,
                                    "strand_list_no_filename.txt")
    found_reads = iterate_fast5_reads(self.READ_DIR,
                                      strand_list=strand_list_path)
    self._check_found_read_ids(found_reads)
def main():
    """Run squiggle matching over a set of reads and write a text table.

    Parses the command line, loads the model, and maps
    squiggle_match.worker over the reads with imap_mp.  For every result
    that is not None, a '#<read_id> <score>' header line is written,
    followed by one tab-separated row per (signal, path) pair.
    """
    args = get_parser().parse_args()

    model = helpers.load_model(args.model)
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.read_dir,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    # One row per signal sample: eight tab-separated fields
    row_template = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'

    with helpers.open_file_or_stdout(args.output) as fh:
        worker_kwargs = helpers.get_kwargs(
            args, ['back_prob', 'localpen', 'minscore', 'trim'])
        matches = imap_mp(
            squiggle_match.worker, fast5_reads, threads=args.jobs,
            fix_kwargs=worker_kwargs, unordered=True,
            init=squiggle_match.init_worker,
            initargs=[model, args.references])

        for match in matches:
            if match is None:
                # Worker produced nothing for this read
                continue

            read_id, sig, score, path, squiggle, bases = match
            bases = bases.decode('ascii')

            fh.write('#{} {}\n'.format(read_id, score))
            for idx, (sample, pos) in enumerate(zip(sig, path)):
                fh.write(row_template.format(
                    read_id, idx, sample, pos, bases[pos],
                    squiggle[pos, 0], squiggle[pos, 1], squiggle[pos, 2]))
def main(argv):
    """Remap every selected read and write the mapping output.

    Refuses to run (exit status 1) when the output path already exists
    and overwriting was not requested.  Each read is processed by
    prepare_mapping_funcs.oneread_remap via imap_mp and the results are
    handed to prepare_mapping_funcs.generate_output_from_results.

    NOTE(review): `argv` is accepted but not used; arguments are parsed
    by `parser.parse_args()` with no argument list -- confirm intended.
    """
    args = parser.parse_args()
    print("Running prepare_mapping using flip-flop remapping")

    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    # Iterator over all the reads we're interested in
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list)

    # Keyword arguments shared by every call of the per-read worker
    worker_kwargs = helpers.get_kwargs(
        args, ['alphabet', 'collapse_alphabet', 'device'])
    per_read_params = prepare_mapping_funcs.get_per_read_params_dict_from_tsv(
        args.input_per_read_params)
    worker_kwargs['per_read_params_dict'] = per_read_params
    worker_kwargs['references'] = helpers.fasta_file_to_dict(args.references)
    worker_kwargs['model'] = helpers.load_model(args.model)

    # results is an iterable of dicts; each dict is the set of return
    # values for a single read
    results = imap_mp(
        prepare_mapping_funcs.oneread_remap,
        fast5_reads,
        threads=args.jobs,
        fix_kwargs=worker_kwargs,
        unordered=True)

    prepare_mapping_funcs.generate_output_from_results(results, args)
def test_strand_list_invalid(self):
    """Iterating with a strand list lacking a header line must raise.

    The loop body only prints each (filename, read id) pair; the test
    passes when any Exception is raised inside the assertRaises context.
    """
    bad_strand_list = os.path.join(self.STRAND_LIST_DIR,
                                   "invalid_strand_list_no_header.txt")
    with self.assertRaises(Exception):
        for filename, read_id in iterate_fast5_reads(
                self.MULTIREAD_DIR, strand_list=bad_strand_list):
            print("Filename=", filename, "read_id=", read_id)
def main():
    """Write a per-read table of trim, shift and scale values as TSV.

    Maps one_read_shift_scale over the selected reads with imap_mp and
    writes one row per result whose elements are all truthy.  The trim
    values come straight from the command line and are repeated on
    every row.
    """
    args = parser.parse_args()
    trim_start, trim_end = args.trim

    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    header = ['UUID', 'trim_start', 'trim_end', 'shift', 'scale']
    with open_file_or_stdout(args.output) as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        # UUID is 32hexdigits and four dashes
        # eg. '43f6a05c-0856-4edc-8cd2-4866d9d60eaa'
        writer.writerow(header)

        for result in imap_mp(one_read_shift_scale, fast5_reads,
                              threads=args.jobs):
            if not all(result):
                # NOTE(review): this also drops a result whose shift or
                # scale is exactly zero -- confirm that is intended.
                continue
            read_id, shift, scale = result
            writer.writerow([read_id, trim_start, trim_end, shift, scale])
def test_strand_list_no_read_id_multiread(self):
    """Iterate MULTIREAD_DIR with the "strand_list_no_read_id.txt" list.

    The reads produced by iterate_fast5_reads are passed to
    _check_found_read_ids.
    """
    strand_list_path = os.path.join(self.STRAND_LIST_DIR,
                                    "strand_list_no_read_id.txt")
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR,
                                      strand_list=strand_list_path)
    self._check_found_read_ids(found_reads)
def test_sequencing_summary_multiread(self):
    """Iterate MULTIREAD_DIR using SEQUENCING_SUMMARY as the strand list.

    The reads produced by iterate_fast5_reads are passed to
    _check_found_read_ids.
    """
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR,
                                      strand_list=self.SEQUENCING_SUMMARY)
    self._check_found_read_ids(found_reads)
def test_strand_list_no_filename_single_reads(self):
    """Iterate READ_DIR with the "strand_list_no_filename.txt" list.

    The reads produced by iterate_fast5_reads are passed to
    _check_found_read_ids.
    """
    strand_list_path = os.path.join(self.STRAND_LIST_DIR,
                                    "strand_list_no_filename.txt")
    found_reads = iterate_fast5_reads(self.READ_DIR,
                                      strand_list=strand_list_path)
    self._check_found_read_ids(found_reads)
def test_no_strand_list_multiread(self):
    """Iterate MULTIREAD_DIR without a strand list and check the read ids.

    The reads produced by iterate_fast5_reads are passed to
    _check_found_read_ids.
    """
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR)
    self._check_found_read_ids(found_reads)
def test_no_strand_list_single_reads(self):
    """Iterate READ_DIR without a strand list and check the read ids.

    The reads produced by iterate_fast5_reads are passed to
    _check_found_read_ids.
    """
    found_reads = iterate_fast5_reads(self.READ_DIR)
    self._check_found_read_ids(found_reads)
def test_no_strand_list_multiread(self):
    """Check read ids are found in the multi-read directory, no strand list.

    Iterates MULTIREAD_DIR with iterate_fast5_reads and passes the
    result to _check_found_read_ids.
    """
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR)
    self._check_found_read_ids(found_reads)
def main():
    """Basecall the selected reads in a worker pool and write FASTA/FASTQ.

    Reads are located with fast5utils.iterate_fast5_reads.  When a
    scaling file is supplied, only reads with an entry in that file are
    basecalled.  Each read is processed by `worker` in a process pool
    initialised with `worker_init`; sequences (reversed when
    args.reverse is set) are written to args.output and throughput
    statistics are reported on stderr.
    """
    args = get_parser().parse_args()

    # TODO convert to logging
    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder, limit=args.limit,
        strand_list=args.input_strand_list, recursive=args.recursive)

    if args.scaling is not None:
        sys.stderr.write(
            "* Loading read scaling parameters from {}.\n".format(
                args.scaling))
        all_read_params = get_per_read_params_dict_from_tsv(args.scaling)
        # The reads are traversed twice below (once to count, once to
        # filter).  Materialise them first: a one-shot iterator would be
        # exhausted by the first pass, leaving nothing to basecall.
        fast5_reads = list(fast5_reads)
        input_read_ids = frozenset(rec[1] for rec in fast5_reads)
        scaling_read_ids = frozenset(all_read_params.keys())
        sys.stderr.write("* {} / {} reads have scaling information.\n".format(
            len(input_read_ids & scaling_read_ids), len(input_read_ids)))
        fast5_reads = [rec for rec in fast5_reads
                       if rec[1] in scaling_read_ids]
    else:
        all_read_params = {}

    sys.stderr.write("* Calling reads.\n")
    nbase, ncalled, nread, nsample = 0, 0, 0, 0
    t0 = time.time()
    progress = Progress(quiet=args.quiet)
    startcharacter = '@' if args.fastq else '>'
    initargs = [args.device, args.model, args.chunk_size, args.overlap,
                all_read_params, args.alphabet, args.max_concurrent_chunks,
                args.fastq, args.qscore_scale, args.qscore_offset, args.beam,
                args.posterior, args.temperature]

    # The context manager releases the worker processes on every exit
    # path, including an exception raised while writing output.
    with Pool(args.jobs, initializer=worker_init,
              initargs=initargs) as pool:
        with open_file_or_stdout(args.output) as fh:
            for read_id, basecall, qstring, read_nsample in \
                    pool.imap_unordered(worker, fast5_reads):
                if basecall is not None and len(basecall) > 0:
                    fh.write("{}{}\n{}\n".format(
                        startcharacter, read_id,
                        basecall[::-1] if args.reverse else basecall))
                    nbase += len(basecall)
                    ncalled += 1
                    if args.fastq:
                        fh.write("+\n{}\n".format(
                            qstring[::-1] if args.reverse else qstring))
                nread += 1
                nsample += read_nsample
                progress.step()

    total_time = time.time() - t0

    sys.stderr.write(
        "* Called {} reads in {:.2f}s\n".format(nread, int(total_time)))
    sys.stderr.write(
        "* {:7.2f} kbase / s\n".format(nbase / total_time / 1000.0))
    sys.stderr.write(
        "* {:7.2f} ksample / s\n".format(nsample / total_time / 1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))
    return
def main():
    """Basecall the selected reads one by one and write FASTA/FASTQ.

    Loads the model onto the requested device, optionally restricts the
    reads to those with an entry in the scaling file, and calls
    process_read for each read.  When a modified-base output path is
    given (and the model's last sublayer is a GlobalNormFlipFlopCatMod),
    an HDF5 file is prepared and passed to process_read.  Throughput
    statistics are reported on stderr.

    Exits with status 1 when modified-base output is requested from a
    model that cannot provide it.
    """
    args = parser.parse_args()

    device = helpers.set_torch_device(args.device)
    # TODO convert to logging
    sys.stderr.write("* Loading model.\n")
    model = load_model(args.model).to(device)
    is_cat_mod = isinstance(model.sublayers[-1],
                            layers.GlobalNormFlipFlopCatMod)
    do_output_mods = args.modified_base_output is not None
    if do_output_mods and not is_cat_mod:
        sys.stderr.write(
            "Cannot output modified bases from canonical base only model.")
        # Non-zero status so calling scripts can detect the failure
        sys.exit(1)
    n_can_states = nstate_flipflop(model.sublayers[-1].nbase)
    stride = guess_model_stride(model)
    chunk_size = args.chunk_size * stride
    chunk_overlap = args.overlap * stride

    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = list(
        fast5utils.iterate_fast5_reads(args.input_folder,
                                       limit=args.limit,
                                       strand_list=args.input_strand_list,
                                       recursive=args.recursive))
    sys.stderr.write("* Found {} reads.\n".format(len(fast5_reads)))

    if args.scaling is not None:
        sys.stderr.write("* Loading read scaling parameters from {}.\n".format(
            args.scaling))
        all_read_params = get_per_read_params_dict_from_tsv(args.scaling)
        input_read_ids = frozenset(rec[1] for rec in fast5_reads)
        scaling_read_ids = frozenset(all_read_params.keys())
        sys.stderr.write("* {} / {} reads have scaling information.\n".format(
            len(input_read_ids & scaling_read_ids), len(input_read_ids)))
        fast5_reads = [
            rec for rec in fast5_reads if rec[1] in scaling_read_ids
        ]
    else:
        all_read_params = {}

    mods_fp = None
    try:
        if do_output_mods:
            # Explicit mode: h5py >= 3.0 opens read-only by default, which
            # would make the group/dataset creation below fail.  'a' is the
            # default of earlier h5py versions.
            mods_fp = h5py.File(args.modified_base_output, 'a')
            mods_fp.create_group('Reads')
            mod_long_names = model.sublayers[-1].ordered_mod_long_names
            sys.stderr.write("* Preparing modified base output: {}.\n".format(
                ', '.join(map(str, mod_long_names))))
            mods_fp.create_dataset('mod_long_names',
                                   data=np.array(mod_long_names, dtype='S'),
                                   dtype=h5py.special_dtype(vlen=str))

        sys.stderr.write("* Calling reads.\n")
        nbase, ncalled, nread, nsample = 0, 0, 0, 0
        t0 = time.time()
        progress = Progress(quiet=args.quiet)
        startcharacter = '@' if args.fastq else '>'

        with open_file_or_stdout(args.output) as fh:
            for read_filename, read_id in fast5_reads:
                # None when there are no per-read parameters for this read
                read_params = all_read_params.get(read_id)
                basecall, qstring, read_nsample = process_read(
                    read_filename, read_id, model, chunk_size, chunk_overlap,
                    read_params, n_can_states, stride, args.alphabet,
                    is_cat_mod, mods_fp, args.max_concurrent_chunks,
                    args.fastq, args.qscore_scale, args.qscore_offset)
                if basecall is not None:
                    fh.write("{}{}\n{}\n".format(
                        startcharacter, read_id,
                        basecall[::-1] if args.reverse else basecall))
                    nbase += len(basecall)
                    ncalled += 1
                    if args.fastq:
                        fh.write("+\n{}\n".format(
                            qstring[::-1] if args.reverse else qstring))
                nread += 1
                nsample += read_nsample
                progress.step()
    finally:
        # Close the HDF5 file even if setup or basecalling failed part-way
        if mods_fp is not None:
            mods_fp.close()
    total_time = time.time() - t0

    sys.stderr.write("* Called {} reads in {:.2f}s\n".format(
        nread, int(total_time)))
    sys.stderr.write("* {:7.2f} kbase / s\n".format(nbase / total_time /
                                                    1000.0))
    sys.stderr.write("* {:7.2f} ksample / s\n".format(nsample / total_time /
                                                      1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))
    return
def test_no_strand_list_single_reads(self):
    """Check read ids are found in the single-read directory, no strand list.

    Iterates READ_DIR with iterate_fast5_reads and passes the result to
    _check_found_read_ids.
    """
    found_reads = iterate_fast5_reads(self.READ_DIR)
    self._check_found_read_ids(found_reads)
metavar=('beginning', 'end'), help='Number of samples to trim off start and end') parser.add_argument('model', action=FileExists, help='Model file') parser.add_argument('references', action=FileExists, help='Fasta file') parser.add_argument('read_dir', action=FileExists, help='Directory for fast5 reads') if __name__ == '__main__': args = parser.parse_args() worker_kwarg_names = ['back_prob', 'localpen', 'minscore', 'trim'] model = helpers.load_model(args.model) fast5_reads = fast5utils.iterate_fast5_reads( args.read_dir, limit=args.limit, strand_list=args.input_strand_list) for res in imap_mp(squiggle_match.worker, fast5_reads, threads=args.jobs, fix_kwargs=helpers.get_kwargs(args, worker_kwarg_names), unordered=True, init=squiggle_match.init_worker, initargs=[model, args.references]): if res is None: continue read_id, sig, score, path, squiggle, bases = res bases = bases.decode('ascii') print('#{} {}'.format(read_id, score)) for i, (s, p) in enumerate(zip(sig, path)): print('{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(read_id, i, s, p,
def test_sequencing_summary_multiread(self):
    """Check read ids are found with a sequencing-summary style strand list.

    Iterates MULTIREAD_DIR with SEQUENCING_SUMMARY as the strand list and
    passes the result to _check_found_read_ids.
    """
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR,
                                      strand_list=self.SEQUENCING_SUMMARY)
    self._check_found_read_ids(found_reads)
def test_strand_list_multiread(self):
    """Check read ids are found using a strand list with multi-read fast5s.

    Uses "strand_list.txt" from STRAND_LIST_DIR while iterating
    MULTIREAD_DIR, then passes the result to _check_found_read_ids.
    """
    strand_list_path = os.path.join(self.STRAND_LIST_DIR, "strand_list.txt")
    found_reads = iterate_fast5_reads(self.MULTIREAD_DIR,
                                      strand_list=strand_list_path)
    self._check_found_read_ids(found_reads)
def main():
    """Remap every selected read, with modified-base support, and save.

    Refuses to run (exit status 1) when the output path already exists
    and overwriting was not requested.  Builds an AlphabetInfo from the
    canonical alphabet and the --mod triples, pairs each read with its
    reference (None when the reference is missing), remaps each read
    with prepare_mapping_funcs.oneread_remap via imap_mp, and hands the
    results to prepare_mapping_funcs.generate_output_from_results.

    Raises:
        ValueError: a modified base or its canonical coding is not a
            single character, a modified base is already in the
            canonical alphabet, or a canonical coding is not in it.
    """
    args = parser.parse_args()
    print("Running prepare_mapping using flip-flop remapping")

    if not args.overwrite and os.path.exists(args.output):
        print("Cowardly refusing to overwrite {}".format(args.output))
        sys.exit(1)

    # Create alphabet and check for consistency.  Explicit checks are
    # used instead of `assert` so they still run under `python -O`.
    modified_bases = [elt[0] for elt in args.mod]
    canonical_bases = [elt[1] for elt in args.mod]
    for b in modified_bases:
        if len(b) != 1:
            raise ValueError(
                "Modified bases must be a single character, got {}".format(b))
        if b in args.alphabet:
            raise ValueError(
                "Modified base must not be a canonical base, got {}".format(
                    b))
    for b in canonical_bases:
        if len(b) != 1:
            raise ValueError(
                "Canonical coding for modified bases must be a single character, got {}".format(
                    b))
        if b not in args.alphabet:
            raise ValueError(
                "Canonical coding for modified base must be a canonical base, got {}".format(
                    b))
    full_alphabet = args.alphabet + ''.join(modified_bases)
    flat_alphabet = args.alphabet + ''.join(canonical_bases)
    modification_names = [elt[2] for elt in args.mod]

    alphabet_info = alphabet.AlphabetInfo(full_alphabet,
                                          flat_alphabet,
                                          modification_names,
                                          do_reorder=True)

    print("Converting references to labels using {}".format(
        str(alphabet_info)))

    # Make an iterator that yields all the reads we're interested in.
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder,
        limit=args.limit,
        strand_list=args.input_strand_list,
        recursive=args.recursive)

    # Set up arguments (kwargs) for the worker function for each read
    kwargs = {}
    kwargs[
        'per_read_params_dict'] = prepare_mapping_funcs.get_per_read_params_dict_from_tsv(
            args.input_per_read_params)
    kwargs['model'] = helpers.load_model(args.model)
    kwargs['alphabet_info'] = alphabet_info
    kwargs['max_read_length'] = args.max_read_length
    kwargs['localpen'] = args.localpen

    # remaps a single read using flip-flop network
    workerFunction = prepare_mapping_funcs.oneread_remap

    def iter_jobs():
        """Yield (filename, read_id, reference-or-None) for each read."""
        references = bio.fasta_file_to_dict(args.references,
                                            alphabet=full_alphabet)
        for fn, read_id in fast5_reads:
            yield fn, read_id, references.get(read_id, None)

    # Chunk size is limit / (2 * jobs) clipped to [1, 50] when a read
    # limit is given, otherwise 50
    if args.limit is not None:
        chunksize = args.limit // (2 * args.jobs)
        chunksize = int(np.clip(chunksize, 1, 50))
    else:
        chunksize = 50

    results = imap_mp(workerFunction,
                      iter_jobs(),
                      threads=args.jobs,
                      fix_kwargs=kwargs,
                      unordered=True,
                      chunksize=chunksize)

    # results is an iterable of dicts
    # each dict is a set of return values from a single read
    prepare_mapping_funcs.generate_output_from_results(results, args.output,
                                                       alphabet_info)
def main():
    """Basecall the selected reads on a GPU and write FASTA.

    Loads the model onto the requested (non-CPU) device, iterates the
    selected reads, and calls process_read for each one.  When a
    modified-base output path is given (and the model's last sublayer
    is a GlobalNormFlipFlopCatMod), an HDF5 file is prepared and passed
    to process_read.  Throughput statistics are reported on stderr.

    Exits with status 1 when the device is 'cpu' or when modified-base
    output is requested from a model that cannot provide it.
    """
    args = parser.parse_args()

    # Explicit check instead of `assert`, which is stripped by `python -O`
    if args.device == 'cpu':
        sys.stderr.write(
            "Flipflop basecalling in taiyaki requires a GPU and for cupy to be installed\n")
        sys.exit(1)
    device = torch.device(args.device)
    # TODO convert to logging
    sys.stderr.write("* Loading model.\n")
    model = load_model(args.model).to(device)
    is_cat_mod = isinstance(model.sublayers[-1],
                            layers.GlobalNormFlipFlopCatMod)
    do_output_mods = args.modified_base_output is not None
    if do_output_mods and not is_cat_mod:
        sys.stderr.write(
            "Cannot output modified bases from canonical base only model.")
        # Non-zero status so calling scripts can detect the failure
        sys.exit(1)
    n_can_states = nstate_flipflop(model.sublayers[-1].nbase)
    stride = guess_model_stride(model, device=device)
    chunk_size, chunk_overlap = basecall_helpers.round_chunk_values(
        args.chunk_size, args.overlap, stride)

    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder, limit=args.limit,
        strand_list=args.input_strand_list, recursive=args.recursive)

    mods_fp = None
    try:
        if do_output_mods:
            # Explicit mode: h5py >= 3.0 opens read-only by default, which
            # would make the group/dataset creation below fail.  'a' is the
            # default of earlier h5py versions.
            mods_fp = h5py.File(args.modified_base_output, 'a')
            mods_fp.create_group('Reads')
            mod_long_names = model.sublayers[-1].ordered_mod_long_names
            sys.stderr.write("* Preparing modified base output: {}.\n".format(
                ', '.join(map(str, mod_long_names))))
            mods_fp.create_dataset(
                'mod_long_names', data=np.array(mod_long_names, dtype='S'),
                dtype=h5py.special_dtype(vlen=str))

        sys.stderr.write("* Calling reads.\n")
        nbase, ncalled, nread, nsample = 0, 0, 0, 0
        t0 = time.time()
        progress = Progress(quiet=args.quiet)

        with open_file_or_stdout(args.output) as fh:
            for read_filename, read_id in fast5_reads:
                basecall, read_nsample = process_read(
                    read_filename, read_id, model, chunk_size, chunk_overlap,
                    device, n_can_states, stride, args.alphabet, is_cat_mod,
                    mods_fp)
                if basecall is not None:
                    fh.write(">{}\n{}\n".format(read_id, basecall))
                    nbase += len(basecall)
                    ncalled += 1
                nread += 1
                nsample += read_nsample
                progress.step()
    finally:
        # Close the HDF5 file even if setup or basecalling failed part-way
        if mods_fp is not None:
            mods_fp.close()
    total_time = time.time() - t0

    sys.stderr.write("* Called {} reads in {}s\n".format(nread,
                                                         int(total_time)))
    sys.stderr.write("* {:7.2f} kbase / s\n".format(nbase / total_time /
                                                    1000.0))
    sys.stderr.write("* {:7.2f} ksample / s\n".format(nsample / total_time /
                                                      1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))
    return