def test_002_from_file(self):
    tmp_file = os.path.join(tempfile.gettempdir(), str(uuid4()))
    # Remove the temporary strand list once the test finishes.
    self.addCleanup(os.remove, tmp_file)
    with open(tmp_file, 'w') as fh:
        fh.write('filename\tjunk\n')
        for i, fname in enumerate(iterate_fast5(self.path, paths=True)):
            fh.write('{}\t{}\n'.format(os.path.basename(fname), i))
    fnames = list(
        iterate_fast5(self.path, paths=True, strand_list=tmp_file))
    self.assertEqual(len(fnames), 3)
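# For reference, the strand list consumed above is a tab-separated file with a
# header row; as the test demonstrates, only the 'filename' column is used and
# extra columns are ignored. With three reads present it would look like this
# (read names hypothetical):
#
#     filename        junk
#     read0.fast5     0
#     read1.fast5     1
#     read2.fast5     2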
def raw_chunkify_with_identity_main(args):
    """Main function for `chunkify.py raw_identity`, producing a batch file
    for model training.
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['chunk_len', 'kmer_len', 'min_length', 'trim',
                   'normalisation', 'downsample_factor', 'interpolation']
    i = 0
    bad_list = []
    chunk_list = []
    label_list = []
    for res in imap_mp(raw_chunk_worker, fast5_files, threads=args.jobs,
                       unordered=True,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       init=batch.init_chunk_identity_worker,
                       initargs=[args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            (chunks, labels, bad_ev) = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
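# A minimal sketch of driving raw_chunkify_with_identity_main directly with a
# hand-built namespace instead of the chunkify.py argument parser. Every value
# below is a placeholder assumption, not a recommended or verified setting:
#
#     from argparse import Namespace
#
#     args = Namespace(
#         overwrite=False, output='batch.hdf5', input_folder='reads/',
#         limit=None, input_strand_list=None, jobs=4,
#         chunk_len=2000, kmer_len=5, min_length=1200, trim=(200, 50),
#         normalisation='per-read', downsample_factor=1, interpolation=False,
#         alphabet=b'ACGT', blanks_percentile=95,
#     )
#     raw_chunkify_with_identity_main(args)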
def chunkify_with_identity_main(args):
    """Main function for the event-based identity chunking command,
    producing a batch file for model training.
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['section', 'chunk_len', 'kmer_len', 'min_length', 'trim',
                   'use_scaled', 'normalisation']
    i = 0
    bad_list = []
    chunk_list = []
    label_list = []
    for res in imap_mp(batch.chunk_worker, fast5_files, threads=args.jobs,
                       unordered=True,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       init=batch.init_chunk_identity_worker,
                       initargs=[args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            (chunks, labels, bad_ev) = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'input_type': 'events',
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'scaled': args.use_scaled,
            'section': args.section,
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        util.create_labelled_chunks_hdf5(args.output, args.blanks,
                                         hdf5_attributes, chunk_list,
                                         label_list, bad_list)
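# A minimal sketch for inspecting a batch file written by the functions above.
# It assumes util.create_labelled_chunks_hdf5 stores hdf5_attributes as
# root-level attributes, which is not shown in this file:
#
#     import h5py
#
#     with h5py.File('batch.hdf5', 'r') as h5:
#         for key, value in h5.attrs.items():   # e.g. chunk, kmer, section, ...
#             print('{}: {}'.format(key, value))
#         print('datasets:', list(h5.keys()))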
def main(argv):
    """Extract the reference sequence stored in each fast5 file and write
    the collection out as a FASTA file.
    """
    args = parser.parse_args(argv[1:])

    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)

    print('* Processing data using', args.jobs, 'threads')

    i = 0
    kwarg_names = ['section']
    with open(args.output, 'w') as file_handle:
        for res in imap_mp(reference_extraction_worker, fast5_files,
                           threads=args.jobs, unordered=True,
                           fix_kwargs=util.get_kwargs(args, kwarg_names)):
            if res is not None:
                i = util.progress_report(i)
                file_name, reference = res
                header = '>{}\n'.format(
                    os.path.basename(os.path.splitext(file_name)[0]))
                file_handle.write(header)
                file_handle.write(reference + '\n')
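# The FASTA header above is just the read's file name with directory and
# extension stripped. A quick standalone check with a hypothetical path:
#
#     import os
#
#     file_name = '/data/reads/read_42.fast5'
#     header = '>{}\n'.format(os.path.basename(os.path.splitext(file_name)[0]))
#     assert header == '>read_42\n'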
def test_iterate_works_with_strandlist(self):
    fast5_files = set(
        iterate_fast5(self.readdir, paths=True, strand_list=self.strand_list))
    self.assertEqual(self.strands, fast5_files)
def test_iterate_respects_limits(self):
    _LIMIT = 2
    fast5_files = set(iterate_fast5(self.readdir, paths=True, limit=_LIMIT))
    self.assertEqual(len(fast5_files), _LIMIT)
def test_iterate_returns_all(self):
    fast5_files = set(iterate_fast5(self.readdir, paths=True))
    dir_list = set(glob.glob(os.path.join(self.readdir, '*.fast5')))
    self.assertEqual(fast5_files, dir_list)
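# The three iterate_fast5 tests above depend on fixtures built in setUp (not
# shown here). A minimal sketch of what they assume; the directory, read
# names, and strand-list path are illustrative only:
#
#     def setUp(self):
#         self.readdir = 'test_data/reads'                # directory of .fast5 reads
#         self.strand_list = 'test_data/strand_list.txt'  # TSV with a 'filename' column
#         self.strands = set(
#             os.path.join(self.readdir, name)
#             for name in ('read0.fast5', 'read1.fast5')) # reads named in the strand list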
def raw_chunkify_with_remap_main(args):
    """Main function for `chunkify.py raw_remap`, producing a batch file
    for model training.
    """
    if not args.overwrite:
        if os.path.exists(args.output):
            print("Cowardly refusing to overwrite {}".format(args.output))
            sys.exit(1)
        if os.path.exists(args.output_strand_list):
            print("Cowardly refusing to overwrite {}".format(args.output_strand_list))
            sys.exit(2)

    fast5_files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                                strand_list=args.input_strand_list)
    references = util.fasta_file_to_dict(args.references)

    print('* Processing data using', args.jobs, 'threads')

    kwarg_names = ['trim', 'min_prob', 'kmer_len', 'min_length', 'prior',
                   'slip', 'chunk_len', 'normalisation', 'downsample_factor',
                   'interpolation', 'open_pore_fraction']
    kwargs = util.get_kwargs(args, kwarg_names)
    kwargs['references'] = references

    i = 0
    compiled_file = helpers.compile_model(args.model, args.compile)
    bad_list = []
    chunk_list = []
    label_list = []

    if not os.path.isfile(args.output_strand_list):
        header_line = '\t'.join(['filename', 'nblocks', 'score', 'nstay',
                                 'seqlen', 'start', 'end']) + '\n'
        with open(args.output_strand_list, 'wt') as slfh:
            slfh.write(header_line)

    for res in imap_mp(raw_chunk_remap_worker, fast5_files, threads=args.jobs,
                       fix_kwargs=kwargs, unordered=True,
                       init=batch.init_chunk_remap_worker,
                       initargs=[compiled_file, args.kmer_len, args.alphabet]):
        if res is not None:
            i = util.progress_report(i)
            read, score, nblocks, path, seq, chunks, labels, bad_ev = res
            chunk_list.append(chunks)
            label_list.append(labels)
            bad_list.append(bad_ev)
            strand_data = [read, nblocks, -score / nblocks,
                           np.sum(np.ediff1d(path, to_begin=1) == 0),
                           len(seq), min(path), max(path)]
            data_line = '\t'.join([str(x) for x in strand_data]) + '\n'
            with open(args.output_strand_list, 'at') as slfh:
                slfh.write(data_line)

    if compiled_file != args.compile:
        os.remove(compiled_file)

    if chunk_list == []:
        print("no chunks were produced", file=sys.stderr)
        sys.exit(1)
    else:
        print('\n* Writing out to HDF5')
        hdf5_attributes = {
            'chunk': args.chunk_len,
            'downsample_factor': args.downsample_factor,
            'input_type': 'raw',
            'interpolation': args.interpolation,
            'kmer': args.kmer_len,
            'normalisation': args.normalisation,
            'section': 'template',
            'trim': args.trim,
            'alphabet': args.alphabet,
        }
        blanks_per_chunk = np.concatenate([(l == 0).mean(1) for l in label_list])
        blanks = np.percentile(blanks_per_chunk, args.blanks_percentile)
        util.create_labelled_chunks_hdf5(args.output, blanks, hdf5_attributes,
                                         chunk_list, label_list, bad_list)
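# A sketch for reading back the per-read remapping summary written above.
# Using pandas is an assumption of convenience (any TSV reader works), and
# the file name is whatever was passed as the output strand list:
#
#     import pandas as pd
#
#     summary = pd.read_csv('output_strand_list.txt', sep='\t')
#     print(summary.describe())             # distributions of nblocks, score, nstay, ...
#     print(summary.nsmallest(5, 'score'))  # the five lowest-scoring reads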
def test_001_recursive(self):
    fnames = list(iterate_fast5(self.path, paths=True, recursive=True))
    self.assertEqual(len(fnames), 5)
def test_000_single_layer(self):
    fnames = list(iterate_fast5(self.path, paths=True))
    self.assertEqual(len(fnames), 3)
        ]
    else:
        kwarg_names = [
            'trim', 'open_pore_fraction', 'kmer_len', 'transducer', 'bad',
            'min_prob', 'skip', 'trans', 'alphabet'
        ]

    compiled_file = helpers.compile_model(args.model, args.compile)
    seq_printer = basecall.SeqPrinter(args.kmer_len, datatype=args.datatype,
                                      transducer=args.transducer,
                                      alphabet=args.alphabet.decode('ascii'))

    files = iterate_fast5(args.input_folder, paths=True, limit=args.limit,
                          strand_list=args.input_strand_list)

    nbases = nevents = 0
    t0 = time.time()
    for res in imap_mp(basecall_worker, files, threads=args.jobs,
                       fix_kwargs=util.get_kwargs(args, kwarg_names),
                       unordered=True, init=basecall.init_worker,
                       initargs=[compiled_file]):
        if res is None:
            continue
        read, score, call, nev = res
        seq_len = seq_printer.write(read, score, call, nev)
        nbases += seq_len
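    # The fragment above is cut off before any final reporting. A sketch of a
    # throughput summary it could end with, built only from the counters it
    # already keeps; the message format is illustrative, not the original code:
    #
    #     dt = time.time() - t0
    #     print('\n* Called {} bases in {:.1f} s ({:.1f} bases/s)'.format(
    #         nbases, dt, nbases / max(dt, 1e-6)))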