import sys
import time
from datetime import timedelta
from functools import partial

import numpy as np
import torch

# NOTE: the remaining helpers (load_model, chunk, stitch, Aligner,
# PreprocessReader, DecoderWriterPool, CTCWriter, batchify, unbatchify,
# compute_scores, decode, process_map, get_reads, ...) come from bonito's
# internal modules; their exact import paths vary between bonito versions.


def main(args):
    sys.stderr.write("> loading model\n")
    model = load_model(
        args.model_directory, args.device, weights=int(args.weights),
        half=args.half, chunksize=args.chunksize, use_rt=args.cudart,
    )

    samples = 0
    num_reads = 0
    max_read_size = 4e6
    dtype = np.float16 if args.half else np.float32

    reader = PreprocessReader(args.reads_directory)
    writer = DecoderWriterPool(
        model, beamsize=args.beamsize, fastq=args.fastq, reference=args.reference,
    )

    t0 = time.perf_counter()
    sys.stderr.write("> calling\n")

    with writer, reader, torch.no_grad():
        while True:
            read = reader.queue.get()
            if read is None:
                break

            if len(read.signal) > max_read_size:
                sys.stderr.write(
                    "> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal))
                )
                continue

            num_reads += 1
            samples += len(read.signal)

            raw_data = torch.tensor(read.signal.astype(dtype))
            chunks = chunk(raw_data, args.chunksize, args.overlap)

            posteriors = model(chunks.to(args.device)).cpu().numpy()
            posteriors = stitch(posteriors, args.overlap // model.stride // 2)

            writer.queue.put((read, posteriors[:raw_data.shape[0]]))

    duration = time.perf_counter() - t0

    sys.stderr.write("> completed reads: %s\n" % num_reads)
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (samples / duration))
    sys.stderr.write("> done\n")
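
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the chunk/stitch contract the loop above
# relies on, assuming the older two-argument stitch signature used here.
# These are hypothetical stand-ins, not bonito's implementations.
# ---------------------------------------------------------------------------

def chunk_sketch(signal, chunksize, overlap):
    """
    Split a 1-D signal tensor into overlapping windows of `chunksize`
    samples, zero-padding the tail (hypothetical stand-in for `chunk`).
    """
    if len(signal) <= chunksize:
        pad = chunksize - len(signal)
        return torch.nn.functional.pad(signal, (0, pad)).unsqueeze(0)
    stride = chunksize - overlap
    n = (len(signal) - chunksize + stride - 1) // stride + 1
    pad = stride * (n - 1) + chunksize - len(signal)
    return torch.nn.functional.pad(signal, (0, pad)).unfold(0, chunksize, stride)


def stitch_sketch(chunks, trim):
    """
    Concatenate per-chunk network output back into one sequence, dropping
    `trim` frames of overlap on each side of interior chunk boundaries
    (hypothetical stand-in for the two-argument `stitch`).
    """
    if len(chunks) == 1 or trim == 0:
        return np.concatenate(list(chunks))
    parts = [chunks[0][:-trim]]
    parts.extend(c[trim:-trim] for c in chunks[1:-1])
    parts.append(chunks[-1][trim:])
    return np.concatenate(parts)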
def stitch_results(results, length, size, overlap, stride, reverse=False):
    """
    Stitch results together with a given overlap.
    """
    if isinstance(results, dict):
        return {
            k: stitch_results(v, length, size, overlap, stride, reverse=reverse)
            for k, v in results.items()
        }
    return stitch(results, size, overlap, length, stride, reverse=reverse)
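
# A hedged usage sketch for stitch_results: because it recurses into dicts,
# a model that emits several named tensors per chunk is stitched in one
# call. The shapes and parameter values below are made up, and the call
# still needs bonito's `stitch` in scope.

def demo_stitch_results():
    chunked = {
        'scores': torch.rand(4, 100, 5),  # 4 chunks x 100 frames x 5 states
        'moves': torch.rand(4, 100),      # 4 chunks x 100 frames
    }
    # 4 chunks of 1000 samples, 500-sample overlap, model stride 10
    return stitch_results(chunked, length=2500, size=1000, overlap=500, stride=10)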
def basecall(model, reads, beamsize=5, chunksize=0, overlap=0, batchsize=1, qscores=False, reverse=None):
    """
    Basecalls a set of reads.
    """
    chunks = (
        (read, chunk(torch.tensor(read.signal), chunksize, overlap))
        for read in reads
    )
    scores = unbatchify(
        (k, compute_scores(model, v)) for k, v in batchify(chunks, batchsize)
    )
    scores = (
        (read, {'scores': stitch(v, chunksize, overlap, len(read.signal), model.stride)})
        for read, v in scores
    )
    decoder = partial(decode, decode=model.decode, beamsize=beamsize, qscores=qscores, stride=model.stride)
    basecalls = process_map(decoder, scores, n_proc=4)
    return basecalls
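
# A hedged usage sketch for basecall(): stream reads and write each call as
# FASTA. `get_reads(reads_directory)` and the (read, result) pair shape
# mirror the generators above; the 'sequence' key on `result` is an
# assumption, not a documented bonito field.

def demo_basecall(model, reads_directory):
    reads = get_reads(reads_directory)
    calls = basecall(model, reads, beamsize=5, chunksize=4000, overlap=500, batchsize=32)
    for read, result in calls:
        sys.stdout.write(">%s\n%s\n" % (read.read_id, result['sequence']))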
def main(args):
    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        sys.exit(1)

    if args.save_ctc:
        args.overlap = 900
        args.chunksize = 3600

    sys.stderr.write("> loading model\n")
    model = load_model(
        args.model_directory, args.device, weights=int(args.weights),
        half=args.half, chunksize=args.chunksize, use_rt=args.cudart,
    )

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)
    else:
        aligner = None

    samples = 0
    num_reads = 0
    max_read_size = 4e6
    dtype = np.float16 if args.half else np.float32

    ctc_writer = CTCWriter(model, aligner)
    reader = PreprocessReader(args.reads_directory)
    writer = DecoderWriterPool(model, beamsize=args.beamsize, fastq=args.fastq, aligner=aligner)

    t0 = time.perf_counter()
    sys.stderr.write("> calling\n")

    with writer, ctc_writer, reader, torch.no_grad():
        while True:
            read = reader.queue.get()
            if read is None:
                break

            if len(read.signal) > max_read_size:
                sys.stderr.write(
                    "> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal))
                )
                continue

            num_reads += 1
            samples += len(read.signal)

            raw_data = torch.tensor(read.signal.astype(dtype))
            chunks = chunk(raw_data, args.chunksize, args.overlap)

            posteriors_ = model(chunks.to(args.device)).cpu().numpy()
            posteriors = stitch(posteriors_, args.overlap // model.stride // 2)

            writer.queue.put((read, posteriors[:raw_data.shape[0]]))

            if args.save_ctc and len(raw_data) > args.chunksize:
                ctc_writer.queue.put((chunks.numpy(), posteriors_))

    duration = time.perf_counter() - t0

    sys.stderr.write("> completed reads: %s\n" % num_reads)
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (samples / duration))
    sys.stderr.write("> done\n")
def main(args):
    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        sys.exit(1)

    if args.save_ctc:
        args.overlap = 900
        args.chunksize = 3600

    sys.stderr.write("> loading model\n")
    model = load_model(
        args.model_directory, args.device, weights=int(args.weights),
        half=args.half, chunksize=args.chunksize, use_rt=args.cudart,
    )

    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)
        write_sam_header(aligner)
    else:
        aligner = None

    # with open(summary_file(), 'w') as summary:
    #     write_summary_header(summary, alignment=aligner)

    samples = 0
    num_reads = 0
    max_read_size = 4e6
    read_ids = column_to_set(args.read_ids)
    dtype = np.float16 if args.half else np.float32

    reader = ProcessIterator(
        get_reads(args.reads_directory, read_ids=read_ids, skip=args.skip),
        progress=True,
    )
    writer = ProcessPool(
        DecoderWriter, model=model, aligner=aligner,
        beamsize=args.beamsize, fastq=args.fastq,
    )
    ctc_writer = CTCWriter(
        model, aligner,
        min_coverage=args.ctc_min_coverage,
        min_accuracy=args.ctc_min_accuracy,
    )

    t0 = time.perf_counter()
    sys.stderr.write("> calling\n")

    with writer, ctc_writer, reader, torch.no_grad():
        while True:
            read = reader.queue.get()
            if read is None:
                break

            if len(read.signal) > max_read_size:
                sys.stderr.write(
                    "> skipping long read %s (%s samples)\n" % (read.read_id, len(read.signal))
                )
                continue

            num_reads += 1
            samples += len(read.signal)

            raw_data = torch.tensor(read.signal.astype(dtype))
            # debug output goes to stderr so it cannot corrupt basecalls on stdout
            sys.stderr.write('bonito: raw_data.shape: %s\n' % (raw_data.shape,))
            chunks = chunk(raw_data, args.chunksize, args.overlap)

            posteriors_ = model(chunks.to(args.device)).cpu().numpy()
            posteriors = stitch(posteriors_, args.overlap // model.stride // 2)

            if args.write_basecall:
                writer.queue.put((read, posteriors[:raw_data.shape[0]]))

            if args.save_ctc and len(raw_data) > args.chunksize:
                ctc_writer.queue.put((chunks.numpy(), posteriors_))

            sys.stderr.write('bonito: posteriors.shape: %s\n' % (posteriors.shape,))
            posteriors.tofile(args.post_file)

    duration = time.perf_counter() - t0

    sys.stderr.write("> completed reads: %s\n" % num_reads)
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (samples / duration))
    sys.stderr.write("> done\n")
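
# ---------------------------------------------------------------------------
# Note on posteriors.tofile(args.post_file) above: ndarray.tofile() writes
# raw bytes with no dtype or shape header, so a reader must supply both.
# Also, if args.post_file is a path string, each call reopens and truncates
# the file (keeping only the last read); pass an open binary file handle to
# accumulate every read. A minimal reader sketch, where `num_states` (the
# model's output alphabet size) is a placeholder:
# ---------------------------------------------------------------------------

def load_posteriors(path, num_states, half=False):
    dtype = np.float16 if half else np.float32
    flat = np.fromfile(path, dtype=dtype)
    return flat.reshape(-1, num_states)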