def basecall(model, reads, chunksize=4000, overlap=500, batchsize=32, reverse=False):
    """
    Build a lazy scoring pipeline over `reads` and return a generator that
    yields one `(read, scores)` pair per group of consecutive results
    sharing the same read.

    Pipeline stages, in order:
      1. `split_read(read, chunksize * batchsize)` breaks each read into
         `(read, start, end)` pieces; the piece order is reversed when
         `reverse` is true.
      2. The signal slice of each piece is wrapped with `torch.from_numpy`
         and cut up by `chunk(..., chunksize, overlap)`.
      3. Chunks are grouped by `batchify` and scored by `compute_scores`.
      4. `unbatchify` regroups the scores per piece and `stitch` joins them,
         using the piece length (`end - start`) and `model.stride`.
      5. `transfer` is applied through `thread_map` with a single thread.
      6. Consecutive parts belonging to the same read are merged with
         `concat`.

    `reverse` is also forwarded to `compute_scores` and `stitch`.
    """
    # A step of -1 walks the split pieces back to front.
    step = -1 if reverse else 1

    pieces = (
        piece
        for read in reads
        for piece in split_read(read, chunksize * batchsize)[::step]
    )
    windows = (
        ((read, lo, hi), chunk(torch.from_numpy(read.signal[lo:hi]), chunksize, overlap))
        for (read, lo, hi) in pieces
    )
    scored = (
        (key, compute_scores(model, batch, reverse=reverse))
        for key, batch in batchify(windows, batchsize=batchsize)
    )
    stitched = (
        (read, stitch(scores, chunksize, overlap, hi - lo, model.stride, reverse=reverse))
        for ((read, lo, hi), scores) in unbatchify(scored)
    )
    moved = thread_map(transfer, stitched, n_thread=1)

    # groupby only merges *adjacent* items, so this relies on the parts of a
    # read arriving consecutively from the stages above.
    return (
        (read, concat([part for _, part in parts]))
        for read, parts in groupby(moved, lambda item: item[0])
    )
def basecall(model, reads, aligner=None, beamsize=40, chunksize=4000, overlap=500, batchsize=32, qscores=False, reverse=False):
    """
    Basecalls a set of reads.

    Returns a lazy iterable of `(read, result)` pairs where `result` is a
    dict with the keys `sequence`, `qstring` and `mean_qscore`. When
    `aligner` is truthy the pairs are passed through
    `align_map(aligner, ...)` and that result is returned instead.

    Pipeline stages, in order:
      1. `split_read(read)` breaks each read into `(read, start, end)`
         pieces; the piece order is reversed when `reverse` is true.
      2. Each piece's signal slice is chunked with
         `chunk(..., chunksize, overlap)`.
      3. Chunks are batched (`batchify`, wrapped in `thread_iter`), scored by
         `compute_scores` and passed through `quantise_int8`.
      4. `unbatchify` + `stitch` rebuild per-piece scores.
      5. `transfer` runs on one thread, `decode_int8` (with `model.seqdist`
         and `beamsize`) on eight.
      6. Consecutive decoded pieces are grouped by `read.parent` when the
         read has a `parent` attribute, otherwise by the read itself, and
         their sequences are joined.

    The `qstring` is a run of `'?'` as long as the sequence when `qscores`
    is true and `'*'` otherwise; `mean_qscore` is always `0.0`.
    """
    decoder = partial(decode_int8, seqdist=model.seqdist, beamsize=beamsize)

    # A step of -1 walks the split pieces back to front.
    step = -1 if reverse else 1

    def _group_key(item):
        # Pieces are grouped under their parent read when they carry one.
        return item[0].parent if hasattr(item[0], 'parent') else item[0]

    pieces = (
        piece
        for read in reads
        for piece in split_read(read)[::step]
    )
    windows = (
        ((read, lo, hi), chunk(torch.from_numpy(read.signal[lo:hi]), chunksize, overlap))
        for (read, lo, hi) in pieces
    )
    scored = (
        (key, quantise_int8(compute_scores(model, batch, reverse=reverse)))
        for key, batch in thread_iter(batchify(windows, batchsize=batchsize))
    )
    stitched = (
        (read, stitch(scores, chunksize, overlap, hi - lo, model.stride, reverse=reverse))
        for ((read, lo, hi), scores) in unbatchify(scored)
    )
    moved = thread_map(transfer, stitched, n_thread=1)
    decoded = thread_map(decoder, moved, n_thread=8)

    joined = (
        (read, ''.join(seq for _, seq in parts))
        for read, parts in groupby(decoded, _group_key)
    )
    results = (
        (
            read,
            {
                'sequence': seq,
                'qstring': ('?' * len(seq)) if qscores else '*',
                'mean_qscore': 0.0,
            },
        )
        for read, seq in joined
    )

    if aligner:
        return align_map(aligner, results)
    return results
def basecall(model, reads, aligner=None, beamsize=40, chunksize=4000, overlap=500, batchsize=32, qscores=False):
    """
    Basecalls a set of reads.

    Returns a lazy iterable of `(read, result)` pairs where `result` is a
    dict with the keys `sequence`, `qstring` and `mean_qscore`. When
    `aligner` is truthy the pairs are passed through
    `align_map(aligner, ...)` and that result is returned instead.

    Pipeline stages, in order:
      1. Each read's signal is wrapped with `torch.from_numpy` and cut by
         `torch.split` into pieces of 400000 samples, keyed by
         `(read, piece_index)`.
      2. Each piece is chunked with
         `chunk(..., chunksize, overlap, pad_start=True)`.
      3. Chunks are batched (`batchify`, wrapped in `thread_iter`), scored by
         `compute_scores` and passed through `quantise_int8`.
      4. `unbatchify` + `stitch` rebuild per-piece scores; `stitch` receives
         `start` and `end` keyword arguments derived from `overlap`,
         `chunksize` and `model.stride`.
      5. `transfer` runs on one thread, `decode_int8` (with `model.seqdist`
         and `beamsize`) on eight.
      6. Consecutive decoded pieces are grouped by their read and their
         sequences are joined.

    The `qstring` is a run of `'?'` as long as the sequence when `qscores`
    is true and `'*'` otherwise; `mean_qscore` is always `0.0`.
    """
    piece_length = 400000

    stitcher = partial(
        stitch,
        start=overlap // 2 // model.stride,
        end=(chunksize - overlap // 2) // model.stride,
    )
    decoder = partial(decode_int8, seqdist=model.seqdist, beamsize=beamsize)

    pieces = (
        ((read, index), signal)
        for read in reads
        for (index, signal) in enumerate(
            torch.split(torch.from_numpy(read.signal), piece_length)
        )
    )
    windows = (
        (key, chunk(signal, chunksize, overlap, pad_start=True))
        for (key, signal) in pieces
    )
    scored = (
        (key, quantise_int8(compute_scores(model, batch)))
        for key, batch in thread_iter(batchify(windows, batchsize=batchsize))
    )
    stitched = ((key, stitcher(scores)) for (key, scores) in unbatchify(scored))
    moved = thread_map(transfer, stitched, n_thread=1)
    decoded = thread_map(decoder, moved, n_thread=8)

    # Each item is ((read, piece_index), seq); item[0][0] is the read, so
    # adjacent pieces of one read collapse into a single sequence.
    joined = (
        (read, ''.join(seq for _, seq in parts))
        for read, parts in groupby(decoded, lambda item: item[0][0])
    )
    results = (
        (
            read,
            {
                'sequence': seq,
                'qstring': ('?' * len(seq)) if qscores else '*',
                'mean_qscore': 0.0,
            },
        )
        for read, seq in joined
    )

    if aligner:
        return align_map(aligner, results)
    return results
def call(model, reads_directory, templates, complements, aligner=None, cudapoa=True):
    """
    Pair template and complement reads, decode them together and build a
    consensus sequence for each pair.

    Reads are produced by two `read_gen` calls over `reads_directory` (one
    for `templates`, one for `complements`) and scored with `basecall`,
    passing `reverse=False` for templates and `reverse=True` for
    complements. The two score streams are zipped positionally, so the
    i-th template is paired with the i-th complement.

    Paired scores are decoded with `decode` on twelve threads; results whose
    decoded value has `len(...) <= 2` are dropped. Consensus is then built
    either by `poagen` over batches of 100 (when `cudapoa` is true) or by
    `poa` through `process_map` with four processes.

    Returns a lazy iterable of `(key, {'sequence': seq})` pairs, passed
    through `align_map(aligner, ...)` when `aligner` is not None. With
    `cudapoa` the key is the first read of the pair; otherwise it is the
    `(template, complement)` tuple.
    """
    temp_reads = read_gen(reads_directory, templates, n_proc=8, cancel=process_cancel())
    comp_reads = read_gen(reads_directory, complements, n_proc=8, cancel=process_cancel())

    temp_scores = basecall(model, temp_reads, reverse=False)
    comp_scores = basecall(model, comp_reads, reverse=True)

    paired = (
        ((temp_read, comp_read), (temp_score, comp_score))
        for (temp_read, temp_score), (comp_read, comp_score) in zip(temp_scores, comp_scores)
    )
    decoded = thread_map(decode, paired, n_thread=12)

    if cudapoa:
        # Each decoded result is wrapped in a one-element list before batching.
        candidates = (
            (reads, [seqs])
            for reads, seqs in decoded
            if len(seqs) > 2
        )
        consensus = (
            zip(batch_reads, poagen(batch_seqs))
            for batch_reads, batch_seqs in batchify(candidates, 100)
        )
        res = (
            (reads[0], {'sequence': seq})
            for pairs in consensus
            for reads, seq in pairs
        )
    else:
        candidates = (
            (reads, seqs)
            for reads, seqs in decoded
            if len(seqs) > 2
        )
        consensus = process_map(poa, candidates, n_proc=4)
        res = (
            (reads, {'sequence': seq})
            for reads, seqs in consensus
            for seq in seqs
        )

    if aligner is not None:
        return align_map(aligner, res)
    return res