def process_read(
        read, motifs, can_labs, mod_labs, can_post_indices, model_info,
        edge_buffer, context_bases):
    """Yield modified base scores at motif hits within a mapped read.

    The read signal is wrapped in ``backends.SIGNAL_DATA`` and passed to
    ``model_info.iter_basecalled_reads`` with ``return_post_w_mods=True``;
    element 5 of the first result is used as the posterior array handed to
    ``mods.score_mod_seq``.

    Args:
        read: Mapped read exposing ``Dacs``, ``read_id``, ``offset``,
            ``range``, ``digitisation``, ``Reference`` and ``Ref_to_signal``.
        motifs: Motif specification forwarded to ``iter_motifs``.
        can_labs: Lookup indexed by reference label giving canonical labels.
        mod_labs: Lookup indexed by reference label giving mod labels
            (``0`` is treated as "not modified" below).
        can_post_indices: Forwarded unchanged to ``mods.score_mod_seq``.
        model_info: Model wrapper providing ``stride``, ``can_alphabet``,
            ``output_alphabet``, ``can_base_mods``,
            ``str_to_int_mod_labels`` and ``iter_basecalled_reads``.
        edge_buffer: Accepted for interface compatibility; not used here.
        context_bases: Number of reference bases taken on each side of a
            motif hit.

    Yields:
        ``(is_can, mod_base, mod_score)`` where ``is_can`` is ``True`` when
        the reference label at the site has mod label ``0``, ``mod_base`` is
        looked up in ``model_info.output_alphabet`` and ``mod_score`` is the
        score of the window with centre mod label ``0`` minus the score of
        the window with a non-zero centre mod label.
    """
    signal_data = backends.SIGNAL_DATA(
        dacs=read.Dacs,
        raw_len=read.Dacs.shape[0],
        fast5_fn='',
        read_id=read.read_id,
        stride=model_info.stride,
        channel_info={
            mh.CHAN_INFO_OFFSET: read.offset,
            mh.CHAN_INFO_RANGE: read.range,
            mh.CHAN_INFO_DIGI: read.digitisation,
        },
    )
    summary = mh.SEQ_SUMM_INFO('', read.read_id, '', '', '', '', 0, 0)
    basecalled = model_info.iter_basecalled_reads(
        [(signal_data, summary)], return_post_w_mods=True)
    post_w_mods = next(basecalled)[5]

    # Reference-to-signal positions divided by the model stride give the
    # block coordinate of every reference position.
    block_of_ref_pos = np.around(
        read.Ref_to_signal / model_info.stride).astype(int)

    motif_hits = iter_motifs(
        motifs, model_info.can_alphabet, can_labs, read.Reference,
        context_bases)
    for hit_pos in motif_hits:
        win_start = hit_pos - context_bases
        win_end = hit_pos + context_bases + 1
        window = read.Reference[win_start:win_end]
        block_start = block_of_ref_pos[win_start]
        block_end = block_of_ref_pos[win_end]
        # Skip sites spanning fewer blocks than there are bases in the window.
        if block_end - block_start < window.shape[0]:
            continue

        window_can_labs = can_labs[window]
        window_mod_labs = mod_labs[window]
        swapped_mod_labs = window_mod_labs.copy()
        centre_base = read.Reference[hit_pos]

        if mod_labs[centre_base] == 0:
            # Reference site carries mod label 0: pick one of the mods
            # listed for this base as the alternative centre label.
            is_can = True
            can_base = model_info.output_alphabet[centre_base]
            picked_mod = choice(model_info.can_base_mods[can_base])
            alt_mod_lab = model_info.str_to_int_mod_labels[picked_mod]
            mod_base = model_info.output_alphabet[
                int(centre_base) + alt_mod_lab]
            swapped_mod_labs[context_bases] = alt_mod_lab
            zero_centre_labs = window_mod_labs
            mod_centre_labs = swapped_mod_labs
        else:
            # Reference site carries a non-zero mod label: the alternative
            # is the same window with the centre label reset to 0.
            is_can = False
            mod_base = model_info.output_alphabet[centre_base]
            swapped_mod_labs[context_bases] = 0
            zero_centre_labs = swapped_mod_labs
            mod_centre_labs = window_mod_labs

        zero_centre_score = mods.score_mod_seq(
            post_w_mods, window_can_labs, zero_centre_labs,
            can_post_indices, block_start, block_end)
        mod_centre_score = mods.score_mod_seq(
            post_w_mods, window_can_labs, mod_centre_labs,
            can_post_indices, block_start, block_end)
        yield is_can, mod_base, zero_centre_score - mod_centre_score
def read_generator():
    """Yield ``(sig_info, seq_summ_info)`` pairs taken from ``signal_q``.

    The queue is polled with a 0.01 second timeout until a ``None`` item is
    received, at which point the generator finishes. Each queued item is a
    pair of plain tuples that are rebuilt into ``backends.SIGNAL_DATA`` and
    ``mh.SEQ_SUMM_INFO`` before being yielded.
    """
    while True:
        try:
            queued_item = signal_q.get(timeout=0.01)
        except queue.Empty:
            # Nothing available yet; poll again.
            continue

        if queued_item is None:
            # ``None`` marks the end of the input stream.
            LOGGER.debug("Closing")
            return

        raw_sig_info, raw_seq_summ_info = queued_item
        # convert tuples back to namedtuples after multiprocessing queue
        sig_info = backends.SIGNAL_DATA(*raw_sig_info)
        LOGGER.debug("{} Processing".format(sig_info.read_id))
        yield sig_info, mh.SEQ_SUMM_INFO(*raw_seq_summ_info)
def create_batch_gen():
    """Yield up to ``reads_per_batch`` ``(sig_info, seq_summ_info)`` pairs.

    ``sig_q`` is polled ``reads_per_batch`` times with a 0.01 second timeout;
    polls that time out are simply skipped, so fewer pairs may be produced.
    When a ``None`` item is received, ``True`` is sent over ``gen_conn`` and
    the generator finishes early.
    """
    for _attempt in range(reads_per_batch):
        try:
            queued_item = sig_q.get(timeout=0.01)
        except queue.Empty:
            # This attempt produced nothing; move on to the next one.
            continue

        if queued_item is None:
            LOGGER.debug('Closing')
            # send signal to end main loop then end this iterator
            gen_conn.send(True)
            return

        raw_sig_info, raw_seq_summ_info = queued_item
        # convert tuples back to namedtuples after multiprocessing
        sig_info = backends.SIGNAL_DATA(*raw_sig_info)
        yield sig_info, mh.SEQ_SUMM_INFO(*raw_seq_summ_info)
        # Logged only once the consumer resumes the generator.
        LOGGER.debug('{} Processing'.format(sig_info.read_id))
def process_read(
    bc_res,
    caller_conn,
    map_thr_buf,
    do_false_ref,
    context_bases=mh.DEFAULT_VAR_CONTEXT_BASES,
    edge_buffer=mh.DEFAULT_EDGE_BUFFER,
    max_indel_len=MAX_INDEL_LEN,
    all_paths=ALL_PATHS,
    every_n=TEST_EVERY_N_LOCS,
    max_pos_per_read=MAX_POS_PER_READ,
):
    """Collect variant scores for calibration from one basecalled read.

    The read is mapped with ``mapping.map_read`` and candidate positions are
    taken every ``every_n`` reference bases between the two edge buffers,
    keeping at most ``max_pos_per_read`` of them. At each position SNPs,
    deletions and random insertions of up to ``max_indel_len`` bases are
    scored.

    Args:
        bc_res: 4-tuple ``(sig_info, called_read, rl_cumsum, can_post)``.
        caller_conn: Forwarded to ``mapping.map_read``.
        map_thr_buf: Forwarded to ``call_alt_true_indel``.
        do_false_ref: When true, also score variants through
            ``call_alt_true_indel``; those results are tagged ``False``.
        context_bases: Iterable of context sizes; its maximum sets the
            window checked for non-canonical bases.
        edge_buffer: Number of reference bases excluded at each end.
        max_indel_len: Largest indel size tested.
        all_paths: Forwarded to the scoring helpers.
        every_n: Step between candidate positions.
        max_pos_per_read: Cap on the number of candidate positions.

    Returns:
        List of ``(is_true_ref, score, var_ref_seq, var_alt_seq)`` tuples.

    Raises:
        NotImplementedError: If the mapped reference is shorter than
            ``2 * edge_buffer``.
    """
    sig_info, called_read, rl_cumsum, can_post = bc_res
    first_mapping = mapping.map_read(
        caller_conn, called_read, backends.SIGNAL_DATA(*sig_info))[0]
    r_ref_seq, r_to_q_poss, r_ref_pos, _, _ = first_mapping
    np_ref_seq = mh.seq_to_int(r_ref_seq)
    ref_len = np_ref_seq.shape[0]
    if ref_len < edge_buffer * 2:
        raise NotImplementedError(
            "Mapping too short for calibration statistic computation.")

    # get mapped start in post and run len to mapped bit of output
    trim_start = r_ref_pos.q_trim_start
    post_mapped_start = rl_cumsum[trim_start]
    mapped_rl_cumsum = (
        rl_cumsum[trim_start:r_ref_pos.q_trim_end + 1] - post_mapped_start)

    # candidate variant locations within a read
    candidate_positions = range(edge_buffer, ref_len - edge_buffer, every_n)
    var_poss = list(candidate_positions)[:max_pos_per_read]
    indel_sizes = range(1, max_indel_len + 1)
    read_var_calls = []

    def score_true_variant(var_pos, ref_allele, alt_allele):
        """Score one variant against the unmodified mapping."""
        return call_variant(
            can_post,
            post_mapped_start,
            var_pos,
            mapped_rl_cumsum,
            r_to_q_poss,
            ref_allele,
            alt_allele,
            context_bases,
            all_paths,
            np_ref_seq=np_ref_seq,
        )

    if do_false_ref:
        # first process reference false calls (need to spoof an incorrect
        # reference for mapping and signal remapping). Size 0 is the single
        # base swap; each indel length is then tried with both signs.
        signed_sizes = [0]
        for size in indel_sizes:
            signed_sizes.extend((size, -size))
        for r_var_pos in var_poss:
            for signed_size in signed_sizes:
                try:
                    score, ref_allele, alt_allele = call_alt_true_indel(
                        signed_size,
                        r_var_pos,
                        r_ref_seq,
                        called_read.seq,
                        map_thr_buf,
                        context_bases,
                        can_post,
                        rl_cumsum,
                        all_paths,
                    )
                except mh.MegaError:
                    # introduced error either causes read not to map,
                    # mapping trims the location of interest or invalid ref
                    # or alt sequence
                    continue
                read_var_calls.append((False, score, ref_allele, alt_allele))

    # now test reference correct variants
    for r_var_pos in var_poss:
        widest_context = max(context_bases)
        window = r_ref_seq[
            r_var_pos - widest_context:
            r_var_pos + max_indel_len + 1 + widest_context]
        if set(window).difference(CAN_BASES_SET):
            # skip reference positions with N's in any context
            continue

        # test simple SNP first
        ref_base = r_ref_seq[r_var_pos]
        for alt_base in CAN_BASES_SET.difference(ref_base):
            try:
                score = score_true_variant(r_var_pos, ref_base, alt_base)
            except mh.MegaError:
                # invalid reference or alternative sequence
                continue
            read_var_calls.append((True, score, ref_base, alt_base))

        # then test indels
        for indel_size in indel_sizes:
            # test deletion; a failure here also skips the insertion test
            # for this indel size
            deleted_ref = r_ref_seq[r_var_pos:r_var_pos + indel_size + 1]
            try:
                score = score_true_variant(r_var_pos, deleted_ref, ref_base)
            except mh.MegaError:
                # invalid reference or alternative sequence
                continue
            read_var_calls.append((True, score, deleted_ref, ref_base))

            # test random insertion
            inserted_alt = ref_base + "".join(
                choice(CAN_BASES) for _ in range(indel_size))
            try:
                score = score_true_variant(r_var_pos, ref_base, inserted_alt)
            except mh.MegaError:
                # invalid reference or alternative sequence
                continue
            read_var_calls.append((True, score, ref_base, inserted_alt))

    return read_var_calls