def process_read(
        raw_sig, read_id, model_info, bc_q, caller_conn, snps_data, snps_q,
        mods_q, mods_info, fast5_fn, failed_reads_q, edge_buffer):
    """Basecall one read and, if requested, map it and call SNPs and mods.

    The model is run on ``raw_sig``, transition posteriors are computed and
    decoded into a sequence. The basecall is put on ``bc_q`` when that queue
    is given. When ``caller_conn`` is None the function stops after
    basecalling; otherwise the read is mapped and per-read SNP and modified
    base calling are dispatched through ``handle_errors`` (each only when
    its output queue, ``snps_q`` / ``mods_q``, is not None).

    Returns:
        None. Output is passed on via ``bc_q.put`` and via the queues handed
        to ``handle_errors``.
    """
    # run the neural network; only categorical-mod models yield mod weights
    if not model_info.is_cat_mod:
        mod_weights = None
        can_nmods = None
        bc_weights = model_info.run_model(raw_sig)
    else:
        bc_weights, mod_weights = model_info.run_model(
            raw_sig, mods_info.n_can_state)
        can_nmods = model_info.can_nmods

    r_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
    want_mods = mods_q is not None
    if want_mods:
        # NOTE(review): mod_weights is None when the model is not cat-mod,
        # in which case this concatenation is expected to fail -- presumably
        # callers only pass mods_q together with a cat-mod model; confirm.
        r_post_w_mods = np.concatenate([r_post, mod_weights], axis=1)

    # only hand mod weights to the decoder when mod output is requested
    decode_mod_weights = mod_weights if mods_info.do_output_mods else None
    r_seq, _, rl_cumsum, mods_scores = decode.decode_post(
        r_post, mods_info.alphabet, decode_mod_weights, can_nmods)
    if bc_q is not None:
        bc_q.put((read_id, r_seq, mods_scores))

    # without a mapping connection there is nothing left to do once the
    # basecalls have been passed out
    if caller_conn is None:
        return None

    # map read and record mapping from reference to query positions
    r_ref_seq, r_to_q_poss, r_ref_pos, r_cigar = mapping.map_read(
        r_seq, read_id, caller_conn)
    np_ref_seq = np.array(
        [mh.ALPHABET.find(base) for base in r_ref_seq], dtype=np.uintp)

    # start of the mapped portion within the posteriors, and the run-length
    # cumulative sum restricted (and shifted) to that mapped portion
    q_start = r_ref_pos.q_trim_start
    post_mapped_start = rl_cumsum[q_start]
    mapped_rl_cumsum = (
        rl_cumsum[q_start:r_ref_pos.q_trim_end + 1] - post_mapped_start)

    def read_vals():
        # per-read values forwarded (as r_vals) alongside SNP and mod calls
        return (
            read_id, r_ref_pos.chrm, r_ref_pos.strand, r_ref_pos.start,
            r_ref_seq, len(r_seq), r_ref_pos.q_trim_start,
            r_ref_pos.q_trim_end, r_cigar)

    if snps_q is not None:
        snps_args = (
            snps_data, r_ref_pos, edge_buffer, np_ref_seq, mapped_rl_cumsum,
            r_to_q_poss, r_post, post_mapped_start)
        handle_errors(
            func=snps.call_read_snps, args=snps_args, r_vals=read_vals(),
            out_q=snps_q, fast5_fn=fast5_fn, failed_reads_q=failed_reads_q)
    if want_mods:
        mods_args = (
            r_ref_pos, edge_buffer, r_ref_seq, np_ref_seq, mapped_rl_cumsum,
            r_to_q_poss, r_post_w_mods, post_mapped_start, mods_info)
        handle_errors(
            func=mods.call_read_mods, args=mods_args, r_vals=read_vals(),
            out_q=mods_q, fast5_fn=fast5_fn, failed_reads_q=failed_reads_q)
def basecall_read(
        self, sig_info, return_post_w_mods=True, return_mod_scores=False,
        update_sig_info=False):
    """Basecall a read with the configured model backend.

    For the pyguppy backend the call is delegated entirely to
    ``run_pyguppy_model``. For the taiyaki and FAST5 backends the canonical
    posteriors are obtained (from the network or from stored posteriors)
    and decoded here.

    Args:
        sig_info: read signal information; ``raw_signal`` is read for the
            taiyaki backend and ``posteriors`` for the FAST5 backend.
        return_post_w_mods: build the posteriors-with-mod-weights array
            (only produced for categorical-mod models).
        return_mod_scores: pass mod weights to the decoder so that per-base
            mod scores are computed.
        update_sig_info: forwarded to ``run_pyguppy_model``; not used by
            the other backends.

    Returns:
        The result of ``run_pyguppy_model`` for the pyguppy backend,
        otherwise the tuple ``(r_seq, r_qual, rl_cumsum, can_post,
        sig_info, post_w_mods, mods_scores)`` where ``r_qual`` is always
        None.

    Raises:
        mh.MegaError: if ``self.model_type`` is not a known backend.
    """
    backend = self.model_type
    if backend not in (TAI_NAME, FAST5_NAME, PYGUPPY_NAME):
        raise mh.MegaError('Invalid model backend')

    # decoding is performed within the pyguppy server, so short-circuit
    # here; the other backends require megalodon decoding
    if backend == PYGUPPY_NAME:
        return self.run_pyguppy_model(
            sig_info, return_post_w_mods, return_mod_scores, update_sig_info)

    post_w_mods = None
    mod_weights = None
    if backend == TAI_NAME:
        # run neural network with taiyaki
        net_mod_weights = None
        if self.is_cat_mod:
            bc_weights, net_mod_weights = self.run_taiyaki_model(
                sig_info.raw_signal, self.n_can_state)
        else:
            bc_weights = self.run_taiyaki_model(sig_info.raw_signal)
        # forward-backward algorithm on the neural network output
        can_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
        if return_post_w_mods and self.is_cat_mod:
            post_w_mods = np.concatenate(
                [can_post, net_mod_weights], axis=1)
        # only keep mod weights for decoding when mod scores are requested
        # (avoids extra computation in the decoder)
        mod_weights = net_mod_weights if return_mod_scores else None
    else:
        # FAST5 stored posteriors backend
        stored_post = sig_info.posteriors
        if not self.is_cat_mod:
            can_post = stored_post
        else:
            # split canonical posteriors from the mod transition weights
            can_post = np.ascontiguousarray(
                stored_post[:, :self.n_can_state])
            if return_mod_scores or return_post_w_mods:
                # convert raw neural net mod weights to softmax weights
                softmax_weights = self._softmax_mod_weights(
                    stored_post[:, self.n_can_state:])
                if return_post_w_mods:
                    post_w_mods = np.concatenate(
                        [can_post, softmax_weights], axis=1)
                mod_weights = softmax_weights if return_mod_scores else None

    # decode posteriors to sequence and per-base mod scores
    r_seq, _, rl_cumsum, mods_scores = decode.decode_post(
        can_post, self.can_alphabet, mod_weights, self.can_nmods)
    # TODO implement quality extraction for taiyaki and fast5 modes
    r_qual = None
    return (
        r_seq, r_qual, rl_cumsum, can_post, sig_info, post_w_mods,
        mods_scores)
def process_read(
        raw_sig, read_id, model_info, caller_conn, map_thr_buf, do_false_ref,
        context_bases=CONTEXT_BASES, edge_buffer=EDGE_BUFFER,
        max_indel_len=MAX_INDEL_LEN, all_paths=ALL_PATHS,
        every_n=TEST_EVERY_N_LOCS, max_pos_per_read=MAX_POS_PER_READ):
    """Collect SNP and indel scores for one read.

    The read is basecalled and mapped, then candidate reference positions
    are taken every ``every_n`` bases between the ``edge_buffer`` margins
    (at most ``max_pos_per_read`` of them). At each position single base
    swaps, deletions and random insertions of up to ``max_indel_len`` bases
    are scored with ``call_snp``. When ``do_false_ref`` is set the same
    positions are first scored with ``call_alt_true_indel``; attempts that
    raise ``mh.MegaError`` are skipped.

    Returns:
        list of ``(is_ref_correct, score, snp_ref_seq, snp_alt_seq)`` tuples;
        the first element is False for ``call_alt_true_indel`` results and
        True for ``call_snp`` results.

    Raises:
        NotImplementedError: if the mapped reference sequence is shorter
            than ``2 * edge_buffer``.
    """
    if model_info.is_cat_mod:
        # the mod weights returned by cat-mod models are not used here
        bc_weights, _ = model_info.run_model(
            raw_sig, n_can_state=model_info.n_can_state)
    else:
        bc_weights = model_info.run_model(raw_sig)

    r_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
    r_seq, _, rl_cumsum, _ = decode.decode_post(r_post, mh.ALPHABET)

    r_ref_seq, r_to_q_poss, r_ref_pos, _ = mapping.map_read(
        r_seq, read_id, caller_conn)
    np_ref_seq = np.array(
        [mh.ALPHABET.find(base) for base in r_ref_seq], dtype=np.uintp)
    ref_len = np_ref_seq.shape[0]
    if ref_len < edge_buffer * 2:
        raise NotImplementedError(
            'Mapping too short for calibration statistic computation.')

    # start of the mapped portion within the posteriors, and the run-length
    # cumulative sum restricted (and shifted) to that mapped portion
    q_start = r_ref_pos.q_trim_start
    post_mapped_start = rl_cumsum[q_start]
    mapped_rl_cumsum = (
        rl_cumsum[q_start:r_ref_pos.q_trim_end + 1] - post_mapped_start)

    # candidate SNP locations within the read
    snp_poss = range(
        edge_buffer, ref_len - edge_buffer, every_n)[:max_pos_per_read]
    indel_sizes = range(1, max_indel_len + 1)
    read_snp_calls = []

    if do_false_ref:
        # reference-false calls come first (an incorrect reference has to
        # be spoofed for mapping and signal remapping); per position the
        # order is: base swap (0), then +size and -size for each indel size
        signed_sizes = [0]
        for size in indel_sizes:
            signed_sizes.extend((size, -size))
        for pos in snp_poss:
            for signed_size in signed_sizes:
                try:
                    score, false_ref, false_alt = call_alt_true_indel(
                        signed_size, pos, r_ref_seq, r_seq, map_thr_buf,
                        context_bases, r_post, rl_cumsum, all_paths)
                except mh.MegaError:
                    # introduced error either causes read not to map or
                    # mapping trims the location of interest
                    continue
                read_snp_calls.append((False, score, false_ref, false_alt))

    def record_true_ref(pos, ref_seq, alt_seq):
        # score one reference-correct variant and store the result
        score = call_snp(
            r_post, post_mapped_start, pos, mapped_rl_cumsum, r_to_q_poss,
            ref_seq, alt_seq, context_bases, all_paths,
            np_ref_seq=np_ref_seq)
        read_snp_calls.append((True, score, ref_seq, alt_seq))

    # now test reference-correct SNPs
    for pos in snp_poss:
        ref_base = r_ref_seq[pos]
        # simple single base SNPs first
        for alt_base in CAN_BASES_SET.difference(ref_base):
            record_true_ref(pos, ref_base, alt_base)
        # then indels of each size
        for size in indel_sizes:
            # deletion of `size` bases following the position
            record_true_ref(pos, r_ref_seq[pos:pos + size + 1], ref_base)
            # insertion of `size` randomly chosen bases
            inserted = ref_base + ''.join(
                choice(CAN_BASES) for _ in range(size))
            record_true_ref(pos, ref_base, inserted)

    return read_snp_calls
def basecall_read(
        self, sig_info, return_post_w_mods=True, return_mod_scores=False,
        update_sig_info=False, signal_reversed=False, seq_summ_info=None):
    """Basecall a read with the configured model backend.

    For the pyguppy backend the call is delegated entirely to
    ``run_pyguppy_model``. For the taiyaki and FAST5 backends the canonical
    posteriors are obtained (from the network or from stored posteriors),
    decoded here, and ``seq_summ_info`` (if given) is updated with
    basecalling information on a best-effort basis.

    Args:
        sig_info: read signal information; ``raw_signal`` is read for the
            taiyaki backend and ``posteriors`` for the FAST5 backend.
            ``dacs``, ``channel_info`` and ``scale_params`` are read when
            updating ``seq_summ_info``.
        return_post_w_mods: build the posteriors-with-mod-weights array
            (only produced for categorical-mod models).
        return_mod_scores: pass mod weights to the decoder so that per-base
            mod scores are computed.
        update_sig_info: forwarded to ``run_pyguppy_model``; not used by
            the other backends.
        signal_reversed: forwarded to ``run_pyguppy_model``; not used by
            the other backends.
        seq_summ_info: sequencing summary record supporting ``_replace``
            (presumably a namedtuple -- confirm), or None to skip the
            summary update.

    Returns:
        The result of ``run_pyguppy_model`` for the pyguppy backend,
        otherwise the tuple ``(r_seq, r_qual, rl_cumsum, can_post,
        sig_info, post_w_mods, mods_scores, seq_summ_info)`` where
        ``r_qual`` is always None. If the summary update fails,
        ``seq_summ_info`` is returned unchanged.

    Raises:
        mh.MegaError: if ``self.model_type`` is not a known backend.
    """
    if self.model_type not in (TAI_NAME, FAST5_NAME, PYGUPPY_NAME):
        raise mh.MegaError('Invalid model backend')

    # decoding is performed within the pyguppy server, so short-circuit
    # here; the other backends require megalodon decoding
    if self.model_type == PYGUPPY_NAME:
        return self.run_pyguppy_model(
            sig_info, return_post_w_mods, return_mod_scores,
            update_sig_info, signal_reversed, seq_summ_info)

    post_w_mods = mod_weights = None
    if self.model_type == TAI_NAME:
        # run neural network with taiyaki
        if self.is_cat_mod:
            bc_weights, mod_weights = self.run_taiyaki_model(
                sig_info.raw_signal, self.n_can_state)
        else:
            bc_weights = self.run_taiyaki_model(sig_info.raw_signal)
        # perform forward-backward algorithm on neural net output
        can_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
        if return_post_w_mods and self.is_cat_mod:
            post_w_mods = np.concatenate([can_post, mod_weights], axis=1)
        # set mod_weights to None if mod_scores not requested to
        # avoid extra computation
        if not return_mod_scores:
            mod_weights = None
    else:
        # FAST5 stored posteriors backend
        if self.is_cat_mod:
            # split canonical posteriors and mod transition weights
            # producing desired return arrays
            can_post = np.ascontiguousarray(
                sig_info.posteriors[:, :self.n_can_state])
            if return_mod_scores or return_post_w_mods:
                # convert raw neural net mod weights to softmax weights
                mod_weights = self._softmax_mod_weights(
                    sig_info.posteriors[:, self.n_can_state:])
                if return_post_w_mods:
                    post_w_mods = np.concatenate(
                        [can_post, mod_weights], axis=1)
                if not return_mod_scores:
                    mod_weights = None
        else:
            can_post = sig_info.posteriors

    # decode posteriors to sequence and per-base mod scores
    r_seq, _, rl_cumsum, mods_scores = decode.decode_post(
        can_post, self.can_alphabet, mod_weights, self.can_nmods)
    # TODO implement quality extraction for taiyaki and fast5 modes
    # and add mean_qscore_template to seq summary
    r_qual = None

    if seq_summ_info is not None:
        try:
            # update seq summary info with basecalling info
            seq_summ_info = seq_summ_info._replace(
                template_start=seq_summ_info.start_time,
                template_duration='{:.6f}'.format(
                    sig_info.dacs.shape[0] /
                    sig_info.channel_info[mh.CHAN_INFO_SAMP_RATE]),
                sequence_length_template=len(r_seq),
                median_template='{:.4f}'.format(sig_info.scale_params[0]),
                mad_template='{:.4f}'.format(sig_info.scale_params[1]))
        except Exception as err:
            # Best effort by design: a problem filling in summary fields
            # must not fail the read, so the broad catch is kept and
            # seq_summ_info stays as passed in. The failure is recorded at
            # debug level instead of being silently discarded. Local
            # stdlib import so no module-level name is relied upon.
            import logging
            logging.getLogger(__name__).debug(
                'Failed to update sequencing summary info: %r', err)

    return (
        r_seq, r_qual, rl_cumsum, can_post, sig_info, post_w_mods,
        mods_scores, seq_summ_info)