Example #1
def process_read(
        raw_sig, read_id, model_info, bc_q, caller_conn, snps_data, snps_q,
        mods_q, mods_info, fast5_fn, failed_reads_q, edge_buffer):
    """ Workhorse per-read megalodon function (connects all the parts)
    """
    if model_info.is_cat_mod:
        bc_weights, mod_weights = model_info.run_model(
            raw_sig, mods_info.n_can_state)
        can_nmods = model_info.can_nmods
    else:
        mod_weights, can_nmods = None, None
        bc_weights = model_info.run_model(raw_sig)

    r_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
    if mods_q is not None:
        # mods_q is only created for categorical-mod models, so mod_weights
        # is expected to be set here
        r_post_w_mods = np.concatenate([r_post, mod_weights], axis=1)
    if not mods_info.do_output_mods:
        # drop mod weights to skip per-base mod score computation in decoding
        mod_weights = None
    r_seq, score, rl_cumsum, mods_scores = decode.decode_post(
        r_post, mods_info.alphabet, mod_weights, can_nmods)
    if bc_q is not None:
        bc_q.put((read_id, r_seq, mods_scores))

    # if there is no mapping connection, return after basecalls are passed out
    if caller_conn is None:
        return

    # map read and record mapping from reference to query positions
    r_ref_seq, r_to_q_poss, r_ref_pos, r_cigar = mapping.map_read(
        r_seq, read_id, caller_conn)
    np_ref_seq = np.array([
        mh.ALPHABET.find(b) for b in r_ref_seq], dtype=np.uintp)

    # get the start of the mapped region within the posteriors and shift the
    # run-length cumsum to cover only the mapped portion of the output
    post_mapped_start = rl_cumsum[r_ref_pos.q_trim_start]
    mapped_rl_cumsum = rl_cumsum[
        r_ref_pos.q_trim_start:r_ref_pos.q_trim_end + 1] - post_mapped_start

    if snps_q is not None:
        handle_errors(
            func=snps.call_read_snps,
            args=(snps_data, r_ref_pos, edge_buffer, np_ref_seq,
                  mapped_rl_cumsum, r_to_q_poss, r_post, post_mapped_start),
            r_vals=(read_id, r_ref_pos.chrm, r_ref_pos.strand,
                    r_ref_pos.start, r_ref_seq, len(r_seq),
                    r_ref_pos.q_trim_start, r_ref_pos.q_trim_end, r_cigar),
            out_q=snps_q, fast5_fn=fast5_fn, failed_reads_q=failed_reads_q)
    if mods_q is not None:
        handle_errors(
            func=mods.call_read_mods,
            args=(r_ref_pos, edge_buffer, r_ref_seq, np_ref_seq,
                  mapped_rl_cumsum, r_to_q_poss, r_post_w_mods,
                  post_mapped_start, mods_info),
            r_vals=(read_id, r_ref_pos.chrm, r_ref_pos.strand,
                    r_ref_pos.start, r_ref_seq, len(r_seq),
                    r_ref_pos.q_trim_start, r_ref_pos.q_trim_end, r_cigar),
            out_q=mods_q, fast5_fn=fast5_fn, failed_reads_q=failed_reads_q)

    return
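
# NOTE: the handle_errors helper used above is not shown in this example.
# The following is a minimal sketch inferred only from the call sites
# (func, args, r_vals, out_q, fast5_fn, failed_reads_q); the actual
# megalodon implementation may differ.
def handle_errors(func, args, r_vals, out_q, fast5_fn, failed_reads_q):
    try:
        # run the per-read caller and pass its result, along with the
        # read-level values, to the consumer queue
        out_q.put((func(*args), r_vals))
    except mh.MegaError as e:
        # expected per-read failures are reported rather than raised
        failed_reads_q.put((str(e), fast5_fn))
    except Exception as e:
        failed_reads_q.put(('Unexpected error: {}'.format(e), fast5_fn))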
Example #2
    def basecall_read(
            self, sig_info, return_post_w_mods=True, return_mod_scores=False,
            update_sig_info=False):
        if self.model_type not in (TAI_NAME, FAST5_NAME, PYGUPPY_NAME):
            raise mh.MegaError('Invalid model backend')

        # decoding is performed within the pyguppy server, so short-circuit
        # the return here; the remaining backends require megalodon decoding.
        if self.model_type == PYGUPPY_NAME:
            return self.run_pyguppy_model(
                sig_info, return_post_w_mods, return_mod_scores,
                update_sig_info)

        post_w_mods = mod_weights = None
        if self.model_type == TAI_NAME:
            # run neural network with taiyaki
            if self.is_cat_mod:
                bc_weights, mod_weights = self.run_taiyaki_model(
                    sig_info.raw_signal, self.n_can_state)
            else:
                bc_weights = self.run_taiyaki_model(sig_info.raw_signal)
            # perform forward-backward algorithm on neural net output
            can_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
            if return_post_w_mods and self.is_cat_mod:
                post_w_mods = np.concatenate([can_post, mod_weights], axis=1)
            # set mod_weights to None if mod_scores not requested to
            # avoid extra computation
            if not return_mod_scores:
                mod_weights = None
        else:
            # FAST5 stored posteriors backend
            if self.is_cat_mod:
                # split canonical posteriors and mod transition weights
                # producing desired return arrays
                can_post = np.ascontiguousarray(
                    sig_info.posteriors[:, :self.n_can_state])
                if return_mod_scores or return_post_w_mods:
                    # convert raw neural net mod weights to softmax weights
                    mod_weights = self._softmax_mod_weights(
                        sig_info.posteriors[:, self.n_can_state:])
                    if return_post_w_mods:
                        post_w_mods = np.concatenate(
                            [can_post, mod_weights], axis=1)
                    if not return_mod_scores:
                        mod_weights = None
            else:
                can_post = sig_info.posteriors

        # decode posteriors to sequence and per-base mod scores
        r_seq, _, rl_cumsum, mods_scores = decode.decode_post(
            can_post, self.can_alphabet, mod_weights, self.can_nmods)
        # TODO implement quality extraction for taiyaki and fast5 modes
        r_qual = None

        return (r_seq, r_qual, rl_cumsum, can_post, sig_info, post_w_mods,
                mods_scores)
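
# A minimal usage sketch for the method above, assuming a configured model
# backend instance (model_info) and a signal container (sig_info) with a
# raw_signal attribute; the surrounding names are illustrative only.
(r_seq, r_qual, rl_cumsum, can_post, sig_info, post_w_mods,
 mods_scores) = model_info.basecall_read(
     sig_info, return_post_w_mods=True, return_mod_scores=False)
# rl_cumsum maps each basecalled position to its offset within can_post
# (cumulative run lengths produced by the decoder)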
Example #3
def process_read(raw_sig,
                 read_id,
                 model_info,
                 caller_conn,
                 map_thr_buf,
                 do_false_ref,
                 context_bases=CONTEXT_BASES,
                 edge_buffer=EDGE_BUFFER,
                 max_indel_len=MAX_INDEL_LEN,
                 all_paths=ALL_PATHS,
                 every_n=TEST_EVERY_N_LOCS,
                 max_pos_per_read=MAX_POS_PER_READ):
    if model_info.is_cat_mod:
        bc_weights, mod_weights = model_info.run_model(
            raw_sig, n_can_state=model_info.n_can_state)
    else:
        bc_weights = model_info.run_model(raw_sig)

    r_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
    r_seq, score, rl_cumsum, _ = decode.decode_post(r_post, mh.ALPHABET)

    r_ref_seq, r_to_q_poss, r_ref_pos, _ = mapping.map_read(
        r_seq, read_id, caller_conn)
    np_ref_seq = np.array([mh.ALPHABET.find(b) for b in r_ref_seq],
                          dtype=np.uintp)
    if np_ref_seq.shape[0] < edge_buffer * 2:
        raise NotImplementedError(
            'Mapping too short for calibration statistic computation.')
    # get the start of the mapped region within the posteriors and shift the
    # run-length cumsum to cover only the mapped portion of the output
    post_mapped_start = rl_cumsum[r_ref_pos.q_trim_start]
    mapped_rl_cumsum = rl_cumsum[
        r_ref_pos.q_trim_start:r_ref_pos.q_trim_end + 1] - post_mapped_start

    # candidate SNP locations within a read
    snp_poss = list(
        range(edge_buffer, np_ref_seq.shape[0] - edge_buffer,
              every_n))[:max_pos_per_read]
    read_snp_calls = []

    if do_false_ref:
        # first process false-reference calls (an incorrect reference must be
        # spoofed for mapping and signal re-mapping)
        for r_snp_pos in snp_poss:
            # first test single-base swap SNPs
            try:
                score, snp_ref_seq, snp_alt_seq = call_alt_true_indel(
                    0, r_snp_pos, r_ref_seq, r_seq, map_thr_buf, context_bases,
                    r_post, rl_cumsum, all_paths)
                read_snp_calls.append((False, score, snp_ref_seq, snp_alt_seq))
            except mh.MegaError:
                # introduced error either causes read not to map or
                # mapping trims the location of interest
                pass
            # then test small indels
            for indel_size in range(1, max_indel_len + 1):
                try:
                    score, snp_ref_seq, snp_alt_seq = call_alt_true_indel(
                        indel_size, r_snp_pos, r_ref_seq, r_seq, map_thr_buf,
                        context_bases, r_post, rl_cumsum, all_paths)
                    read_snp_calls.append(
                        (False, score, snp_ref_seq, snp_alt_seq))
                except mh.MegaError:
                    pass
                try:
                    score, snp_ref_seq, snp_alt_seq = call_alt_true_indel(
                        -indel_size, r_snp_pos, r_ref_seq, r_seq, map_thr_buf,
                        context_bases, r_post, rl_cumsum, all_paths)
                    read_snp_calls.append(
                        (False, score, snp_ref_seq, snp_alt_seq))
                except mh.MegaError:
                    pass

    # now test reference-correct SNPs
    for r_snp_pos in snp_poss:
        # test simple SNP first
        snp_ref_seq = r_ref_seq[r_snp_pos]
        for snp_alt_seq in CAN_BASES_SET.difference(snp_ref_seq):
            score = call_snp(r_post,
                             post_mapped_start,
                             r_snp_pos,
                             mapped_rl_cumsum,
                             r_to_q_poss,
                             snp_ref_seq,
                             snp_alt_seq,
                             context_bases,
                             all_paths,
                             np_ref_seq=np_ref_seq)
            read_snp_calls.append((True, score, snp_ref_seq, snp_alt_seq))

        # then test indels
        for indel_size in range(1, max_indel_len + 1):
            # test deletion
            snp_ref_seq = r_ref_seq[r_snp_pos:r_snp_pos + indel_size + 1]
            snp_alt_seq = r_ref_seq[r_snp_pos]
            score = call_snp(r_post,
                             post_mapped_start,
                             r_snp_pos,
                             mapped_rl_cumsum,
                             r_to_q_poss,
                             snp_ref_seq,
                             snp_alt_seq,
                             context_bases,
                             all_paths,
                             np_ref_seq=np_ref_seq)
            read_snp_calls.append((True, score, snp_ref_seq, snp_alt_seq))

            # test random insertion
            snp_ref_seq = r_ref_seq[r_snp_pos]
            snp_alt_seq = snp_ref_seq + ''.join(
                choice(CAN_BASES) for _ in range(indel_size))
            score = call_snp(r_post,
                             post_mapped_start,
                             r_snp_pos,
                             mapped_rl_cumsum,
                             r_to_q_poss,
                             snp_ref_seq,
                             snp_alt_seq,
                             context_bases,
                             all_paths,
                             np_ref_seq=np_ref_seq)
            read_snp_calls.append((True, score, snp_ref_seq, snp_alt_seq))

    return read_snp_calls
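
# Each entry of read_snp_calls is a (ref_is_correct, score, snp_ref_seq,
# snp_alt_seq) tuple: False marks the spoofed-reference calls and True the
# reference-correct calls. A minimal sketch (helper name is illustrative)
# of splitting the two populations, e.g. for score calibration:
def split_calls_for_calibration(read_snp_calls):
    ref_scores, alt_scores = [], []
    for ref_is_correct, score, _, _ in read_snp_calls:
        # reference-correct calls sample the "reference is true" score
        # distribution; spoofed calls sample the alternative distribution
        (ref_scores if ref_is_correct else alt_scores).append(score)
    return ref_scores, alt_scores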
Example #4
    def basecall_read(
            self, sig_info, return_post_w_mods=True, return_mod_scores=False,
            update_sig_info=False, signal_reversed=False,
            seq_summ_info=None):
        if self.model_type not in (TAI_NAME, FAST5_NAME, PYGUPPY_NAME):
            raise mh.MegaError('Invalid model backend')

        # decoding is performed within the pyguppy server, so short-circuit
        # the return here; the remaining backends require megalodon decoding.
        if self.model_type == PYGUPPY_NAME:
            return self.run_pyguppy_model(
                sig_info, return_post_w_mods, return_mod_scores,
                update_sig_info, signal_reversed, seq_summ_info)

        post_w_mods = mod_weights = None
        if self.model_type == TAI_NAME:
            # run neural network with taiyaki
            if self.is_cat_mod:
                bc_weights, mod_weights = self.run_taiyaki_model(
                    sig_info.raw_signal, self.n_can_state)
            else:
                bc_weights = self.run_taiyaki_model(sig_info.raw_signal)
            # perform forward-backward algorithm on neural net output
            can_post = decode.crf_flipflop_trans_post(bc_weights, log=True)
            if return_post_w_mods and self.is_cat_mod:
                post_w_mods = np.concatenate([can_post, mod_weights], axis=1)
            # set mod_weights to None if mod_scores not requested to
            # avoid extra computation
            if not return_mod_scores:
                mod_weights = None
        else:
            # FAST5 stored posteriors backend
            if self.is_cat_mod:
                # split canonical posteriors and mod transition weights
                # producing desired return arrays
                can_post = np.ascontiguousarray(
                    sig_info.posteriors[:, :self.n_can_state])
                if return_mod_scores or return_post_w_mods:
                    # convert raw neural net mod weights to softmax weights
                    mod_weights = self._softmax_mod_weights(
                        sig_info.posteriors[:, self.n_can_state:])
                    if return_post_w_mods:
                        post_w_mods = np.concatenate(
                            [can_post, mod_weights], axis=1)
                    if not return_mod_scores:
                        mod_weights = None
            else:
                can_post = sig_info.posteriors

        # decode posteriors to sequence and per-base mod scores
        r_seq, _, rl_cumsum, mods_scores = decode.decode_post(
            can_post, self.can_alphabet, mod_weights, self.can_nmods)
        # TODO implement quality extraction for taiyaki and fast5 modes
        # and add mean_qscore_template to seq summary
        r_qual = None

        if seq_summ_info is not None:
            try:
                # update seq summary info with basecalling info
                seq_summ_info = seq_summ_info._replace(
                    template_start=seq_summ_info.start_time,
                    template_duration='{:.6f}'.format(
                        sig_info.dacs.shape[0] /
                        sig_info.channel_info[mh.CHAN_INFO_SAMP_RATE]),
                    sequence_length_template=len(r_seq),
                    median_template='{:.4f}'.format(sig_info.scale_params[0]),
                    mad_template='{:.4f}'.format(sig_info.scale_params[1]))
            except Exception:
                # leave the sequencing summary unchanged if any required
                # attribute is missing or malformed
                pass

        return (r_seq, r_qual, rl_cumsum, can_post, sig_info, post_w_mods,
                mods_scores, seq_summ_info)
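
# seq_summ_info is treated above as a namedtuple (note the _replace call).
# A minimal sketch of a compatible record, restricted to the fields touched
# in this example; the real sequencing-summary record carries more columns.
import collections

SeqSummaryInfo = collections.namedtuple('SeqSummaryInfo', (
    'start_time', 'template_start', 'template_duration',
    'sequence_length_template', 'median_template', 'mad_template'))

# fields are populated upstream from the read metadata; start with empty
# placeholders here
seq_summ_info = SeqSummaryInfo(*([None] * len(SeqSummaryInfo._fields)))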