def process_read(read, motifs, can_labs, mod_labs, can_post_indices,
                 model_info, edge_buffer, context_bases):
    """Yield modified-base scores at every motif hit in a labelled read.

    The read's raw signal is basecalled through ``model_info`` to obtain the
    posteriors with modified-base outputs (element 5 of the first basecalled
    result). For each position produced by ``iter_motifs`` a window of
    ``context_bases`` on either side is scored twice with
    ``mods.score_mod_seq``: once with label 0 at the central position and
    once with a non-zero modified-base label there.

    Args:
        read: object exposing ``Dacs``, ``read_id``, ``offset``, ``range``,
            ``digitisation``, ``Reference`` and ``Ref_to_signal``.
        motifs: forwarded to ``iter_motifs``.
        can_labs: array indexed by reference labels (fancy-indexed with a
            window of ``read.Reference``).
        mod_labs: array indexed the same way; a value of 0 at the central
            position selects the "reference is unmodified" branch.
        can_post_indices: forwarded to ``mods.score_mod_seq``.
        model_info: model wrapper providing ``stride``,
            ``iter_basecalled_reads``, ``can_alphabet``, ``output_alphabet``,
            ``can_base_mods`` and ``str_to_int_mod_labels``.
        edge_buffer: accepted for interface compatibility; not used here.
        context_bases: number of reference positions taken on each side of
            a motif hit.

    Yields:
        Tuples ``(ref_is_unmodified, mod_base, mod_score)`` where
        ``ref_is_unmodified`` is the literal ``True``/``False``, ``mod_base``
        is looked up in ``model_info.output_alphabet`` and ``mod_score`` is
        the label-0 score minus the modified-label score.
    """
    raw_signal = read.Dacs
    signal_data = backends.SIGNAL_DATA(
        dacs=raw_signal,
        raw_len=raw_signal.shape[0],
        fast5_fn='',
        read_id=read.read_id,
        stride=model_info.stride,
        channel_info={
            mh.CHAN_INFO_OFFSET: read.offset,
            mh.CHAN_INFO_RANGE: read.range,
            mh.CHAN_INFO_DIGI: read.digitisation,
        },
    )
    summary = mh.SEQ_SUMM_INFO('', read.read_id, '', '', '', '', 0, 0)
    basecalled = model_info.iter_basecalled_reads(
        [(signal_data, summary)], return_post_w_mods=True)
    # only a single read was submitted, so take the first result
    post_w_mods = next(basecalled)[5]

    # express the reference-to-signal mapping in units of model stride
    block_pos = np.around(
        read.Ref_to_signal / model_info.stride).astype(int)

    motif_hits = iter_motifs(motifs, model_info.can_alphabet, can_labs,
                             read.Reference, context_bases)
    for ref_pos in motif_hits:
        win_st = ref_pos - context_bases
        win_en = ref_pos + context_bases + 1
        window = read.Reference[win_st:win_en]
        # NOTE(review): indexing at win_en presumes Ref_to_signal has an
        # entry one past the last window position - confirm
        post_st = block_pos[win_st]
        post_en = block_pos[win_en]
        if post_en - post_st < window.shape[0]:
            # fewer posterior blocks than window positions; skip this hit
            continue

        can_seq = can_labs[window]
        ref_mod_seq = mod_labs[window]
        swapped_mod_seq = ref_mod_seq.copy()
        center_lab = read.Reference[ref_pos]
        if mod_labs[center_lab] == 0:
            # reference is unmodified here: substitute one of the
            # modifications listed for this base, picked with ``choice``
            can_base = model_info.output_alphabet[center_lab]
            picked_mod = choice(model_info.can_base_mods[can_base])
            alt_mod_lab = model_info.str_to_int_mod_labels[picked_mod]
            mod_base = model_info.output_alphabet[
                int(center_lab) + alt_mod_lab]
            swapped_mod_seq[context_bases] = alt_mod_lab
            ref_is_unmodified = True
            unmod_seq, modded_seq = ref_mod_seq, swapped_mod_seq
        else:
            # reference is modified here: the alternative is label 0
            mod_base = model_info.output_alphabet[center_lab]
            swapped_mod_seq[context_bases] = 0
            ref_is_unmodified = False
            unmod_seq, modded_seq = swapped_mod_seq, ref_mod_seq

        unmod_score = mods.score_mod_seq(
            post_w_mods, can_seq, unmod_seq, can_post_indices,
            post_st, post_en)
        modded_score = mods.score_mod_seq(
            post_w_mods, can_seq, modded_seq, can_post_indices,
            post_st, post_en)
        yield ref_is_unmodified, mod_base, unmod_score - modded_score
예제 #2
0
 def read_generator():
     """Yield reads taken from ``signal_q`` until a ``None`` sentinel arrives.

     ``signal_q`` comes from the enclosing scope. The queue is polled with a
     10 ms timeout; an empty poll simply retries, so this generator only
     finishes once ``None`` is received.

     Yields:
         ``(backends.SIGNAL_DATA, mh.SEQ_SUMM_INFO)`` pairs rebuilt from the
         plain tuples taken off the queue.
     """
     while True:
         try:
             queued_item = signal_q.get(timeout=0.01)
         except queue.Empty:
             # nothing available yet; poll again
             continue
         if queued_item is None:
             LOGGER.debug("Closing")
             return
         raw_sig_info, raw_seq_summ = queued_item
         # rebuild the namedtuples that were flattened for the
         # multiprocessing queue
         signal_data = backends.SIGNAL_DATA(*raw_sig_info)
         LOGGER.debug("{} Processing".format(signal_data.read_id))
         summary = mh.SEQ_SUMM_INFO(*raw_seq_summ)
         yield signal_data, summary
 def create_batch_gen():
     """Yield at most ``reads_per_batch`` reads pulled from ``sig_q``.

     ``sig_q``, ``reads_per_batch`` and ``gen_conn`` come from the enclosing
     scope. Each of the ``reads_per_batch`` attempts polls the queue with a
     10 ms timeout; an empty poll consumes the attempt without yielding, so
     a batch may contain fewer reads. Receiving ``None`` sends ``True`` over
     ``gen_conn`` and ends the batch immediately.

     Yields:
         ``(backends.SIGNAL_DATA, mh.SEQ_SUMM_INFO)`` pairs rebuilt from the
         plain tuples taken off the queue.
     """
     for _ in range(reads_per_batch):
         try:
             read_sig_data = sig_q.get(timeout=0.01)
         except queue.Empty:
             # an empty poll still uses up one of the batch's attempts
             continue
         if read_sig_data is None:
             LOGGER.debug('Closing')
             # send signal to end main loop then end this iterator
             gen_conn.send(True)
             break
         sig_info, seq_summ_info = read_sig_data
         # convert tuples back to namedtuples after multiprocessing
         sig_info = backends.SIGNAL_DATA(*sig_info)
         seq_summ_info = mh.SEQ_SUMM_INFO(*seq_summ_info)
         # log before handing the read off: code placed after a ``yield``
         # only runs when the consumer asks for the next item, which would
         # delay the message and drop it if iteration stops early
         LOGGER.debug('{} Processing'.format(sig_info.read_id))
         yield sig_info, seq_summ_info
예제 #4
0
def process_read(
    bc_res,
    caller_conn,
    map_thr_buf,
    do_false_ref,
    context_bases=mh.DEFAULT_VAR_CONTEXT_BASES,
    edge_buffer=mh.DEFAULT_EDGE_BUFFER,
    max_indel_len=MAX_INDEL_LEN,
    all_paths=ALL_PATHS,
    every_n=TEST_EVERY_N_LOCS,
    max_pos_per_read=MAX_POS_PER_READ,
):
    """Score candidate variants along one basecalled, mapped read.

    The read is mapped with ``mapping.map_read`` and candidate positions are
    taken every ``every_n`` reference positions, leaving ``edge_buffer``
    positions untouched at both ends and keeping at most
    ``max_pos_per_read`` of them. Two passes may run over these positions:

    1. If ``do_false_ref`` is true, ``call_alt_true_indel`` is called for a
       single-base swap (size 0) and for every indel size ``+k`` / ``-k``
       with ``1 <= k <= max_indel_len``. Results are recorded with a
       ``False`` flag.
    2. ``call_variant`` is always called for each alternative base at the
       position, then for a deletion and a random insertion of every indel
       size. Results are recorded with a ``True`` flag.

    Args:
        bc_res: 4-tuple ``(sig_info, called_read, rl_cumsum, can_post)``.
        caller_conn: forwarded to ``mapping.map_read``.
        map_thr_buf: forwarded to ``call_alt_true_indel``.
        do_false_ref: whether to run the first (spoofed reference) pass.
        context_bases: iterable of context sizes; forwarded to the scoring
            helpers, and its maximum sizes the N-filter window.
        edge_buffer: number of reference positions skipped at each end.
        max_indel_len: largest indel size tested.
        all_paths: forwarded to the scoring helpers.
        every_n: step between candidate positions.
        max_pos_per_read: cap on the number of candidate positions.

    Returns:
        List of ``(is_true_ref, score, var_ref_seq, var_alt_seq)`` tuples.

    Raises:
        NotImplementedError: if the mapped reference is shorter than
            ``2 * edge_buffer``.
    """
    sig_info, called_read, rl_cumsum, can_post = bc_res
    r_ref_seq, r_to_q_poss, r_ref_pos, _, _ = mapping.map_read(
        caller_conn, called_read, backends.SIGNAL_DATA(*sig_info))[0]
    np_ref_seq = mh.seq_to_int(r_ref_seq)
    if np_ref_seq.shape[0] < edge_buffer * 2:
        raise NotImplementedError(
            "Mapping too short for calibration statistic computation.")
    # get mapped start in post and run len to mapped bit of output
    post_mapped_start = rl_cumsum[r_ref_pos.q_trim_start]
    mapped_rl_cumsum = (
        rl_cumsum[r_ref_pos.q_trim_start:r_ref_pos.q_trim_end + 1] -
        post_mapped_start)

    # candidate variant locations within a read
    var_poss = list(
        range(edge_buffer, np_ref_seq.shape[0] - edge_buffer,
              every_n))[:max_pos_per_read]
    read_var_calls = []

    def score_true_ref_variant(var_pos, var_ref_seq, var_alt_seq):
        # single place holding the arguments shared by every
        # reference-correct call_variant invocation
        return call_variant(
            can_post,
            post_mapped_start,
            var_pos,
            mapped_rl_cumsum,
            r_to_q_poss,
            var_ref_seq,
            var_alt_seq,
            context_bases,
            all_paths,
            np_ref_seq=np_ref_seq,
        )

    if do_false_ref:
        # first process reference false calls (need to spoof an incorrect
        # reference for mapping and signal remapping)
        # size 0 is the single base swap; then +k / -k for each indel size
        false_ref_sizes = [0]
        for indel_size in range(1, max_indel_len + 1):
            false_ref_sizes.extend((indel_size, -indel_size))
        for r_var_pos in var_poss:
            for indel_size in false_ref_sizes:
                try:
                    score, var_ref_seq, var_alt_seq = call_alt_true_indel(
                        indel_size,
                        r_var_pos,
                        r_ref_seq,
                        called_read.seq,
                        map_thr_buf,
                        context_bases,
                        can_post,
                        rl_cumsum,
                        all_paths,
                    )
                except mh.MegaError:
                    # introduced error either causes read not to map,
                    # mapping trims the location of interest or invalid
                    # ref or alt sequence; each size is tried independently
                    continue
                read_var_calls.append(
                    (False, score, var_ref_seq, var_alt_seq))

    # now test reference correct variants
    for r_var_pos in var_poss:
        max_context = max(context_bases)
        # clamp the window start: a negative start would wrap around to the
        # end of the sequence and could make the N check pass vacuously
        context_st = max(0, r_var_pos - max_context)
        context_en = r_var_pos + max_indel_len + 1 + max_context
        if set(r_ref_seq[context_st:context_en]).difference(CAN_BASES_SET):
            # skip reference positions with N's in any context
            continue

        # test simple SNP first
        var_ref_seq = r_ref_seq[r_var_pos]
        for var_alt_seq in CAN_BASES_SET.difference(var_ref_seq):
            try:
                score = score_true_ref_variant(
                    r_var_pos, var_ref_seq, var_alt_seq)
            except mh.MegaError:
                # invalid reference or alternative sequence
                continue
            read_var_calls.append((True, score, var_ref_seq, var_alt_seq))

        # then test indels
        for indel_size in range(1, max_indel_len + 1):
            # test deletion
            var_ref_seq = r_ref_seq[r_var_pos:r_var_pos + indel_size + 1]
            var_alt_seq = r_ref_seq[r_var_pos]
            try:
                score = score_true_ref_variant(
                    r_var_pos, var_ref_seq, var_alt_seq)
            except mh.MegaError:
                # invalid reference or alternative sequence
                # NOTE(review): this also skips the insertion test for this
                # indel size, unlike the false-reference pass where every
                # test is independent - confirm this is intended
                continue
            read_var_calls.append((True, score, var_ref_seq, var_alt_seq))

            # test random insertion
            var_ref_seq = r_ref_seq[r_var_pos]
            var_alt_seq = var_ref_seq + "".join(
                choice(CAN_BASES) for _ in range(indel_size))
            try:
                score = score_true_ref_variant(
                    r_var_pos, var_ref_seq, var_alt_seq)
            except mh.MegaError:
                # invalid reference or alternative sequence
                continue
            read_var_calls.append((True, score, var_ref_seq, var_alt_seq))

    return read_var_calls