def call_variant(
        can_post, post_mapped_start, r_var_pos, rl_cumsum, r_to_q_poss,
        var_ref_seq, var_alt_seq, context_bases, all_paths,
        np_ref_seq=None, ref_seq=None):
    """Score a candidate variant as (reference score - alternative score).

    A window of reference sequence around the variant is extracted, an
    alternative version of that window is built by swapping ``var_ref_seq``
    for ``var_alt_seq``, and both windows are scored against the same block
    of ``can_post`` with ``variants.score_seq``.

    Args:
        can_post: Posterior array handed unchanged to ``variants.score_seq``.
        post_mapped_start: Offset added to the block bounds before scoring.
        r_var_pos: Variant start position within the reference sequence.
        rl_cumsum: Array indexed by query position that yields the block
            bounds (presumably cumulative run lengths -- confirm with caller).
        r_to_q_poss: Lookup from reference position to query position.
        var_ref_seq: Reference allele; only its length and position are used
            to cut the window.
        var_alt_seq: Alternative allele, converted with ``mh.seq_to_int``.
        context_bases: Indexable pair; element 0 is the context size used
            when both alleles have equal length, element 1 otherwise.
        all_paths: Flag forwarded to ``variants.score_seq``.
        np_ref_seq: Integer-encoded reference (needs ``.shape`` and slicing).
            Used only when ``ref_seq`` is None.
        ref_seq: Reference sequence that is sliced and then converted with
            ``mh.seq_to_int``. Takes precedence over ``np_ref_seq``.

    Returns:
        ``loc_ref_score - loc_alt_score``, or ``np.nan`` when the posterior
        block holds fewer positions than the longer of the two windows.
    """
    var_context_bases = (context_bases[0]
                         if len(var_ref_seq) == len(var_alt_seq) else
                         context_bases[1])
    # context before the variant, clamped so the window cannot start before
    # the beginning of the reference
    pos_bb = min(var_context_bases, r_var_pos)
    if ref_seq is None:
        # context after the variant, clamped to the end of the reference
        pos_ab = min(var_context_bases,
                     np_ref_seq.shape[0] - r_var_pos - len(var_ref_seq))
        pos_ref_seq = np_ref_seq[r_var_pos - pos_bb:
                                 r_var_pos + pos_ab + len(var_ref_seq)]
    else:
        pos_ab = min(var_context_bases,
                     len(ref_seq) - r_var_pos - len(var_ref_seq))
        pos_ref_seq = mh.seq_to_int(ref_seq[
            r_var_pos - pos_bb:r_var_pos + pos_ab + len(var_ref_seq)])

    # alternative window: same flanking context with the allele swapped
    pos_alt_seq = np.concatenate([
        pos_ref_seq[:pos_bb], mh.seq_to_int(var_alt_seq),
        pos_ref_seq[pos_bb + len(var_ref_seq):]])
    blk_start = rl_cumsum[r_to_q_poss[r_var_pos - pos_bb]]
    # NOTE(review): the end index does not include len(var_ref_seq) although
    # pos_ref_seq does -- confirm this is intended for multi-base ref alleles.
    blk_end = rl_cumsum[r_to_q_poss[r_var_pos + pos_ab] + 1]

    if blk_end - blk_start < max(len(pos_ref_seq), len(pos_alt_seq)):
        # np.nan is the canonical spelling; the np.NAN alias was removed in
        # NumPy 2.0 and would raise AttributeError here.
        return np.nan
    loc_ref_score = variants.score_seq(
        can_post, pos_ref_seq, post_mapped_start + blk_start,
        post_mapped_start + blk_end, all_paths)
    loc_alt_score = variants.score_seq(
        can_post, pos_alt_seq, post_mapped_start + blk_start,
        post_mapped_start + blk_end, all_paths)

    return loc_ref_score - loc_alt_score
예제 #2
0
def _main(args):
    """Write the variants of ``args.in_vcf`` to ``args.out_vcf`` and index it.

    The reference at ``args.reference`` is loaded with mappy, variants are
    loaded through ``variants.VarInfo``, and every variant yielded by
    ``fetch_read_variants`` over each full contig is written as one record.
    Contig lengths in the output header come from the reference; a warning is
    logged for each contig whose length differs from the input VCF header.

    Args:
        args: Parsed command-line arguments providing ``reference``,
            ``in_vcf``, ``max_indel_size`` and ``out_vcf``.
    """
    logging.init_logger()

    LOGGER.info("Loading reference")
    aligner = mappy.Aligner(
        str(args.reference), preset=str("map-ont"), best_n=1)

    LOGGER.info("Loading variants")
    var_data = variants.VarInfo(
        args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info("Atomizing variants")
    length_warning = (
        "Mismatched contig lengths ({}) between reference ({}) and input "
        "VCF ({}) using length from reference")
    with open(args.out_vcf, "w") as out_vars:
        # first pass: collect reference lengths for the header contig lines
        ref_lens = {}
        for contig in contigs:
            ref_len = len(aligner.seq(contig.name))
            if ref_len != contig.length:
                LOGGER.warning(
                    length_warning.format(
                        contig.name, ref_len, contig.length))
            ref_lens[contig.name] = ref_len

        contig_lines = [
            CONTIG_HEADER_LINE.format(name, ref_len)
            for name, ref_len in ref_lens.items()
        ]
        trailer_lines = [
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(" ".join(sys.argv)),
            FIELDS_LINE,
        ]
        header_text = "\n".join(HEADER + contig_lines + trailer_lines)
        out_vars.write(header_text + "\n")

        # second pass: emit one record per variant, contig by contig
        for contig in contigs:
            chrm_seq = aligner.seq(contig.name)
            # mapping position spanning the entire contig
            whole_contig = mapping.MAP_POS(
                chrm=contig.name,
                strand=None,
                start=0,
                end=len(chrm_seq),
                q_trim_start=None,
                q_trim_end=None,
            )
            contig_vars = var_data.fetch_read_variants(
                whole_contig, mh.seq_to_int(chrm_seq))
            for var in contig_vars:
                if var.has_context_base:
                    info = variants.HAS_CONTEXT_BASE_TAG
                else:
                    info = "."
                record = RECORD_LINE.format(
                    chrm=contig.name,
                    # ref_start is shifted by one for output (presumably
                    # 0-based to 1-based VCF coordinates -- confirm)
                    pos=var.ref_start + 1,
                    rid=var.id,
                    ref=var.ref,
                    alts=",".join(var.alts),
                    info=info,
                )
                out_vars.write(record)

    LOGGER.info("Indexing output variant file")
    variants.index_variants(args.out_vcf)
예제 #3
0
def _main(args):
    """Write the variants of ``args.in_vcf`` to ``args.out_vcf`` and index it.

    The reference is loaded through ``mapping.alignerPlus``, variants through
    ``variants.VarData``. The output header reuses the contig names and
    lengths from the input VCF header. While records are written, a warning
    is logged for every contig whose reference length differs from the
    length declared in the VCF.

    Args:
        args: Parsed command-line arguments providing ``reference``,
            ``in_vcf``, ``max_indel_size`` and ``out_vcf``.
    """
    logging.init_logger()

    LOGGER.info('Loading reference')
    aligner = mapping.alignerPlus(
        str(args.reference), preset=str('map-ont'), best_n=1)
    aligner.add_ref_lens()

    LOGGER.info('Loading variants')
    var_data = variants.VarData(
        args.in_vcf,
        args.max_indel_size,
        keep_var_fp_open=True,
        aligner=aligner)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info('Atomizing variants')
    length_warning = (
        'Mismatched contig lengths ({}) between reference ({}) and input '
        'VCF ({})')
    with open(args.out_vcf, 'w') as out_vars:
        contig_lines = [
            CONTIG_HEADER_LINE.format(contig.name, contig.length)
            for contig in contigs
        ]
        trailer_lines = [
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(' '.join(sys.argv)),
            FIELDS_LINE,
        ]
        header_text = '\n'.join(HEADER + contig_lines + trailer_lines)
        out_vars.write(header_text + '\n')

        for contig in contigs:
            chrm_seq = aligner.seq(contig.name)
            ref_len = len(chrm_seq)
            if ref_len != contig.length:
                LOGGER.warning(
                    length_warning.format(
                        contig.name, ref_len, contig.length))
            # mapping position spanning the entire contig
            whole_contig = mapping.MAP_POS(
                chrm=contig.name,
                strand=None,
                start=0,
                end=ref_len,
                q_trim_start=None,
                q_trim_end=None)
            contig_vars = var_data.fetch_read_variants(
                whole_contig, mh.seq_to_int(chrm_seq))
            for var in contig_vars:
                if var.has_context_base:
                    info = variants.HAS_CONTEXT_BASE_TAG
                else:
                    info = '.'
                record = RECORD_LINE.format(
                    chrm=contig.name,
                    # ref_start is shifted by one for output (presumably
                    # 0-based to 1-based VCF coordinates -- confirm)
                    pos=var.ref_start + 1,
                    rid=var.id,
                    ref=var.ref,
                    alts=','.join(var.alts),
                    info=info)
                out_vars.write(record)

    LOGGER.info('Indexing output variant file')
    variants.index_variants(args.out_vcf)
예제 #4
0
def process_read(
    bc_res,
    caller_conn,
    map_thr_buf,
    do_false_ref,
    context_bases=mh.DEFAULT_VAR_CONTEXT_BASES,
    edge_buffer=mh.DEFAULT_EDGE_BUFFER,
    max_indel_len=MAX_INDEL_LEN,
    all_paths=ALL_PATHS,
    every_n=TEST_EVERY_N_LOCS,
    max_pos_per_read=MAX_POS_PER_READ,
):
    """Score synthetic variants along one mapped read.

    The read is mapped, candidate positions are sampled every ``every_n``
    reference bases (leaving ``edge_buffer`` bases at both ends, capped at
    ``max_pos_per_read`` positions), and at each position SNPs, deletions and
    insertions are scored.

    Args:
        bc_res: 4-tuple ``(sig_info, called_read, rl_cumsum, can_post)``.
        caller_conn: Connection handed to ``mapping.map_read``.
        map_thr_buf: Buffer forwarded to ``call_alt_true_indel``.
        do_false_ref: When truthy, also score variants via
            ``call_alt_true_indel``; these are recorded with a ``False`` flag.
        context_bases: Sequence of context sizes; its maximum bounds the
            window checked for non-canonical bases.
        edge_buffer: Number of reference bases skipped at each end.
        max_indel_len: Largest indel size tested (sizes 1..max_indel_len).
        all_paths: Flag forwarded to the scoring functions.
        every_n: Step between candidate positions.
        max_pos_per_read: Maximum number of candidate positions.

    Returns:
        List of ``(is_true_ref, score, var_ref_seq, var_alt_seq)`` tuples;
        ``is_true_ref`` is False for ``call_alt_true_indel`` results and
        True for ``call_variant`` results.

    Raises:
        NotImplementedError: If the mapped reference is shorter than
            ``2 * edge_buffer``.
    """
    sig_info, called_read, rl_cumsum, can_post = bc_res
    r_ref_seq, r_to_q_poss, r_ref_pos, _, _ = mapping.map_read(
        caller_conn, called_read, backends.SIGNAL_DATA(*sig_info))[0]
    np_ref_seq = mh.seq_to_int(r_ref_seq)
    if np_ref_seq.shape[0] < edge_buffer * 2:
        raise NotImplementedError(
            "Mapping too short for calibration statistic computation.")
    # get mapped start in post and run len to mapped bit of output
    post_mapped_start = rl_cumsum[r_ref_pos.q_trim_start]
    mapped_rl_cumsum = (
        rl_cumsum[r_ref_pos.q_trim_start:r_ref_pos.q_trim_end + 1] -
        post_mapped_start)

    # candidate variant locations within a read
    var_poss = list(
        range(edge_buffer, np_ref_seq.shape[0] - edge_buffer,
              every_n))[:max_pos_per_read]
    read_var_calls = []

    if do_false_ref:
        # first process reference false calls (need to spoof an incorrect
        # reference for mapping and signal remapping).
        # Size 0 is the single base swap SNP; each indel size is then tried
        # with positive sign first, negative sign second.
        false_ref_sizes = [0]
        for indel_size in range(1, max_indel_len + 1):
            false_ref_sizes.extend((indel_size, -indel_size))
        for r_var_pos in var_poss:
            for signed_size in false_ref_sizes:
                try:
                    score, var_ref_seq, var_alt_seq = call_alt_true_indel(
                        signed_size,
                        r_var_pos,
                        r_ref_seq,
                        called_read.seq,
                        map_thr_buf,
                        context_bases,
                        can_post,
                        rl_cumsum,
                        all_paths,
                    )
                except mh.MegaError:
                    # introduced error either causes read not to map,
                    # mapping trims the location of interest or invalid ref
                    # or alt sequence
                    continue
                read_var_calls.append(
                    (False, score, var_ref_seq, var_alt_seq))

    def _record_true_ref_call(r_var_pos, var_ref_seq, var_alt_seq):
        """Score one reference-correct variant; return False on MegaError."""
        try:
            score = call_variant(
                can_post,
                post_mapped_start,
                r_var_pos,
                mapped_rl_cumsum,
                r_to_q_poss,
                var_ref_seq,
                var_alt_seq,
                context_bases,
                all_paths,
                np_ref_seq=np_ref_seq,
            )
        except mh.MegaError:
            # invalid reference or alternative sequence
            return False
        read_var_calls.append((True, score, var_ref_seq, var_alt_seq))
        return True

    # now test reference correct variants
    max_context = max(context_bases)
    for r_var_pos in var_poss:
        # Clamp at 0: a negative start would wrap around to the end of the
        # sequence and yield an (almost always empty) window, letting
        # positions with non-canonical context slip through the check.
        ctx_start = max(0, r_var_pos - max_context)
        ctx_end = r_var_pos + max_indel_len + 1 + max_context
        if set(r_ref_seq[ctx_start:ctx_end]).difference(CAN_BASES_SET):
            # skip reference positions with N's in any context
            continue

        # test simple SNP first
        ref_base = r_ref_seq[r_var_pos]
        for alt_base in CAN_BASES_SET.difference(ref_base):
            _record_true_ref_call(r_var_pos, ref_base, alt_base)

        # then test indels
        for indel_size in range(1, max_indel_len + 1):
            # test deletion
            deletion_ref = r_ref_seq[r_var_pos:r_var_pos + indel_size + 1]
            if not _record_true_ref_call(r_var_pos, deletion_ref, ref_base):
                # NOTE(review): a failed deletion also skips the insertion
                # of the same size; kept as in the original -- confirm this
                # is intended.
                continue

            # test random insertion
            insertion_alt = ref_base + "".join(
                choice(CAN_BASES) for _ in range(indel_size))
            _record_true_ref_call(r_var_pos, ref_base, insertion_alt)

    return read_var_calls