Пример #1
0
def _main(args):
    """Aggregate per-read statistics and post-process the variant output.

    Initializes logging in ``args.megalodon_directory`` (the log suffix is
    ``aggregation`` or ``aggregation.<output_suffix>``), builds the modified
    base aggregation settings, and calls ``aggregate.aggregate_stats``. When
    ``mh.VAR_NAME`` is among ``args.outputs`` the variant file is then sorted
    and indexed.

    Args:
        args: Parsed command line arguments. The attributes read are the
            ones referenced in the body (``outputs``, ``megalodon_directory``,
            ``processes``, ``mod_aggregate_method``, ``output_suffix``, ...).

    Raises:
        mh.MegaError: If ``args.mod_aggregate_method`` is neither
            ``mh.MOD_EM_NAME`` nor ``mh.MOD_BIN_THRESH_NAME``.
    """
    log_suffix = ('aggregation' if args.output_suffix is None else
                  'aggregation.' + args.output_suffix)
    logging.init_logger(args.megalodon_directory, out_suffix=log_suffix)
    LOGGER.debug('Command: """' + ' '.join(sys.argv) + '"""')

    if args.mod_aggregate_method == mh.MOD_EM_NAME:
        mod_agg_info = mods.AGG_INFO(mh.MOD_EM_NAME, None)
    elif args.mod_aggregate_method == mh.MOD_BIN_THRESH_NAME:
        mod_agg_info = mods.AGG_INFO(
            mh.MOD_BIN_THRESH_NAME, args.mod_binary_threshold)
    else:
        # Without this branch an unknown method leaves mod_agg_info unbound
        # and the failure only surfaces as an UnboundLocalError further down.
        raise mh.MegaError(
            'Invalid modified base aggregation method: {}'.format(
                args.mod_aggregate_method))
    valid_read_ids = mh.parse_read_ids(args.read_ids_filename)
    aggregate.aggregate_stats(
        args.outputs, args.megalodon_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        variants.HAPLIOD_MODE if args.haploid else variants.DIPLOID_MODE,
        mod_agg_info, args.write_mod_log_probs, args.mod_output_formats,
        args.suppress_progress, valid_read_ids, args.output_suffix,
        args.aggregate_batch_size)

    if mh.VAR_NAME in args.outputs:
        LOGGER.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.megalodon_directory, mh.VAR_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        variants.sort_variants(variant_fn, sort_variant_fn)
        LOGGER.info('Indexing output variant file')
        variants.index_variants(sort_variant_fn)
Пример #2
0
def main():
    """Merge diploid-called variants with two haplotype variant files.

    Parses the command line, opens the three input variant files, and probes
    each with a zero-length ``fetch`` on its first header contig. For every
    contig present in all three headers, the records yielded by
    ``iter_contig_vars`` are written to ``args.out_vcf`` via ``write_var``;
    tuples whose diploid record is ``None`` are skipped. The output file is
    indexed once it has been closed.

    Raises:
        mh.MegaError: If any probing ``fetch`` raises ``ValueError`` (reported
            to the user as a missing index).
    """
    args = get_parser().parse_args()

    vars0_idx = pysam.VariantFile(args.diploid_called_variants)
    vars1_idx = pysam.VariantFile(args.haplotype1_variants)
    vars2_idx = pysam.VariantFile(args.haplotype2_variants)
    try:
        contigs0 = list(vars0_idx.header.contigs.keys())
        vars0_idx.fetch(next(iter(contigs0)), 0, 0)
        contigs1 = list(vars1_idx.header.contigs.keys())
        vars1_idx.fetch(next(iter(contigs1)), 0, 0)
        contigs2 = list(vars2_idx.header.contigs.keys())
        vars2_idx.fetch(next(iter(contigs2)), 0, 0)
    except ValueError as err:
        raise mh.MegaError(
            'Variants file must be indexed. Use bgzip and tabix.') from err

    # The context manager guarantees the output handle is flushed and closed
    # even if a record fails to process part-way through.
    with open(args.out_vcf, 'w') as out_vars:
        out_vars.write(
            HEADER.format('\n'.join(
                (CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
                 for ctg in vars0_idx.header.contigs.values()))))
        shared_contigs = set(contigs0).intersection(
            contigs1).intersection(contigs2)
        for contig in shared_contigs:
            for curr_v0_rec, curr_v1_rec, curr_v2_rec in iter_contig_vars(
                    iter(vars0_idx.fetch(contig)),
                    iter(vars1_idx.fetch(contig)),
                    iter(vars2_idx.fetch(contig))):
                if curr_v0_rec is None:
                    continue
                write_var(
                    curr_v0_rec, curr_v1_rec, curr_v2_rec, out_vars, contig)

    # Index only after the file has been closed.
    variants.index_variants(args.out_vcf)
Пример #3
0
def _main(args):
    """Atomize the variants of an input VCF and write an indexed output VCF.

    Loads the reference with ``mappy.Aligner`` and the variants with
    ``variants.VarInfo``, writes a VCF header whose contig lengths are taken
    from the reference, then writes one record per variant returned by
    ``var_data.fetch_read_variants`` over each full contig. The output file
    is indexed at the end.

    Args:
        args: Parsed command line arguments; ``reference``, ``in_vcf``,
            ``max_indel_size`` and ``out_vcf`` are read.
    """
    logging.init_logger()

    LOGGER.info("Loading reference")
    aligner = mappy.Aligner(
        str(args.reference), preset=str("map-ont"), best_n=1)

    LOGGER.info("Loading variants")
    var_data = variants.VarInfo(
        args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info("Atomizing variants")
    with open(args.out_vcf, "w") as out_vars:
        # First pass over the contigs: record the reference length of each
        # one for the header, warning when the input VCF disagrees.
        ctg_lens = {}
        for ctg in contigs:
            ref_len = len(aligner.seq(ctg.name))
            if ref_len != ctg.length:
                mismatch_msg = (
                    "Mismatched contig lengths ({}) between "
                    "reference ({}) and input VCF ({}) using length from "
                    "reference")
                LOGGER.warning(
                    mismatch_msg.format(ctg.name, ref_len, ctg.length))
            ctg_lens[ctg.name] = ref_len

        # Header: fixed lines, one line per contig, then trailing meta lines.
        header_lines = list(HEADER)
        header_lines.extend(
            CONTIG_HEADER_LINE.format(ctg_name, ctg_len)
            for ctg_name, ctg_len in ctg_lens.items())
        header_lines.extend([
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(" ".join(sys.argv)),
            FIELDS_LINE,
        ])
        out_vars.write("\n".join(header_lines) + "\n")

        # Second pass: fetch variants over the whole of each contig.
        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            whole_contig = mapping.MAP_POS(
                chrm=ctg.name, strand=None, start=0, end=len(chrm_seq),
                q_trim_start=None, q_trim_end=None)
            contig_vars = var_data.fetch_read_variants(
                whole_contig, mh.seq_to_int(chrm_seq))
            for var in contig_vars:
                if var.has_context_base:
                    info_field = variants.HAS_CONTEXT_BASE_TAG
                else:
                    info_field = "."
                record = RECORD_LINE.format(
                    chrm=ctg.name,
                    pos=var.ref_start + 1,
                    rid=var.id,
                    ref=var.ref,
                    alts=",".join(var.alts),
                    info=info_field,
                )
                out_vars.write(record)

    LOGGER.info("Indexing output variant file")
    variants.index_variants(args.out_vcf)
def _main(args):
    """Merge diploid-called variants with two haplotype variant files.

    Opens the three input variant files and probes each with a ``fetch`` on
    its first header contig. For every contig in the diploid file's header,
    the record tuples yielded by ``iter_contig_vars`` are written to
    ``args.out_vcf`` via ``write_var`` while two progress bars (contigs and
    per-contig variants) are updated. The output file is indexed once it has
    been closed.

    Args:
        args: Parsed command line arguments; ``diploid_called_variants``,
            ``haplotype1_variants``, ``haplotype2_variants``, ``out_vcf`` and
            ``force_invalid_variant_processing`` are read.

    Raises:
        mh.MegaError: If any probing ``fetch`` raises ``ValueError`` (reported
            to the user as a missing index).
    """
    logging.init_logger()
    LOGGER.info("Opening VCF files.")
    source_vars = pysam.VariantFile(args.diploid_called_variants)
    h1_vars = pysam.VariantFile(args.haplotype1_variants)
    h2_vars = pysam.VariantFile(args.haplotype2_variants)
    try:
        contigs0 = list(source_vars.header.contigs.keys())
        source_vars.fetch(contigs0[0])
        h1_vars.fetch(next(iter(h1_vars.header.contigs.keys())))
        h2_vars.fetch(next(iter(h2_vars.header.contigs.keys())))
    except ValueError as err:
        raise mh.MegaError(
            "Variant files must be indexed. Use bgzip and tabix.") from err

    LOGGER.info("Processing variants.")
    # Context managers ensure the output handle and the contig progress bar
    # are closed even when processing a record raises.
    with open(args.out_vcf, "w") as out_vars:
        out_vars.write(
            HEADER.format("\n".join(
                (CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
                 for ctg in source_vars.header.contigs.values()))))
        with tqdm(
                total=len(contigs0),
                smoothing=0,
                unit=" contigs",
                dynamic_ncols=True,
                desc="Variant Processing",
                mininterval=0,
        ) as bar:
            for contig in contigs0:
                for curr_s_rec, curr_h1_rec, curr_h2_rec in tqdm(
                        iter_contig_vars(
                            get_contig_iter(source_vars, contig),
                            get_contig_iter(h1_vars, contig),
                            get_contig_iter(h2_vars, contig),
                            contig,
                            bar,
                            args.force_invalid_variant_processing,
                        ),
                        smoothing=0,
                        unit=" variants",
                        dynamic_ncols=True,
                        leave=False,
                        desc="{} Variants".format(contig),
                ):
                    write_var(
                        curr_s_rec, curr_h1_rec, curr_h2_rec, out_vars,
                        contig)
                bar.update(1)

    # Index only after the file has been closed.
    variants.index_variants(args.out_vcf)
Пример #5
0
def _main(args):
    """Atomize the variants of an input VCF and write an indexed output VCF.

    Loads the reference with ``mapping.alignerPlus`` and the variants with
    ``variants.VarData``, writes a VCF header using the contig lengths from
    the input VCF header, then writes one record per variant returned by
    ``var_data.fetch_read_variants`` over each full contig. A warning is
    logged when a contig's reference length differs from the length in the
    input VCF. The output file is indexed at the end.

    Args:
        args: Parsed command line arguments; ``reference``, ``in_vcf``,
            ``max_indel_size`` and ``out_vcf`` are read.
    """
    logging.init_logger()

    LOGGER.info('Loading reference')
    aligner = mapping.alignerPlus(
        str(args.reference), preset=str('map-ont'), best_n=1)
    aligner.add_ref_lens()

    LOGGER.info('Loading variants')
    var_data = variants.VarData(
        args.in_vcf, args.max_indel_size, keep_var_fp_open=True,
        aligner=aligner)
    contigs = var_data.variants_idx.header.contigs.values()

    LOGGER.info('Atomizing variants')
    with open(args.out_vcf, 'w') as out_vars:
        # Header: fixed lines, one line per contig, then trailing meta lines.
        header_lines = list(HEADER)
        header_lines.extend(
            CONTIG_HEADER_LINE.format(ctg.name, ctg.length)
            for ctg in contigs)
        header_lines.extend([
            variants.CONTEXT_BASE_MI_LINE,
            COMMAND_HEADER_LINE.format(' '.join(sys.argv)),
            FIELDS_LINE,
        ])
        out_vars.write('\n'.join(header_lines) + '\n')

        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            ref_len = len(chrm_seq)
            if ref_len != ctg.length:
                mismatch_msg = (
                    'Mismatched contig lengths ({}) between '
                    'reference ({}) and input VCF ({})')
                LOGGER.warning(
                    mismatch_msg.format(ctg.name, ref_len, ctg.length))

            # Fetch variants over the whole contig.
            whole_contig = mapping.MAP_POS(
                chrm=ctg.name, strand=None, start=0, end=ref_len,
                q_trim_start=None, q_trim_end=None)
            contig_vars = var_data.fetch_read_variants(
                whole_contig, mh.seq_to_int(chrm_seq))
            for var in contig_vars:
                if var.has_context_base:
                    info_field = variants.HAS_CONTEXT_BASE_TAG
                else:
                    info_field = '.'
                record = RECORD_LINE.format(
                    chrm=ctg.name,
                    pos=var.ref_start + 1,
                    rid=var.id,
                    ref=var.ref,
                    alts=','.join(var.alts),
                    info=info_field)
                out_vars.write(record)

    LOGGER.info('Indexing output variant file')
    variants.index_variants(args.out_vcf)