Example #1
def open_pyguppy_backend(args):
    args.do_not_use_guppy_server = False
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting guppy "
            "logs."
        )
    backend_params = backends.parse_backend_params(args)
    model_info = None
    try:
        model_info = backends.ModelInfo(backend_params, args.processes)
        # if spawning multiple workers, run this inside each newly
        # spawned process
        model_info.prep_model_worker()
        LOGGER.info(model_info.get_alphabet_str())
        LOGGER.info(
            "Model structure:\n\tStride: {}\n\tState size: {}".format(
                model_info.stride, model_info.output_size
            )
        )
        # use model_info.iter_basecalled_reads to basecall reads and return
        # relevant signal-anchored information.
        model_info.client.disconnect()
    finally:
        # ensure guppy server is closed in finally block
        if model_info is not None:
            model_info.close()
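A minimal invocation sketch for the function above. The attribute names on the namespace are assumptions inferred from what open_pyguppy_backend reads; backends.parse_backend_params will consume additional backend options not shown here, so this is illustrative, not a documented megalodon interface.

import argparse

# hypothetical call site (attribute names are assumptions, not megalodon API)
args = argparse.Namespace(
    output_directory="guppy_logs",  # guppy server log directory (assumed)
    processes=4,                    # basecall worker count (assumed)
)
open_pyguppy_backend(args)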
Example #2
def _main(args):
    try:
        mh.mkdir(args.guppy_logs_output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting " +
            "guppy logs.")
    logging.init_logger(args.guppy_logs_output_directory)
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    args.outputs = [mh.PR_VAR_NAME]

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        LOGGER.info("Loading reference.")
        aligner = mappy.Aligner(str(args.reference),
                                preset=str("map-ont"),
                                best_n=1)

        process_all_reads(
            args.fast5s_dir,
            not args.not_recursive,
            args.num_reads,
            args.read_ids_filename,
            model_info,
            aligner,
            args.processes,
            args.output,
            args.suppress_progress,
            args.compute_false_reference_scores,
        )
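Note the contrast with Example #1: here backends.ModelInfo is used as a context manager, so the guppy server is shut down on exit without an explicit try/finally. A sketch of the assumed equivalence (inferred from Example #1's finally block, not from inspected megalodon source):

num_procs = 4  # placeholder worker count
with backends.ModelInfo(backend_params, num_procs) as model_info:
    ...  # basecall / process reads here
# on exit (normal or via exception) __exit__ is presumed to call
# model_info.close(), matching the explicit finally block in Example #1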
Example #3
def main():
    args = get_parser().parse_args()
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        sys.stderr.write(
            '***** WARNING ***** Guppy logs output directory exists. ' +
            'Potentially overwriting guppy logs.\n')

    sys.stderr.write('Loading model.\n')
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        sys.stderr.write('Loading reference.\n')
        aligner = mapping.alignerPlus(str(args.reference),
                                      preset=str('map-ont'),
                                      best_n=1)

        process_all_reads(args.fast5s_dir, args.num_reads,
                          args.read_ids_filename, model_info, aligner,
                          args.processes, args.output, args.suppress_progress,
                          args.compute_false_reference_scores)
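This example and later ones use mapping.alignerPlus where Example #2 uses mappy.Aligner directly, and Example #5 below calls aligner.add_ref_lens() before reading aligner.ref_names_and_lens. A hypothetical sketch of such a wrapper, consistent with those call sites but not megalodon's actual implementation:

import mappy

class alignerPlus(mappy.Aligner):
    # hypothetical wrapper consistent with the call sites in these examples
    ref_names_and_lens = None
    out_fmt = "bam"  # assumed attribute (see aligner.out_fmt in Example #9)
    ref_fn = None    # assumed attribute (see aligner.ref_fn in Example #9)

    def add_ref_lens(self):
        # record (names, lengths) for every contig in the loaded index;
        # mappy exposes seq_names and seq() on Aligner objects
        names = list(self.seq_names)
        lens = [len(self.seq(name)) for name in names]
        self.ref_names_and_lens = (names, lens)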
Example #4
def _main(args):
    logging.init_logger(log_fn=args.log_filename, quiet=args.quiet)
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            + "guppy logs."
        )
    args = add_trim_guppy_none(args)
    args.outputs = [mh.PR_MOD_NAME]
    # make edge_buffer >= context_bases to simplify processing
    if args.edge_buffer < args.mod_context_bases:
        LOGGER.warning(
            "[--edge-buffer] less than [--mod-context-bases]. Setting "
            + "[--edge-buffer] to value from [--mod-context-bases]"
        )
        args.edge_buffer = args.mod_context_bases

    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        check_map_sig_alphabet(model_info, args.mapped_signal_file)
        motifs = parse_motifs(args.motif, model_info, args.modified_bases_set)
        can_labs, mod_labs = extract_label_conversions(model_info)
        can_post_indices = model_info.can_indices.astype(np.uintp)
        all_mod_llrs, all_can_llrs = compute_diff_scores(
            args.mapped_signal_file,
            model_info,
            args.mod_context_bases,
            args.edge_buffer,
            args.num_reads,
            motifs,
            can_labs,
            mod_labs,
            can_post_indices,
        )

    mod_summary = [
        (
            mod,
            len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
            len(all_can_llrs[mod]) if mod in all_can_llrs else 0,
        )
        for mod in set(all_mod_llrs).union(all_can_llrs)
    ]
    LOGGER.info(
        "Data summary:\n\tmod\tmod_N\tcan_N\n"
        + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary)
    )
    output_mods_data(all_mod_llrs, all_can_llrs, args.out_filename)
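A toy illustration of the summary construction above, with invented LLR lists, showing what mod_summary holds before it is logged:

# invented data; the real dicts map a mod base to a list of per-site LLRs
all_mod_llrs = {"m": [1.2, -0.3]}
all_can_llrs = {"m": [0.1], "h": [0.5]}
mod_summary = [
    (mod,
     len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
     len(all_can_llrs[mod]) if mod in all_can_llrs else 0)
    for mod in set(all_mod_llrs).union(all_can_llrs)
]
# e.g. [("m", 2, 1), ("h", 0, 1)]  (set iteration order is not guaranteed)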
Example #5
def main():
    args = get_parser().parse_args()
    log_suffix = ('aggregation' if args.output_suffix is None else
                  'aggregation.' + args.output_suffix)
    logging.init_logger(args.output_directory, out_suffix=log_suffix)
    logger = logging.get_logger()

    mod_agg_info = mods.AGG_INFO(mods.BIN_THRESH_NAME,
                                 args.mod_binary_threshold)
    mod_names = []
    if mh.MOD_NAME in args.outputs:
        logger.info('Loading model.')
        mod_names = backends.ModelInfo(
            mh.get_model_fn(args.taiyaki_model_filename)).mod_long_names
    if args.reference is not None:
        logger.info('Loading reference.')
    aligner = mapping.alignerPlus(str(args.reference),
                                  preset=str('map-ont'),
                                  best_n=1)
    if args.reference is not None:
        aligner.add_ref_lens()
    valid_read_ids = None
    if args.read_ids_filename is not None:
        with open(args.read_ids_filename) as read_ids_fp:
            valid_read_ids = set(line.strip() for line in read_ids_fp)
    aggregate.aggregate_stats(
        args.outputs, args.output_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        snps.HAPLIOD_MODE if args.haploid else snps.DIPLOID_MODE, mod_names,
        mod_agg_info, args.write_mod_log_probs, args.mod_output_formats,
        args.suppress_progress, aligner.ref_names_and_lens, valid_read_ids,
        args.output_suffix)

    # note reference is required in order to annotate contigs for VCF writing
    if mh.SNP_NAME in args.outputs and args.reference is not None:
        logger.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.output_directory, mh.SNP_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        snps.sort_variants(variant_fn, sort_variant_fn)
        logger.info('Indexing output variant file')
        index_var_fn = snps.index_variants(sort_variant_fn)

    return
Example #6
def main():
    args = get_parser().parse_args()

    sys.stderr.write('Loading model.\n')
    model_info = backends.ModelInfo(args.taiyaki_model_filename, args.devices,
                                    args.processes, args.chunk_size,
                                    args.chunk_overlap,
                                    args.max_concurrent_chunks)
    sys.stderr.write('Loading reference.\n')
    aligner = mapping.alignerPlus(str(args.reference),
                                  preset=str('map-ont'),
                                  best_n=1)

    process_all_reads(args.fast5s_dir, args.num_reads, args.read_ids_filename,
                      model_info, aligner, args.processes, args.output,
                      args.suppress_progress,
                      args.compute_false_reference_scores)

    return
Example #7
def _main(args):
    logging.init_logger()
    # set args that are not relevant to alphabet
    args.devices = None

    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.output_directory = args.guppy_logs_output_directory

    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    try:
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            'Guppy logs output directory exists. Potentially overwriting ' +
            'guppy logs.')
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        if model_info.is_cat_mod:
            can_bs = [
                can_b for mod_b, _ in model_info.mod_long_names
                for can_b, can_mod_bs in model_info.can_base_mods.items()
                if mod_b in can_mod_bs
            ]
            LOGGER.info(
                ('Model contains canonical alphabet {} and modified ' +
                 'bases {}.').format(
                     model_info.can_alphabet,
                     '; '.join('{}={} (alt to {})'.format(mod_b, mln, can_b)
                               for (mod_b, mln), can_b in zip(
                                   model_info.mod_long_names, can_bs))))
        else:
            LOGGER.info('Model contains canonical alphabet {}.'.format(
                model_info.can_alphabet))
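The nested comprehension above pairs each modified base with its canonical base. A toy run with invented stand-ins for the model attributes:

# invented stand-ins for model_info attributes
mod_long_names = [("m", "5mC"), ("a", "6mA")]
can_base_mods = {"C": ("m",), "A": ("a",)}  # canonical base -> mod codes
can_bs = [
    can_b for mod_b, _ in mod_long_names
    for can_b, can_mod_bs in can_base_mods.items()
    if mod_b in can_mod_bs
]
# can_bs == ["C", "A"]; zip(mod_long_names, can_bs) then yields
# (("m", "5mC"), "C") and (("a", "6mA"), "A") for the log message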
Example #8
def _main(args):
    logging.init_logger(args.log_directory)
    # set args that are not relevant to alphabet
    args.devices = None

    # set guppy args
    args.guppy_server_port = None
    args.guppy_timeout = mh.DEFAULT_GUPPY_TIMEOUT
    args.output_directory = args.guppy_logs_output_directory
    try:
        # create the guppy logs directory only after output_directory is set
        mh.mkdir(args.output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            'Guppy logs output directory exists. Potentially overwriting ' +
            'guppy logs.')

    # set taiyaki args
    args.chunk_size = 1000
    args.chunk_overlap = 100
    args.max_concurrent_chunks = 200
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, 1) as model_info:
        LOGGER.info(model_info.get_alphabet_str())
Example #9
def _main():
    args = get_parser().parse_args()

    mkdir(args.output_directory, args.overwrite)
    logging.init_logger(args.output_directory)
    logger = logging.get_logger()
    logger.debug('Command: """' + ' '.join(sys.argv) + '"""')

    if _DO_PROFILE:
        args = profile_validation(args)

    args, pr_ref_filts = parse_pr_ref_output(args)
    tai_model_fn = mh.get_model_fn(args.taiyaki_model_filename)
    model_info = backends.ModelInfo(tai_model_fn, args.devices, args.processes,
                                    args.chunk_size, args.chunk_overlap,
                                    args.max_concurrent_chunks)
    args, mods_info = mods_validation(args, model_info)
    aligner = aligner_validation(args)
    args, snps_data = snps_validation(args, model_info.is_cat_mod,
                                      model_info.output_size, aligner)

    process_all_reads(args.fast5s_dir, not args.not_recursive, args.num_reads,
                      args.read_ids_filename, model_info, args.outputs,
                      args.output_directory, args.basecalls_format, aligner,
                      snps_data, args.processes, args.verbose_read_progress,
                      args.suppress_progress, mods_info, args.database_safety,
                      args.edge_buffer, pr_ref_filts)

    if mh.MAP_NAME in args.outputs:
        logger.info('Spawning process to sort mappings')
        map_p = post_process_mapping(args.output_directory, aligner.out_fmt,
                                     aligner.ref_fn)

    if mh.WHATSHAP_MAP_NAME in args.outputs:
        logger.info('Spawning process to sort whatshap mappings')
        whatshap_sort_fn, whatshap_p = post_process_whatshap(
            args.output_directory, aligner.out_fmt, aligner.ref_fn)

    if mh.SNP_NAME in args.outputs or mh.MOD_NAME in args.outputs:
        post_process_aggregate(
            mods_info, args.outputs, args.mod_binary_threshold,
            args.output_directory, args.processes, args.write_vcf_log_probs,
            args.heterozygous_factors, snps_data, args.write_mod_log_probs,
            args.suppress_progress, aligner.ref_names_and_lens)

    if mh.SNP_NAME in args.outputs:
        logger.info('Sorting output variant file')
        variant_fn = mh.get_megalodon_fn(args.output_directory, mh.SNP_NAME)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        snps.sort_variants(variant_fn, sort_variant_fn)
        logger.info('Indexing output variant file')
        index_variant_fn = snps.index_variants(sort_variant_fn)

    # note whatshap phasing reuses the sorted and indexed variant file
    # produced in the mh.SNP_NAME block above
    if mh.WHATSHAP_MAP_NAME in args.outputs:
        if whatshap_p.is_alive():
            logger.info('Waiting for whatshap mappings sort')
            while whatshap_p.is_alive():
                sleep(0.1)
        logger.info(
            snps.get_whatshap_command(index_variant_fn, whatshap_sort_fn,
                                      mh.add_fn_suffix(variant_fn, 'phased')))

    if mh.MAP_NAME in args.outputs:
        if map_p.is_alive():
            logger.info('Waiting for mappings sort')
            while map_p.is_alive():
                sleep(0.1)

    return
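
One possible simplification for the two wait loops above, assuming map_p and whatshap_p are multiprocessing.Process handles returned by the post-processing helpers (the is_alive() calls in the example suggest this, but it is an assumption):

# blocking equivalent of the is_alive()/sleep(0.1) polling loops above
if mh.MAP_NAME in args.outputs:
    logger.info('Waiting for mappings sort')
    map_p.join()  # join() blocks until the child process exits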