def process_all_reads(fast5s_dir, num_reads, read_ids_fn, model_info, aligner,
                      num_ps, out_fn, suppress_progress, do_false_ref):
    sys.stderr.write('Preparing workers and calling reads.\n')
    # read filename queue filler
    fast5_q = mp.Queue()
    num_reads_conn, getter_num_reads_conn = mp.Pipe()
    files_p = mp.Process(target=megalodon._fill_files_queue,
                         args=(fast5_q, fast5s_dir, num_reads, read_ids_fn,
                               True, num_ps, num_reads_conn),
                         daemon=True)
    files_p.start()

    snp_calls_q, snp_calls_p, main_sc_conn = mh.create_getter_q(
        _get_snp_calls, (out_fn, getter_num_reads_conn, suppress_progress))

    proc_reads_ps, map_conns = [], []
    for device in model_info.process_devices:
        if aligner is None:
            map_conn, caller_conn = None, None
        else:
            map_conn, caller_conn = mp.Pipe()
        map_conns.append(map_conn)
        p = mp.Process(target=_process_reads_worker,
                       args=(fast5_q, snp_calls_q, caller_conn, model_info,
                             device, do_false_ref))
        p.daemon = True
        p.start()
        proc_reads_ps.append(p)
    sleep(0.1)
    # perform mapping in threads (for the mappy shared memory interface),
    # and only when an aligner was provided
    if aligner is None:
        map_read_ts = None
    else:
        map_read_ts = []
        for map_conn in map_conns:
            t = threading.Thread(target=mapping._map_read_worker,
                                 args=(aligner, map_conn, None))
            t.daemon = True
            t.start()
            map_read_ts.append(t)

    files_p.join()
    for proc_reads_p in proc_reads_ps:
        proc_reads_p.join()
    if map_read_ts is not None:
        for map_t in map_read_ts:
            map_t.join()
    if snp_calls_p.is_alive():
        main_sc_conn.send(True)
        snp_calls_p.join()

    return
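
A note on the pattern above (an illustrative sketch, not code from megalodon
itself): judging from the call sites, mh.create_getter_q builds a results
queue, starts a daemon "getter" process that runs the supplied function on
(queue, conn, *args), and returns the queue, the process object, and the main
end of a control pipe, which the caller later uses as main_conn.send(True) to
signal completion. Under those assumptions, a minimal stand-in might be:

import multiprocessing as mp

def create_getter_q(getter_func, args, max_size=10000):
    # results queue shared by the worker processes and the getter process;
    # max_size=None requests an unbounded queue (as for the failed reads
    # queue in the examples below)
    q = mp.Queue(maxsize=max_size) if max_size is not None else mp.Queue()
    # control pipe: the main process sends True once all workers have joined
    main_conn, getter_conn = mp.Pipe()
    p = mp.Process(target=getter_func, args=(q, getter_conn) + tuple(args),
                   daemon=True)
    p.start()
    return q, p, main_conn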
Example #2
def aggregate_stats(outputs,
                    out_dir,
                    num_ps,
                    write_vcf_lp,
                    het_factors,
                    call_mode,
                    mod_names,
                    mod_agg_info,
                    write_mod_lp,
                    mod_output_fmts,
                    suppress_progress,
                    ref_names_and_lens,
                    valid_read_ids=None,
                    out_suffix=None):
    if mh.SNP_NAME in outputs and mh.MOD_NAME in outputs:
        num_ps = max(num_ps // 2, 1)

    logger = logging.get_logger('agg')
    num_snps, num_mods, snp_prog_q, mod_prog_q = (0, 0, queue.Queue(),
                                                  queue.Queue())
    if mh.SNP_NAME in outputs:
        snps_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME)
        logger.info('Computing number of unique variants.')
        num_snps = snps.AggSnps(snps_db_fn).num_uniq()
        logger.info('Spawning variant aggregation processes.')
        # create process to collect snp stats from workers
        snp_stats_q, snp_stats_p, main_snp_stats_conn = mh.create_getter_q(
            _get_snp_stats_queue,
            (out_dir, ref_names_and_lens, out_suffix, write_vcf_lp))
        # create process to fill snp locs queue
        snp_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        snp_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(snp_filler_q, snps_db_fn, snps.AggSnps,
                                        num_ps),
                                  daemon=True)
        snp_filler_p.start()
        # create worker processes to aggregate snps
        snp_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_snps_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_snps_worker,
                           args=(snp_filler_q, snp_stats_q, snp_prog_q,
                                 snps_db_fn, write_vcf_lp, het_factors,
                                 call_mode, valid_read_ids),
                           daemon=True)
            p.start()
            agg_snps_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        num_mods = mods.AggMods(mods_db_fn).num_uniq()
        logger.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(mod_filler_q, mods_db_fn, mods.AggMods,
                                        num_ps, mod_fill_limit),
                                  daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_mods_worker,
                           args=(mod_filler_q, mod_stats_q, mod_prog_q,
                                 mods_db_fn, mod_agg_info, valid_read_ids,
                                 write_mod_lp),
                           daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    logger.info('Aggregating {} SNPs and {} mod sites over reads.'.format(
        num_snps, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(target=_agg_prog_worker,
                        args=(snp_prog_q, mod_prog_q, num_snps, num_mods,
                              prog_conn, suppress_progress),
                        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.SNP_NAME in outputs:
        snp_filler_p.join()
        for agg_snps_p in agg_snps_ps:
            agg_snps_p.join()
        # signal the getter process to return
        if snp_stats_p.is_alive():
            main_snp_stats_conn.send(True)
        snp_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
        prog_p.join()

    return
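
The filler/worker shutdown here relies on a sentinel convention (an
illustrative sketch follows, not code from megalodon itself): _fill_locs_queue
is passed num_ps, which suggests it enqueues one sentinel per aggregation
worker after the real work items, so each worker exits exactly once and the
joins above complete in order (filler, then workers, then getter). With
hypothetical names, the shape of that pattern is:

import multiprocessing as mp

def fill_locs_queue(locs_q, items, num_ps):
    # enqueue the work items, then one sentinel per worker
    for item in items:
        locs_q.put(item)
    for _ in range(num_ps):
        locs_q.put(None)

def agg_worker(locs_q, stats_q):
    # consume work until this worker's sentinel arrives
    while True:
        loc = locs_q.get()
        if loc is None:
            break
        stats_q.put(loc * 2)  # stand-in for per-location aggregation

if __name__ == '__main__':
    num_ps = 2
    locs_q, stats_q = mp.Queue(), mp.Queue()
    filler = mp.Process(target=fill_locs_queue,
                        args=(locs_q, list(range(10)), num_ps), daemon=True)
    workers = [mp.Process(target=agg_worker, args=(locs_q, stats_q),
                          daemon=True) for _ in range(num_ps)]
    filler.start()
    for w in workers:
        w.start()
    filler.join()
    for w in workers:
        w.join()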
Example #3
def process_all_reads(fast5s_dir, recursive, num_reads, read_ids_fn,
                      model_info, outputs, out_dir, bc_fmt, aligner, snps_data,
                      num_ps, num_update_errors, suppress_progress, mods_info,
                      db_safety, edge_buffer, pr_ref_filts):
    logger = logging.get_logger()
    logger.info('Preparing workers to process reads.')
    # read filename queue filler
    # Note: this queue has no maxsize so the total number of reads can be
    # computed without delaying read processing
    read_file_q = mp.Queue()
    num_reads_conn, getter_num_reads_conn = mp.Pipe()
    files_p = mp.Process(target=_fill_files_queue,
                         args=(read_file_q, fast5s_dir, num_reads, read_ids_fn,
                               recursive, num_ps, num_reads_conn),
                         daemon=True)
    files_p.start()
    # progress and failed reads getter (no limit on the failed reads queue so
    # that an error there does not halt the run)
    failed_reads_q, f_p, main_f_conn = mh.create_getter_q(
        _get_fail_queue,
        (getter_num_reads_conn, num_update_errors, suppress_progress),
        max_size=None)

    # start output type getters/writers
    (bc_q, bc_p, main_bc_conn, mo_q, mo_p, main_mo_conn, snps_q, snps_p,
     main_snps_conn, mods_q, mods_p, main_mods_conn) = [None] * 12
    if mh.BC_NAME in outputs or mh.BC_MODS_NAME in outputs:
        if mh.BC_NAME not in outputs:
            outputs.append(mh.BC_NAME)
        bc_q, bc_p, main_bc_conn = mh.create_getter_q(
            _get_bc_queue, (out_dir, bc_fmt, mods_info.do_output_mods,
                            mods_info.mod_long_names))
    if mh.MAP_NAME in outputs:
        do_output_pr_refs = (mh.PR_REF_NAME in outputs
                             and not mods_info.do_pr_ref_mods
                             and not snps_data.do_pr_ref_snps)
        mo_q, mo_p, main_mo_conn = mh.create_getter_q(
            mapping._get_map_queue,
            (out_dir, aligner.ref_names_and_lens, aligner.out_fmt,
             aligner.ref_fn, do_output_pr_refs, pr_ref_filts))
    if mh.PR_SNP_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and snps_data.do_pr_ref_snps) else None
        whatshap_map_fn = (
            mh.get_megalodon_fn(out_dir, mh.WHATSHAP_MAP_NAME) + '.' +
            aligner.out_fmt) if mh.WHATSHAP_MAP_NAME in outputs else None
        snps_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_SNP_TXT_NAME)
                       if snps_data.write_snps_txt else None)
        snps_q, snps_p, main_snps_conn = mh.create_getter_q(
            snps._get_snps_queue,
            (mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME), snps_txt_fn,
             db_safety, pr_refs_fn, pr_ref_filts, whatshap_map_fn,
             aligner.ref_names_and_lens, aligner.ref_fn))
    if mh.PR_MOD_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and mods_info.do_pr_ref_mods) else None
        mods_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_MOD_TXT_NAME)
                       if mods_info.write_mods_txt else None)
        mods_q, mods_p, main_mods_conn = mh.create_getter_q(
            mods._get_mods_queue,
            (mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME), mods_txt_fn,
             db_safety, pr_refs_fn, pr_ref_filts))

    proc_reads_ps, map_conns = [], []
    for device in model_info.process_devices:
        if aligner is None:
            map_conn, caller_conn = None, None
        else:
            map_conn, caller_conn = mp.Pipe()
        map_conns.append(map_conn)
        p = mp.Process(target=_process_reads_worker,
                       args=(read_file_q, bc_q, snps_q, failed_reads_q, mods_q,
                             caller_conn, model_info, snps_data, mods_info,
                             edge_buffer, device))
        p.daemon = True
        p.start()
        proc_reads_ps.append(p)
    sleep(0.1)

    # perform mapping in threads for the mappy shared memory interface;
    # open threads only after all processes have started, since combining
    # python multiprocessing with threading can be unstable
    if aligner is None:
        map_read_ts = None
    else:
        map_read_ts = []
        for map_conn in map_conns:
            t = threading.Thread(target=mapping._map_read_worker,
                                 args=(aligner, map_conn, mo_q))
            t.daemon = True
            t.start()
            map_read_ts.append(t)

    try:
        files_p.join()
        for proc_reads_p in proc_reads_ps:
            proc_reads_p.join()
        if map_read_ts is not None:
            for map_t in map_read_ts:
                map_t.join()
        # signal getter processes to flush and return
        if f_p.is_alive():
            main_f_conn.send(True)
            f_p.join()
        for on, p, main_conn in ((mh.BC_NAME, bc_p, main_bc_conn),
                                 (mh.MAP_NAME, mo_p, main_mo_conn),
                                 (mh.PR_SNP_NAME, snps_p, main_snps_conn),
                                 (mh.PR_MOD_NAME, mods_p, main_mods_conn)):
            if on in outputs and p.is_alive():
                main_conn.send(True)
                if on == mh.PR_SNP_NAME:
                    logger.info(
                        'Waiting for snps database to complete indexing.')
                elif on == mh.PR_MOD_NAME:
                    logger.info(
                        'Waiting for mods database to complete indexing.')
                p.join()
    except KeyboardInterrupt:
        logger.error('Exiting due to keyboard interrupt.')
        sys.exit(1)

    return
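
The map_conn/caller_conn pipe pairs above exist so that each read-processing
process can hand sequences back to a mapping thread in the main process,
where all threads share the single in-memory mappy index. A self-contained
sketch of that process-to-thread round trip (hypothetical names, with a dict
standing in for the aligner; illustrative only, not megalodon code):

import multiprocessing as mp
import threading

def caller_worker(caller_conn):
    # stand-in for a basecalling process: send a read, wait for its mapping
    caller_conn.send('ACGT')
    print('mapped to:', caller_conn.recv())
    caller_conn.send(None)  # signal that this worker is done

def map_thread_worker(index, map_conn):
    # runs as a thread in the main process so it can use the shared index
    while True:
        seq = map_conn.recv()
        if seq is None:
            break
        map_conn.send(index.get(seq, 'unmapped'))

if __name__ == '__main__':
    index = {'ACGT': 'chr1:100'}  # stand-in for a shared mappy.Aligner
    map_conn, caller_conn = mp.Pipe()
    p = mp.Process(target=caller_worker, args=(caller_conn,), daemon=True)
    p.start()
    t = threading.Thread(target=map_thread_worker, args=(index, map_conn))
    t.daemon = True
    t.start()
    p.join()
    t.join()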
Example #4
def aggregate_stats(outputs,
                    out_dir,
                    num_ps,
                    write_vcf_lp,
                    het_factors,
                    call_mode,
                    mod_agg_info,
                    write_mod_lp,
                    mod_output_fmts,
                    suppress_progress,
                    valid_read_ids=None,
                    out_suffix=None):
    if mh.VAR_NAME in outputs and mh.MOD_NAME in outputs:
        num_ps = max(num_ps // 2, 1)

    num_vars, num_mods, var_prog_q, mod_prog_q = (0, 0, queue.Queue(),
                                                  queue.Queue())
    if mh.VAR_NAME in outputs:
        vars_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME)
        agg_vars = variants.AggVars(vars_db_fn, load_in_mem_indices=False)
        num_vars = agg_vars.num_uniq()
        ref_names_and_lens = agg_vars.vars_db.get_all_chrm_and_lens()
        agg_vars.close()
        LOGGER.info('Spawning variant aggregation processes.')
        # create process to collect var stats from workers
        var_stats_q, var_stats_p, main_var_stats_conn = mh.create_getter_q(
            _get_var_stats_queue,
            (out_dir, ref_names_and_lens, out_suffix, write_vcf_lp))
        # create process to fill variant locs queue
        var_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        var_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(var_filler_q, vars_db_fn,
                                        variants.AggVars, num_ps),
                                  daemon=True)
        var_filler_p.start()
        # create worker processes to aggregate variants
        var_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_vars_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_vars_worker,
                           args=(var_filler_q, var_stats_q, var_prog_q,
                                 vars_db_fn, write_vcf_lp, het_factors,
                                 call_mode, valid_read_ids),
                           daemon=True)
            p.start()
            agg_vars_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        agg_mods = mods.AggMods(mods_db_fn, load_in_mem_indices=False)
        mod_long_names = agg_mods.get_mod_long_names()
        num_mods = agg_mods.num_uniq()
        ref_names_and_lens = agg_mods.mods_db.get_all_chrm_and_lens()
        agg_mods.close()
        LOGGER.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_long_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(target=_fill_locs_queue,
                                  args=(mod_filler_q, mods_db_fn, mods.AggMods,
                                        num_ps, mod_fill_limit),
                                  daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(target=_agg_mods_worker,
                           args=(mod_filler_q, mod_stats_q, mod_prog_q,
                                 mods_db_fn, mod_agg_info, valid_read_ids,
                                 write_mod_lp),
                           daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    LOGGER.info(
        ('Aggregating {} variants and {} modified base sites over reads.\n' +
         '\t\tNOTE: If this step is very slow, ensure the output directory ' +
         'is located on a disk with fast read access (e.g. local SSD). ' +
         'Aggregation can be restarted using the ' +
         'megalodon/scripts/run_aggregation.py script.').format(
             num_vars, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(target=_agg_prog_worker,
                        args=(var_prog_q, mod_prog_q, num_vars, num_mods,
                              prog_conn, suppress_progress),
                        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.VAR_NAME in outputs:
        var_filler_p.join()
        for agg_vars_p in agg_vars_ps:
            agg_vars_p.join()
        # signal the getter process to return
        if var_stats_p.is_alive():
            main_var_stats_conn.send(True)
        var_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
        prog_p.join()

    return
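
_agg_prog_worker receives both progress queues plus a control pipe; a sketch
of a progress loop in that shape (hypothetical names, plain counters instead
of progress bars; illustrative only, not megalodon code):

import queue

def agg_prog_worker(var_prog_q, mod_prog_q, num_vars, num_mods, prog_conn):
    done_vars = done_mods = 0
    # count ticks from both queues until the main process signals completion
    while not prog_conn.poll():
        try:
            var_prog_q.get(timeout=0.01)
            done_vars += 1
        except queue.Empty:
            pass
        try:
            mod_prog_q.get(timeout=0.01)
            done_mods += 1
        except queue.Empty:
            pass
    print('aggregated {}/{} variants and {}/{} mod sites'.format(
        done_vars, num_vars, done_mods, num_mods))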