def process_all_reads(fast5s_dir, num_reads, read_ids_fn, model_info, aligner,
                      num_ps, out_fn, suppress_progress, do_false_ref):
    sys.stderr.write('Preparing workers and calling reads.\n')
    # read filename queue filler
    fast5_q = mp.Queue()
    num_reads_conn, getter_num_reads_conn = mp.Pipe()
    files_p = mp.Process(
        target=megalodon._fill_files_queue,
        args=(fast5_q, fast5s_dir, num_reads, read_ids_fn, True, num_ps,
              num_reads_conn),
        daemon=True)
    files_p.start()

    # getter process collects per-read SNP calls and writes them to out_fn
    snp_calls_q, snp_calls_p, main_sc_conn = mh.create_getter_q(
        _get_snp_calls, (out_fn, getter_num_reads_conn, suppress_progress))

    # one read processing worker per device
    proc_reads_ps, map_conns = [], []
    for device in model_info.process_devices:
        if aligner is None:
            map_conn, caller_conn = None, None
        else:
            map_conn, caller_conn = mp.Pipe()
        map_conns.append(map_conn)
        p = mp.Process(
            target=_process_reads_worker,
            args=(fast5_q, snp_calls_q, caller_conn, model_info, device,
                  do_false_ref))
        p.daemon = True
        p.start()
        proc_reads_ps.append(p)
    sleep(0.1)

    # map reads in threads so all mappers share a single aligner index
    map_read_ts = []
    for map_conn in map_conns:
        t = threading.Thread(
            target=mapping._map_read_worker, args=(aligner, map_conn, None))
        t.daemon = True
        t.start()
        map_read_ts.append(t)

    files_p.join()
    for proc_reads_p in proc_reads_ps:
        proc_reads_p.join()
    if map_read_ts is not None:
        for map_t in map_read_ts:
            map_t.join()
    # all producers have exited; signal the getter to flush and return
    if snp_calls_p.is_alive():
        main_sc_conn.send(True)
    snp_calls_p.join()

    return
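
# --- Illustrative sketch (not megalodon code) -------------------------------
# Every function in this module leans on the same "getter queue" pattern via
# mh.create_getter_q: workers write results to a queue, a single daemon
# getter process drains it, and the main process signals completion by
# sending True over a Pipe once all workers have joined. The stdlib-only
# sketch below shows the assumed shape of that helper; the sketch_* names are
# hypothetical, and mp.Queue.empty() is only approximate, which is acceptable
# here because the signal is sent strictly after all producers have exited.
import multiprocessing as mp
from queue import Empty


def _sketch_calls_getter(q, conn, out_fn):
    # drain worker results to out_fn until signaled AND the queue is empty
    workers_done = False
    with open(out_fn, 'w') as fp:
        while not (workers_done and q.empty()):
            if conn.poll(0.01):
                workers_done = conn.recv()
            try:
                fp.write(q.get(timeout=0.01))
            except Empty:
                continue


def sketch_create_getter_q(getter_func, args, max_size=10000):
    # bounded queue written by workers; the Pipe end returned to the caller
    # drives the shutdown handshake seen in the joins above
    q = mp.Queue(maxsize=max_size) if max_size is not None else mp.Queue()
    main_conn, getter_conn = mp.Pipe()
    p = mp.Process(
        target=getter_func, args=(q, getter_conn) + tuple(args), daemon=True)
    p.start()
    return q, p, main_conn
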
def aggregate_stats(outputs, out_dir, num_ps, write_vcf_lp, het_factors,
                    call_mode, mod_names, mod_agg_info, write_mod_lp,
                    mod_output_fmts, suppress_progress, ref_names_and_lens,
                    valid_read_ids=None, out_suffix=None):
    if mh.SNP_NAME in outputs and mh.MOD_NAME in outputs:
        num_ps = max(num_ps // 2, 1)

    logger = logging.get_logger('agg')
    num_snps, num_mods, snp_prog_q, mod_prog_q = (
        0, 0, queue.Queue(), queue.Queue())
    if mh.SNP_NAME in outputs:
        snps_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME)
        logger.info('Computing number of unique variants.')
        num_snps = snps.AggSnps(snps_db_fn).num_uniq()
        logger.info('Spawning variant aggregation processes.')
        # create process to collect snp stats from workers
        snp_stats_q, snp_stats_p, main_snp_stats_conn = mh.create_getter_q(
            _get_snp_stats_queue, (out_dir, ref_names_and_lens, out_suffix,
                                   write_vcf_lp))
        # create process to fill snp locs queue
        snp_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        snp_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(snp_filler_q, snps_db_fn, snps.AggSnps, num_ps),
            daemon=True)
        snp_filler_p.start()
        # create worker processes to aggregate snps
        snp_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_snps_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_snps_worker,
                args=(snp_filler_q, snp_stats_q, snp_prog_q, snps_db_fn,
                      write_vcf_lp, het_factors, call_mode, valid_read_ids),
                daemon=True)
            p.start()
            agg_snps_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        num_mods = mods.AggMods(mods_db_fn).num_uniq()
        logger.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=100000)
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(mod_filler_q, mods_db_fn, mods.AggMods, num_ps,
                  mod_fill_limit),
            daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_mods_worker,
                args=(mod_filler_q, mod_stats_q, mod_prog_q, mods_db_fn,
                      mod_agg_info, valid_read_ids, write_mod_lp),
                daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    logger.info('Aggregating {} SNPs and {} mod sites over reads.'.format(
        num_snps, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(
        target=_agg_prog_worker,
        args=(snp_prog_q, mod_prog_q, num_snps, num_mods, prog_conn,
              suppress_progress),
        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.SNP_NAME in outputs:
        snp_filler_p.join()
        for agg_snps_p in agg_snps_ps:
            agg_snps_p.join()
        # send to conn
        if snp_stats_p.is_alive():
            main_snp_stats_conn.send(True)
        snp_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
    prog_p.join()

    return
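
# --- Illustrative sketch (not megalodon code) -------------------------------
# aggregate_stats wires a three-stage pipeline per output type: a filler
# process enumerates unique database locations into a bounded queue, num_ps
# workers aggregate statistics per location, and a single stats getter
# serializes the writes. The bounded maxsize provides backpressure so the
# filler cannot run arbitrarily far ahead of the workers. _fill_locs_queue's
# internals are not shown in this module, so the sentinel-based shape below
# is an assumption; the sketch_* names are hypothetical.
def sketch_fill_locs_queue(locs_q, locs, num_workers):
    for loc in locs:
        locs_q.put(loc)      # blocks while the bounded queue is full
    for _ in range(num_workers):
        locs_q.put(None)     # one sentinel per worker so each exits once


def sketch_agg_worker(locs_q, stats_q, prog_q):
    while True:
        loc = locs_q.get()
        if loc is None:
            break
        # real per-location aggregation (database lookup + statistics)
        # would go here; stubbed out for the sketch
        stats_q.put('aggregated stats for {}'.format(loc))
        prog_q.put(1)        # one progress tick per completed location
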
def process_all_reads(
        fast5s_dir, recursive, num_reads, read_ids_fn, model_info, outputs,
        out_dir, bc_fmt, aligner, snps_data, num_ps, num_update_errors,
        suppress_progress, mods_info, db_safety, edge_buffer, pr_ref_filts):
    logger = logging.get_logger()
    logger.info('Preparing workers to process reads.')
    # read filename queue filler
    # Note no maxsize for this queue to compute total number of reads while
    # also not delaying read processing
    read_file_q = mp.Queue()
    num_reads_conn, getter_num_reads_conn = mp.Pipe()
    files_p = mp.Process(
        target=_fill_files_queue,
        args=(read_file_q, fast5s_dir, num_reads, read_ids_fn, recursive,
              num_ps, num_reads_conn),
        daemon=True)
    files_p.start()

    # progress and failed reads getter (no limit on failed reads queue in
    # case an error occurs there; don't halt the run)
    failed_reads_q, f_p, main_f_conn = mh.create_getter_q(
        _get_fail_queue,
        (getter_num_reads_conn, num_update_errors, suppress_progress),
        max_size=None)

    # start output type getters/writers
    (bc_q, bc_p, main_bc_conn, mo_q, mo_p, main_mo_conn,
     snps_q, snps_p, main_snps_conn, mods_q, mods_p, main_mods_conn) = [
         None, ] * 12
    if mh.BC_NAME in outputs or mh.BC_MODS_NAME in outputs:
        if mh.BC_NAME not in outputs:
            outputs.append(mh.BC_NAME)
        bc_q, bc_p, main_bc_conn = mh.create_getter_q(
            _get_bc_queue, (out_dir, bc_fmt, mods_info.do_output_mods,
                            mods_info.mod_long_names))
    if mh.MAP_NAME in outputs:
        do_output_pr_refs = (mh.PR_REF_NAME in outputs and
                             not mods_info.do_pr_ref_mods and
                             not snps_data.do_pr_ref_snps)
        mo_q, mo_p, main_mo_conn = mh.create_getter_q(
            mapping._get_map_queue,
            (out_dir, aligner.ref_names_and_lens, aligner.out_fmt,
             aligner.ref_fn, do_output_pr_refs, pr_ref_filts))
    if mh.PR_SNP_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and snps_data.do_pr_ref_snps) else None
        whatshap_map_fn = (
            mh.get_megalodon_fn(out_dir, mh.WHATSHAP_MAP_NAME) + '.' +
            aligner.out_fmt) if mh.WHATSHAP_MAP_NAME in outputs else None
        snps_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_SNP_TXT_NAME)
                       if snps_data.write_snps_txt else None)
        snps_q, snps_p, main_snps_conn = mh.create_getter_q(
            snps._get_snps_queue,
            (mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME), snps_txt_fn,
             db_safety, pr_refs_fn, pr_ref_filts, whatshap_map_fn,
             aligner.ref_names_and_lens, aligner.ref_fn))
    if mh.PR_MOD_NAME in outputs:
        pr_refs_fn = mh.get_megalodon_fn(out_dir, mh.PR_REF_NAME) if (
            mh.PR_REF_NAME in outputs and mods_info.do_pr_ref_mods) else None
        mods_txt_fn = (mh.get_megalodon_fn(out_dir, mh.PR_MOD_TXT_NAME)
                       if mods_info.write_mods_txt else None)
        mods_q, mods_p, main_mods_conn = mh.create_getter_q(
            mods._get_mods_queue,
            (mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME), mods_txt_fn,
             db_safety, pr_refs_fn, pr_ref_filts))

    proc_reads_ps, map_conns = [], []
    for device in model_info.process_devices:
        if aligner is None:
            map_conn, caller_conn = None, None
        else:
            map_conn, caller_conn = mp.Pipe()
        map_conns.append(map_conn)
        p = mp.Process(
            target=_process_reads_worker,
            args=(read_file_q, bc_q, snps_q, failed_reads_q, mods_q,
                  caller_conn, model_info, snps_data, mods_info, edge_buffer,
                  device))
        p.daemon = True
        p.start()
        proc_reads_ps.append(p)
    sleep(0.1)

    # perform mapping in threads for mappy shared memory interface
    # open threads after all processes have started due to python
    # multiprocess combined with threading instability
    if aligner is None:
        map_read_ts = None
    else:
        map_read_ts = []
        for map_conn in map_conns:
            t = threading.Thread(
                target=mapping._map_read_worker,
                args=(aligner, map_conn, mo_q))
            t.daemon = True
            t.start()
            map_read_ts.append(t)

    try:
        files_p.join()
        for proc_reads_p in proc_reads_ps:
            proc_reads_p.join()
        if map_read_ts is not None:
            for map_t in map_read_ts:
                map_t.join()
        # comm to getter processes to return
        if f_p.is_alive():
            main_f_conn.send(True)
        f_p.join()
        for on, p, main_conn in (
                (mh.BC_NAME, bc_p, main_bc_conn),
                (mh.MAP_NAME, mo_p, main_mo_conn),
                (mh.PR_SNP_NAME, snps_p, main_snps_conn),
                (mh.PR_MOD_NAME, mods_p, main_mods_conn)):
            if on in outputs and p.is_alive():
                main_conn.send(True)
                if on == mh.PR_SNP_NAME:
                    logger.info(
                        'Waiting for snps database to complete indexing.')
                elif on == mh.PR_MOD_NAME:
                    logger.info(
                        'Waiting for mods database to complete indexing.')
                p.join()
    except KeyboardInterrupt:
        logger.error('Exiting due to keyboard interrupt.')
        sys.exit(1)

    return
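
# --- Illustrative sketch (not megalodon code) -------------------------------
# As the comment in process_all_reads notes, basecalling runs in separate
# processes while mapping runs in threads of the main process, connected by
# Pipes, so every mapper shares one in-memory mappy index (the index is
# large and is not sent across process boundaries; mappy releases the GIL
# during alignment, so the threads still map concurrently). A minimal shape
# of such a worker loop, assuming a mappy-like aligner; the sketch_* name is
# hypothetical.
def sketch_map_read_worker(aligner, map_conn):
    # serve alignment requests from one basecalling process until that
    # process closes its end of the duplex pipe
    while True:
        try:
            seq = map_conn.recv()
        except EOFError:
            break
        hit = next(aligner.map(seq), None)
        # send plain fields rather than the alignment object itself, since
        # pipe transport requires picklable values
        map_conn.send(
            None if hit is None else (hit.ctg, hit.r_st, hit.r_en,
                                      hit.strand))
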
def aggregate_stats(outputs, out_dir, num_ps, write_vcf_lp, het_factors,
                    call_mode, mod_agg_info, write_mod_lp, mod_output_fmts,
                    suppress_progress, valid_read_ids=None, out_suffix=None):
    if mh.VAR_NAME in outputs and mh.MOD_NAME in outputs:
        num_ps = max(num_ps // 2, 1)

    num_vars, num_mods, var_prog_q, mod_prog_q = (
        0, 0, queue.Queue(), queue.Queue())
    if mh.VAR_NAME in outputs:
        vars_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME)
        agg_vars = variants.AggVars(vars_db_fn, load_in_mem_indices=False)
        num_vars = agg_vars.num_uniq()
        ref_names_and_lens = agg_vars.vars_db.get_all_chrm_and_lens()
        agg_vars.close()
        LOGGER.info('Spawning variant aggregation processes.')
        # create process to collect var stats from workers
        var_stats_q, var_stats_p, main_var_stats_conn = mh.create_getter_q(
            _get_var_stats_queue, (out_dir, ref_names_and_lens, out_suffix,
                                   write_vcf_lp))
        # create process to fill variant locs queue
        var_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        var_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(var_filler_q, vars_db_fn, variants.AggVars, num_ps),
            daemon=True)
        var_filler_p.start()
        # create worker processes to aggregate variants
        var_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_vars_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_vars_worker,
                args=(var_filler_q, var_stats_q, var_prog_q, vars_db_fn,
                      write_vcf_lp, het_factors, call_mode, valid_read_ids),
                daemon=True)
            p.start()
            agg_vars_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        agg_mods = mods.AggMods(mods_db_fn, load_in_mem_indices=False)
        mod_long_names = agg_mods.get_mod_long_names()
        num_mods = agg_mods.num_uniq()
        ref_names_and_lens = agg_mods.mods_db.get_all_chrm_and_lens()
        agg_mods.close()
        LOGGER.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_long_names,
                                   ref_names_and_lens, out_suffix,
                                   write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(mod_filler_q, mods_db_fn, mods.AggMods, num_ps,
                  mod_fill_limit),
            daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_mods_worker,
                args=(mod_filler_q, mod_stats_q, mod_prog_q, mods_db_fn,
                      mod_agg_info, valid_read_ids, write_mod_lp),
                daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    LOGGER.info(
        ('Aggregating {} variants and {} modified base sites over reads.\n' +
         '\t\tNOTE: If this step is very slow, ensure the output directory ' +
         'is located on a fast read disk (e.g. local SSD). Aggregation can ' +
         'be restarted using the megalodon/scripts/run_aggregation.py ' +
         'script.').format(num_vars, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(
        target=_agg_prog_worker,
        args=(var_prog_q, mod_prog_q, num_vars, num_mods, prog_conn,
              suppress_progress),
        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.VAR_NAME in outputs:
        var_filler_p.join()
        for agg_vars_p in agg_vars_ps:
            agg_vars_p.join()
        # send to conn
        if var_stats_p.is_alive():
            main_var_stats_conn.send(True)
        var_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
    prog_p.join()

    return
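
# --- Illustrative sketch (not megalodon code) -------------------------------
# _agg_prog_worker (defined elsewhere) is assumed to merge per-site progress
# ticks from the variant and mod queues into one status line, using the same
# send(True)-over-a-Pipe shutdown convention as the getters above. A minimal
# sketch of that merge loop; the sketch_* name is hypothetical, and a real
# implementation would also drain any remaining ticks after the signal.
import sys
from queue import Empty


def sketch_agg_prog_worker(var_prog_q, mod_prog_q, num_vars, num_mods, conn):
    comp_vars = comp_mods = 0
    signaled = False
    while not signaled:
        if conn.poll(0.01):
            signaled = conn.recv()
        try:
            var_prog_q.get(timeout=0.01)
            comp_vars += 1
        except Empty:
            pass
        try:
            mod_prog_q.get(timeout=0.01)
            comp_mods += 1
        except Empty:
            pass
        sys.stderr.write('\r{}/{} variants, {}/{} mod sites'.format(
            comp_vars, num_vars, comp_mods, num_mods))
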