def _agg_mods_worker(pos_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Aggregate modified base statistics for positions pulled from a queue.

    Loops pulling position data from ``pos_q``, computes aggregated modified
    base statistics via ``mods.AggMods.compute_mod_stats`` and pushes each
    result onto ``mod_stats_q``. A ``None`` item on ``pos_q`` is the shutdown
    sentinel. One unit is pushed to ``mod_prog_q`` per position processed
    (including positions skipped for lacking valid reads).

    Args:
        pos_q: queue of position data items (``None`` terminates the worker)
        mod_stats_q: output queue for aggregated mod site results
        mod_prog_q: progress queue; receives ``1`` per processed position
        mods_db_fn: per-read modified base statistics database filename
        mod_agg_info: aggregation method configuration passed to AggMods
        valid_read_ids: optional read id filter forwarded to
            ``compute_mod_stats``
        write_mod_lp: whether log probabilities are written (AggMods option)
    """
    # thin wrappers retained so queue operations are attributed in profiles
    def get_pos_data():
        # blocking get with a short timeout replaces the previous
        # non-blocking get + sleep(0.0001) busy-poll, which burned CPU
        return pos_q.get(block=True, timeout=0.01)

    def put_mod_site(mod_site):
        mod_stats_q.put(mod_site)

    agg_mods = mods.AggMods(mods_db_fn, mod_agg_info, write_mod_lp)
    while True:
        try:
            pos_data = get_pos_data()
        except queue.Empty:
            continue
        if pos_data is None:
            # sentinel from the filler process: no more positions
            break
        try:
            mod_site = agg_mods.compute_mod_stats(
                pos_data, valid_read_ids=valid_read_ids)
            put_mod_site(mod_site)
        except mh.MegaError:
            # no valid reads cover location
            pass
        mod_prog_q.put(1)
def _agg_mods_worker(pos_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Worker loop: aggregate modified base stats for queued positions.

    Pulls position data from ``pos_q`` until a ``None`` sentinel arrives,
    computes aggregated statistics with ``mods.AggMods`` and forwards each
    result to ``mod_stats_q``, reporting one unit of progress per position
    on ``mod_prog_q``.
    """
    # kept as separate local functions so queue time shows up in profiles
    def fetch_position():
        return pos_q.get(block=True, timeout=0.01)

    def emit_site(site):
        mod_stats_q.put(site)

    # only load the in-memory uuid index when a read id filter is supplied
    agg_mods = mods.AggMods(
        mods_db_fn, mod_agg_info, write_mod_lp,
        load_uuid_index_in_memory=valid_read_ids is not None)

    while True:
        try:
            pos_data = fetch_position()
        except queue.Empty:
            continue
        if pos_data is None:
            # filler sent the shutdown sentinel
            break
        try:
            emit_site(agg_mods.compute_mod_stats(
                pos_data, valid_read_ids=valid_read_ids))
        except mh.MegaError:
            # no valid reads cover location
            pass
        mod_prog_q.put(1)
def _agg_mods_worker(locs_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Aggregate modified base statistics for locations pulled from a queue.

    Pulls location data from ``locs_q`` until a ``None`` sentinel arrives,
    computes aggregated statistics with ``mods.AggMods`` and forwards each
    result to ``mod_stats_q``, reporting one unit of progress per location
    on ``mod_prog_q``.

    Removed from the previous revision: the unused ``get_pos_id`` /
    ``get_loc_data_from_id`` helpers (they referenced ``locs_iter``, whose
    defining statement was commented out, so calling them would raise
    ``NameError``) and the non-blocking get + sleep busy-poll.
    """
    # functions kept separate for profiling purposes
    def get_loc_data():
        # blocking get with a short timeout avoids a busy-wait poll
        return locs_q.get(block=True, timeout=0.01)

    def put_mod_site(mod_site):
        mod_stats_q.put(mod_site)

    agg_mods = mods.AggMods(mods_db_fn, mod_agg_info, write_mod_lp)
    while True:
        try:
            loc_data = get_loc_data()
        except queue.Empty:
            continue
        if loc_data is None:
            # sentinel from the filler process: no more locations
            break
        try:
            mod_site = agg_mods.compute_mod_stats(
                loc_data, valid_read_ids=valid_read_ids)
            put_mod_site(mod_site)
        except mh.MegaError:
            # no valid reads cover location
            pass
        mod_prog_q.put(1)
def _agg_mods_worker(
    pos_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
    valid_read_dbids, write_mod_lp,
):
    """Worker loop: aggregate modified base stats for batched positions.

    Pulls batches of position data from ``pos_q`` until a ``None`` sentinel
    arrives. For each batch, every position is aggregated via
    ``mods.AggMods.compute_mod_stats``; positions with no valid covering
    reads are skipped. Progress is reported as the total number of per-read
    stats in the batch (``len(pos_data[1])`` summed), then the list of
    computed sites is forwarded to ``mod_stats_q``.
    """
    # kept as tiny named functions so queue wait time is visible in profiles
    def next_batch():
        return pos_q.get(block=True, timeout=0.01)

    def emit_sites(sites):
        mod_stats_q.put(sites)

    # only load the in-memory uuid index when a read filter is supplied
    agg_mods = mods.AggMods(
        mods_db_fn,
        mod_agg_info,
        write_mod_lp,
        load_uuid_index_in_memory=valid_read_dbids is not None,
    )

    while True:
        try:
            batch = next_batch()
        except queue.Empty:
            continue
        if batch is None:
            # filler sent the shutdown sentinel
            break
        computed_sites = []
        num_stats = 0
        for pos_data in batch:
            try:
                computed_sites.append(agg_mods.compute_mod_stats(
                    pos_data, valid_read_dbids=valid_read_dbids))
            except mh.MegaError:
                # no valid reads cover location
                pass
            num_stats += len(pos_data[1])
        # report progress before emitting, matching the original ordering
        mod_prog_q.put(num_stats)
        emit_sites(computed_sites)
def _agg_mods_worker(locs_q, mod_stats_q, mod_prog_q, mods_db_fn, mod_agg_info,
                     valid_read_ids, write_mod_lp):
    """Aggregate modified base statistics for locations pulled from a queue.

    Pulls locations from ``locs_q`` until a ``None`` sentinel arrives,
    computes aggregated statistics with ``mods.AggMods`` and pushes each
    result onto ``mod_stats_q``, reporting one unit of progress per location
    on ``mod_prog_q``.

    Fix: the previous non-blocking ``get`` + ``sleep(0.1)`` poll added up to
    100 ms of latency per empty poll and needless wakeups; a blocking get
    with a short timeout has the same semantics without the delay.
    """
    agg_mods = mods.AggMods(mods_db_fn, mod_agg_info, write_mod_lp)
    while True:
        try:
            mod_loc = locs_q.get(block=True, timeout=0.01)
        except queue.Empty:
            continue
        if mod_loc is None:
            # sentinel from the filler process: no more locations
            break
        try:
            mod_site = agg_mods.compute_mod_stats(
                mod_loc, valid_read_ids=valid_read_ids)
            mod_stats_q.put(mod_site)
        except mh.MegaError:
            # no valid reads cover location
            pass
        mod_prog_q.put(1)
def aggregate_stats(outputs, out_dir, num_ps, write_vcf_lp, het_factors,
                    call_mode, mod_names, mod_agg_info, write_mod_lp,
                    mod_output_fmts, suppress_progress, ref_names_and_lens,
                    valid_read_ids=None, out_suffix=None):
    """Run multiprocess aggregation of per-read SNP and/or mod statistics.

    For each requested output (``mh.SNP_NAME`` / ``mh.MOD_NAME``) this
    spawns: one filler process that streams locations from the per-read
    database into a queue, ``num_ps`` worker processes that aggregate
    statistics per location, and one getter process that collects results.
    A shared progress process reports counts from both pipelines.

    Shutdown protocol: fillers finish first, workers are joined, then each
    stats getter is signalled over its pipe (``send(True)``) and joined.

    Args:
        outputs: collection of requested output names
        out_dir: megalodon output directory containing per-read databases
        num_ps: worker processes per pipeline (halved when both run)
        write_vcf_lp / write_mod_lp: include log probs in outputs
        het_factors, call_mode: variant aggregation parameters
        mod_names, mod_agg_info, mod_output_fmts: mod aggregation parameters
        suppress_progress: disable the progress display
        ref_names_and_lens: reference contig names and lengths for headers
        valid_read_ids: optional read id filter applied by workers
        out_suffix: optional suffix for output filenames
    """
    if mh.SNP_NAME in outputs and mh.MOD_NAME in outputs:
        # both pipelines run concurrently; split the worker budget
        num_ps = max(num_ps // 2, 1)
    logger = logging.get_logger('agg')
    # thread queues are placeholders so the progress worker always gets a
    # queue object even when one pipeline is disabled
    num_snps, num_mods, snp_prog_q, mod_prog_q = (
        0, 0, queue.Queue(), queue.Queue())
    if mh.SNP_NAME in outputs:
        snps_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_SNP_NAME)
        logger.info('Computing number of unique variants.')
        num_snps = snps.AggSnps(snps_db_fn).num_uniq()
        logger.info('Spawning variant aggregation processes.')
        # create process to collect snp stats from workers
        snp_stats_q, snp_stats_p, main_snp_stats_conn = mh.create_getter_q(
            _get_snp_stats_queue, (out_dir, ref_names_and_lens, out_suffix,
                                   write_vcf_lp))
        # create process to fill snp locs queue
        snp_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        snp_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(snp_filler_q, snps_db_fn, snps.AggSnps, num_ps),
            daemon=True)
        snp_filler_p.start()
        # create worker processes to aggregate snps
        snp_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_snps_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_snps_worker,
                args=(snp_filler_q, snp_stats_q, snp_prog_q, snps_db_fn,
                      write_vcf_lp, het_factors, call_mode, valid_read_ids),
                daemon=True)
            p.start()
            agg_snps_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        num_mods = mods.AggMods(mods_db_fn).num_uniq()
        logger.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        # NOTE(review): hard-coded 100000 here differs from
        # mh._MAX_QUEUE_SIZE used elsewhere — presumably intentional sizing;
        # confirm
        mod_filler_q = mp.Queue(maxsize=100000)
        # optionally cap filled locations when profiling
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(mod_filler_q, mods_db_fn, mods.AggMods, num_ps,
                  mod_fill_limit),
            daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_mods_worker,
                args=(mod_filler_q, mod_stats_q, mod_prog_q, mods_db_fn,
                      mod_agg_info, valid_read_ids, write_mod_lp),
                daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    logger.info('Aggregating {} SNPs and {} mod sites over reads.'.format(
        num_snps, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(
        target=_agg_prog_worker,
        args=(snp_prog_q, mod_prog_q, num_snps, num_mods, prog_conn,
              suppress_progress),
        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.SNP_NAME in outputs:
        snp_filler_p.join()
        for agg_snps_p in agg_snps_ps:
            agg_snps_p.join()
        # send to conn
        if snp_stats_p.is_alive():
            main_snp_stats_conn.send(True)
        snp_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
    prog_p.join()
    return
def aggregate_stats(outputs, out_dir, num_ps, write_vcf_lp, het_factors,
                    call_mode, mod_agg_info, write_mod_lp, mod_output_fmts,
                    suppress_progress, valid_read_ids=None, out_suffix=None,
                    batch_size=mh.DEFAULT_AGG_BATCH_SIZE):
    """Run multiprocess aggregation of per-read variant and/or mod stats.

    For each requested output (``mh.VAR_NAME`` / ``mh.MOD_NAME``) this
    spawns: one filler process streaming batched locations from the
    per-read database, ``num_ps`` aggregation workers, and one getter
    process collecting results. A shared progress process reports counts
    from both pipelines.

    Shutdown protocol: fillers are joined first, then workers, then each
    stats getter is signalled over its pipe (``send(True)``) and joined.

    Args:
        outputs: collection of requested output names
        out_dir: megalodon output directory containing per-read databases
        num_ps: worker processes per pipeline (halved when both run)
        write_vcf_lp / write_mod_lp: include log probs in outputs
        het_factors, call_mode: variant aggregation parameters
        mod_agg_info, mod_output_fmts: mod aggregation parameters
        suppress_progress: disable the progress display
        valid_read_ids: optional read id filter (converted to database ids
            for the mods pipeline)
        out_suffix: optional suffix for output filenames
        batch_size: locations per batch placed on the filler queues
    """
    if mh.VAR_NAME in outputs and mh.MOD_NAME in outputs:
        # both pipelines run concurrently; split the worker budget
        num_ps = max(num_ps // 2, 1)
    # thread queues are placeholders so the progress worker always gets a
    # queue object even when one pipeline is disabled
    num_vars, num_mods, var_prog_q, mod_prog_q = (
        0, 0, queue.Queue(), queue.Queue())
    if mh.VAR_NAME in outputs:
        vars_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME)
        agg_vars = variants.AggVars(vars_db_fn, no_indices_in_mem=True)
        num_vars = agg_vars.num_uniq()
        # contig names/lengths come from the variants DB itself here
        ref_names_and_lens = agg_vars.vars_db.get_all_chrm_and_lens()
        agg_vars.close()
        LOGGER.info('Spawning variant aggregation processes')
        # create process to collect var stats from workers
        var_stats_q, var_stats_p, m_var_stats_conn = mega_mp.create_getter_qpc(
            _get_var_stats_queue, (out_dir, ref_names_and_lens, out_suffix,
                                   write_vcf_lp))
        # create process to fill variant locs queue
        var_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        var_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(var_filler_q, vars_db_fn, variants.AggVars, num_ps,
                  batch_size),
            daemon=True)
        var_filler_p.start()
        # create worker processes to aggregate variants
        var_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_vars_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_vars_worker,
                args=(var_filler_q, var_stats_q, var_prog_q, vars_db_fn,
                      write_vcf_lp, het_factors, call_mode, valid_read_ids),
                daemon=True)
            p.start()
            agg_vars_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        # translate read uuids to database ids once up front so workers can
        # filter by integer id
        valid_read_dbids = None
        if valid_read_ids is not None:
            mods_db = mods.ModsDb(mods_db_fn, in_mem_uuid_to_dbid=True)
            valid_read_dbids = set()
            for read_id in valid_read_ids:
                valid_read_dbids.add(mods_db.get_read_dbid(read_id))
        agg_mods = mods.AggMods(mods_db_fn)
        mod_long_names = agg_mods.get_mod_long_names()
        num_mods = agg_mods.num_uniq()
        ref_names_and_lens = agg_mods.mods_db.get_all_chrm_and_lens()
        agg_mods.close()
        LOGGER.info('Spawning modified base aggregation processes')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, m_mod_stats_conn = mega_mp.create_getter_qpc(
            _get_mod_stats_queue, (out_dir, mod_long_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        # optionally cap filled locations when profiling
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(mod_filler_q, mods_db_fn, mods.AggMods, num_ps, batch_size,
                  mod_fill_limit),
            daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_mods_worker,
                args=(mod_filler_q, mod_stats_q, mod_prog_q, mods_db_fn,
                      mod_agg_info, valid_read_dbids, write_mod_lp),
                daemon=True)
            p.start()
            agg_mods_ps.append(p)

    if num_vars == 0 and num_mods == 0:
        # NOTE(review): this early return skips the join/shutdown sequence
        # below even though worker processes may already have been spawned
        # (they are daemons) — confirm this is the intended behavior
        LOGGER.warning('No per-read variants or modified base statistics ' +
                       'found for aggregation.')
        return

    if num_vars == 0:
        LOGGER.info('Aggregating {} per-read modified base statistics'.format(
            num_mods))
    elif num_mods == 0:
        LOGGER.info('Aggregating {} variants'.format(num_vars))
    else:
        LOGGER.info(('Aggregating {} variants and {} per-read modified base ' +
                     'statistics').format(num_vars, num_mods))
    LOGGER.info(
        'NOTE: If this step is very slow, ensure the output directory is ' +
        'located on a fast read disk (e.g. local SSD). Aggregation can be ' +
        'restarted using the `megalodon_extras aggregate run` command')

    # create progress process
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(
        target=_agg_prog_worker,
        args=(var_prog_q, mod_prog_q, num_vars, num_mods, prog_conn,
              suppress_progress),
        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.VAR_NAME in outputs:
        var_filler_p.join()
        for agg_vars_p in agg_vars_ps:
            agg_vars_p.join()
        # send to conn
        if var_stats_p.is_alive():
            m_var_stats_conn.send(True)
        var_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            m_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
    prog_p.join()
def aggregate_stats(outputs, out_dir, num_ps, write_vcf_lp, het_factors,
                    call_mode, mod_agg_info, write_mod_lp, mod_output_fmts,
                    suppress_progress, valid_read_ids=None, out_suffix=None):
    """Run multiprocess aggregation of per-read variant and/or mod stats.

    For each requested output (``mh.VAR_NAME`` / ``mh.MOD_NAME``) this
    spawns: one filler process streaming locations from the per-read
    database into a queue, ``num_ps`` aggregation workers, and one getter
    process collecting results. A shared progress process reports counts
    from both pipelines.

    Shutdown protocol: fillers are joined first, then workers, then each
    stats getter is signalled over its pipe (``send(True)``) and joined.

    Args:
        outputs: collection of requested output names
        out_dir: megalodon output directory containing per-read databases
        num_ps: worker processes per pipeline (halved when both run)
        write_vcf_lp / write_mod_lp: include log probs in outputs
        het_factors, call_mode: variant aggregation parameters
        mod_agg_info, mod_output_fmts: mod aggregation parameters
        suppress_progress: disable the progress display
        valid_read_ids: optional read id filter applied by workers
        out_suffix: optional suffix for output filenames
    """
    if mh.VAR_NAME in outputs and mh.MOD_NAME in outputs:
        # both pipelines run concurrently; split the worker budget
        num_ps = max(num_ps // 2, 1)
    # thread queues are placeholders so the progress worker always gets a
    # queue object even when one pipeline is disabled
    num_vars, num_mods, var_prog_q, mod_prog_q = (
        0, 0, queue.Queue(), queue.Queue())
    if mh.VAR_NAME in outputs:
        vars_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_VAR_NAME)
        agg_vars = variants.AggVars(vars_db_fn, load_in_mem_indices=False)
        num_vars = agg_vars.num_uniq()
        # contig names/lengths come from the variants DB itself here
        ref_names_and_lens = agg_vars.vars_db.get_all_chrm_and_lens()
        agg_vars.close()
        LOGGER.info('Spawning variant aggregation processes.')
        # create process to collect var stats from workers
        var_stats_q, var_stats_p, main_var_stats_conn = mh.create_getter_q(
            _get_var_stats_queue, (out_dir, ref_names_and_lens, out_suffix,
                                   write_vcf_lp))
        # create process to fill variant locs queue
        var_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        var_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(var_filler_q, vars_db_fn, variants.AggVars, num_ps),
            daemon=True)
        var_filler_p.start()
        # create worker processes to aggregate variants
        var_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_vars_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_vars_worker,
                args=(var_filler_q, var_stats_q, var_prog_q, vars_db_fn,
                      write_vcf_lp, het_factors, call_mode, valid_read_ids),
                daemon=True)
            p.start()
            agg_vars_ps.append(p)

    if mh.MOD_NAME in outputs:
        mods_db_fn = mh.get_megalodon_fn(out_dir, mh.PR_MOD_NAME)
        agg_mods = mods.AggMods(mods_db_fn, load_in_mem_indices=False)
        mod_long_names = agg_mods.get_mod_long_names()
        num_mods = agg_mods.num_uniq()
        ref_names_and_lens = agg_mods.mods_db.get_all_chrm_and_lens()
        agg_mods.close()
        LOGGER.info('Spawning modified base aggregation processes.')
        # create process to collect mods stats from workers
        mod_stats_q, mod_stats_p, main_mod_stats_conn = mh.create_getter_q(
            _get_mod_stats_queue, (out_dir, mod_long_names, ref_names_and_lens,
                                   out_suffix, write_mod_lp, mod_output_fmts))
        # create process to fill mod locs queue
        mod_filler_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        # optionally cap filled locations when profiling
        mod_fill_limit = _N_MOD_PROF if _DO_PROF else None
        mod_filler_p = mp.Process(
            target=_fill_locs_queue,
            args=(mod_filler_q, mods_db_fn, mods.AggMods, num_ps,
                  mod_fill_limit),
            daemon=True)
        mod_filler_p.start()
        # create worker processes to aggregate mods
        mod_prog_q = mp.Queue(maxsize=mh._MAX_QUEUE_SIZE)
        agg_mods_ps = []
        for _ in range(num_ps):
            p = mp.Process(
                target=_agg_mods_worker,
                args=(mod_filler_q, mod_stats_q, mod_prog_q, mods_db_fn,
                      mod_agg_info, valid_read_ids, write_mod_lp),
                daemon=True)
            p.start()
            agg_mods_ps.append(p)

    # create progress process
    LOGGER.info(
        ('Aggregating {} variants and {} modified base sites over reads.\n' +
         '\t\tNOTE: If this step is very slow, ensure the output directory ' +
         'is located on a fast read disk (e.g. local SSD). Aggregation can ' +
         'be restarted using the megalodon/scripts/run_aggregation.py ' +
         'script.').format(num_vars, num_mods))
    main_prog_conn, prog_conn = mp.Pipe()
    prog_p = mp.Process(
        target=_agg_prog_worker,
        args=(var_prog_q, mod_prog_q, num_vars, num_mods, prog_conn,
              suppress_progress),
        daemon=True)
    prog_p.start()

    # join filler processes first
    if mh.VAR_NAME in outputs:
        var_filler_p.join()
        for agg_vars_p in agg_vars_ps:
            agg_vars_p.join()
        # send to conn
        if var_stats_p.is_alive():
            main_var_stats_conn.send(True)
        var_stats_p.join()
    if mh.MOD_NAME in outputs:
        for agg_mods_p in agg_mods_ps:
            agg_mods_p.join()
        if mod_stats_p.is_alive():
            main_mod_stats_conn.send(True)
        mod_stats_p.join()
    if prog_p.is_alive():
        main_prog_conn.send(True)
    prog_p.join()
    return