def main():
    args = get_parser().parse_args()

    megalodon.mkdir(args.output_megalodon_results_dir, False)
    out_vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir,
                            mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk)

    for mega_dir in args.megalodon_results_dirs:
        # full read only mode with no indices read into memory
        vars_db = variants.VarsDb(
            mh.get_megalodon_fn(mega_dir, mh.PR_VAR_NAME),
            read_only=True, chrm_index_in_memory=False,
            alt_index_in_memory=False, uuid_index_in_memory=False)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name,
             test_end, test_start, chrm, chrm_len) in vars_db.iter_data():
            chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_id = out_vars_db.get_loc_id_or_insert(
                chrm_id, test_start, test_end, pos, ref_seq, var_name)
            alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
            read_id = out_vars_db.get_read_id_or_insert(uuid)
            out_vars_db.insert_data(score, loc_id, alt_id, read_id)

    if out_vars_db.chrm_idx_in_mem:
        out_vars_db.create_chrm_index()
    if out_vars_db.loc_idx_in_mem:
        out_vars_db.create_loc_index()
    if out_vars_db.alt_idx_in_mem:
        out_vars_db.create_alt_index()
    out_vars_db.create_data_covering_index()
    out_vars_db.close()
def main():
    args = get_parser().parse_args()

    vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_NAME),
        uuid_strand_index_in_memory=True)
    vars_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_TXT_NAME)
        if args.out_filename is None else args.out_filename, 'w')
    vars_txt_fp.write('\t'.join(vars_db.text_field_names) + '\n')
    for (loc_id, loc_chrm, pos, ref_seq, var_name,
         has_context_base) in tqdm(vars_db.iter_locs(),
                                   total=vars_db.get_num_uniq_var_loc(),
                                   smoothing=0):
        pr_var_stats = vars_db.get_loc_stats(
            (loc_id, loc_chrm, pos, ref_seq, var_name))
        alt_type_stats = defaultdict(dict)
        for r_stats in pr_var_stats:
            alt_type_stats[r_stats.read_id][r_stats.alt_seq] = (
                r_stats.score, r_stats.chrm)
        var_out_text = ''
        for read_id, r_var_stats in alt_type_stats.items():
            uuid, strand = vars_db.get_uuid_strand(read_id)
            alt_lps = np.array(list(zip(*r_var_stats.values()))[0])
            with np.errstate(divide='ignore'):
                ref_lp = np.log1p(-np.exp(alt_lps).sum())
            var_out_text += '\n'.join(
                ('\t'.join('{}' for _ in vars_db.text_field_names)).format(
                    uuid, chrm, strand, pos, ref_lp, alt_lp, ref_seq,
                    alt_seq, var_name)
                for alt_seq, (alt_lp, chrm) in r_var_stats.items()) + '\n'
        vars_txt_fp.write(var_out_text)

    return
def _main(args):
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME))
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename, 'w')
    mods_txt_fp.write('\t'.join(mods_db.text_field_names) + '\n')
    for pos_id, pos_chrm, strand, pos in tqdm(
            mods_db.iter_pos(), total=mods_db.get_num_uniq_mod_pos(),
            smoothing=0):
        pr_mod_stats = mods_db.get_pos_stats(
            (pos_id, pos_chrm, strand, pos), return_uuids=True)
        mod_type_stats = defaultdict(dict)
        for r_stats in pr_mod_stats:
            mod_type_stats[r_stats.read_id][r_stats.mod_base] = (
                r_stats.score, r_stats.raw_motif, r_stats.motif_pos,
                r_stats.chrm)
        mod_out_text = ''
        for read_id, r_mod_stats in mod_type_stats.items():
            mod_lps = np.array(list(zip(*r_mod_stats.values()))[0])
            with np.errstate(divide='ignore'):
                can_lp = np.log1p(-np.exp(mod_lps).sum())
            mod_out_text += '\n'.join(
                ('\t'.join('{}' for _ in mods_db.text_field_names)).format(
                    read_id, chrm, strand, pos, mod_lp, can_lp, mod_base,
                    '{}:{}'.format(raw_motif, motif_pos))
                for mod_base, (mod_lp, raw_motif, motif_pos,
                               chrm) in r_mod_stats.items()) + '\n'
        mods_txt_fp.write(mod_out_text)
def main():
    args = get_parser().parse_args()

    megalodon.mkdir(args.output_megalodon_results_dir, False)
    out_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir,
                            mh.PR_MOD_NAME),
        read_only=False,
        pos_index_in_memory=not args.mod_positions_on_disk)

    for mega_dir in args.megalodon_results_dirs:
        # full read only mode with no indices read into memory
        mods_db = mods.ModsDb(
            mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME),
            read_only=True, chrm_index_in_memory=False,
            mod_index_in_memory=False, uuid_index_in_memory=False)
        for (score, uuid, mod_base, motif, motif_pos, raw_motif, strand,
             pos, chrm, chrm_len) in mods_db.iter_data():
            chrm_id = out_mods_db.get_chrm_id_or_insert(chrm, chrm_len)
            pos_id = out_mods_db.get_pos_id_or_insert(chrm_id, strand, pos)
            mod_base_id = out_mods_db.get_mod_base_id_or_insert(
                mod_base, motif, motif_pos, raw_motif)
            read_id = out_mods_db.get_read_id_or_insert(uuid)
            out_mods_db.insert_data(score, pos_id, mod_base_id, read_id)

    if out_mods_db.chrm_idx_in_mem:
        out_mods_db.create_chrm_index()
    if out_mods_db.pos_idx_in_mem:
        out_mods_db.create_pos_index()
    if out_mods_db.mod_idx_in_mem:
        out_mods_db.create_mod_index()
    out_mods_db.create_data_covering_index()
    out_mods_db.close()
def _main(args):
    logging.init_logger(quiet=args.quiet)
    if (args.ground_truth_data is None
            and args.control_megalodon_results_dir is None):
        LOGGER.error(
            "Must provide either --control-megalodon-results-dir or "
            "--ground-truth-data")
        sys.exit()
    db_fn = mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME)
    if args.ground_truth_data is not None:
        LOGGER.info("Parsing ground truth data")
        gt_mod_pos, gt_can_pos = mh.parse_ground_truth_file(
            args.ground_truth_data,
            include_strand=args.strand_specific_sites)
        LOGGER.info(
            "Loaded ground truth data with {} modified sites and {} "
            "canonical sites.".format(len(gt_mod_pos), len(gt_can_pos)))
        LOGGER.info(
            "Reading ground truth modified base statistics from database")
        all_mod_llrs, all_can_llrs = mods.extract_stats_at_valid_sites(
            db_fn, [gt_mod_pos, gt_can_pos],
            include_strand=args.strand_specific_sites)
    else:
        LOGGER.info("Reading modified base statistics from database")
        all_mod_llrs = mods.extract_all_stats(db_fn)
        LOGGER.info(
            "Reading modified base statistics from canonical sample "
            "database")
        all_can_llrs = mods.extract_all_stats(
            mh.get_megalodon_fn(args.control_megalodon_results_dir,
                                mh.PR_MOD_NAME))
    mod_summary = [
        (mod,
         len(all_mod_llrs[mod]) if mod in all_mod_llrs else 0,
         len(all_can_llrs[mod]) if mod in all_can_llrs else 0)
        for mod in set(all_mod_llrs).union(all_can_llrs)]
    LOGGER.info(
        "Data summary:\n\tmod\tmod_N\tcan_N\n"
        + "\n".join("\t" + "\t".join(map(str, x)) for x in mod_summary))
    output_mods_data(
        all_mod_llrs, all_can_llrs, args.modified_bases_set,
        args.exclude_modified_bases, args.out_filename)
def _get_bc_queue(bc_q, bc_conn, out_dir, bc_fmt, do_output_mods,
                  mod_long_names):
    bc_fp = open(mh.get_megalodon_fn(out_dir, mh.BC_NAME) + '.' + bc_fmt, 'w')
    if do_output_mods:
        # open in write mode explicitly (h5py no longer defaults to append)
        mods_fp = h5py.File(mh.get_megalodon_fn(out_dir, mh.BC_MODS_NAME), 'w')
        mods_fp.create_group('Reads')
        mods_fp.create_dataset(
            'mod_long_names', data=np.array(mod_long_names, dtype='S'),
            dtype=h5py.special_dtype(vlen=str))

    while True:
        try:
            # TODO add quality output to add fastq option
            read_id, r_seq, mods_scores = bc_q.get(block=False)
            bc_fp.write('>{}\n{}\n'.format(read_id, r_seq))
            bc_fp.flush()
            if do_output_mods:
                try:
                    mods_fp.create_dataset(
                        'Reads/' + read_id, data=mods_scores,
                        compression="gzip")
                except RuntimeError:
                    # same read_id encountered previously
                    pass
        except queue.Empty:
            if bc_conn.poll():
                break
            sleep(0.1)
            continue

    while not bc_q.empty():
        read_id, r_seq, mods_scores = bc_q.get(block=False)
        bc_fp.write('>{}\n{}\n'.format(read_id, r_seq))
        bc_fp.flush()
        if do_output_mods:
            try:
                mods_fp.create_dataset(
                    'Reads/' + read_id, data=mods_scores,
                    compression="gzip")
            except RuntimeError:
                # same read_id encountered previously
                pass

    bc_fp.close()
    if do_output_mods:
        mods_fp.close()
    return
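# The getter above assumes a producer/consumer contract: worker processes put
# (read_id, sequence, mod_scores) tuples on a multiprocessing queue, and a
# pipe connection signals when all workers have finished so the getter can
# drain and exit. A minimal sketch of that contract, assuming this driver
# code and its argument values are illustrative rather than Megalodon's API:
import multiprocessing as mp

def _example_bc_driver():
    bc_q = mp.Queue()
    main_conn, getter_conn = mp.Pipe()
    getter = mp.Process(
        target=_get_bc_queue,
        args=(bc_q, getter_conn, 'out_dir', 'fasta', False, []),
        daemon=True)
    getter.start()
    bc_q.put(('read_000', 'ACGTACGT', None))  # one basecalled read
    main_conn.send(True)  # signal that no more reads will arrive
    getter.join()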
def _main(args):
    mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_NAME),
        in_mem_dbid_to_uuid=True)
    mods_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_MOD_TXT_NAME)
        if args.out_filename is None else args.out_filename, "w")
    mods_txt_fp.write("\t".join(mods_db.text_field_names) + "\n")
    rec_tmplt = "\t".join("{}" for _ in mods_db.text_field_names) + "\n"
    bar = tqdm(
        desc="Processing Per-read Data", unit="per-read calls",
        total=mods_db.get_num_uniq_stats(), smoothing=0, dynamic_ncols=True)
    for (chrm, strand, pos), pos_lps in mods_db.iter_pos_scores(
            convert_pos=True):
        bar.update(len(pos_lps))
        str_strand = mh.int_strand_to_str(strand)
        mod_out_text = ""
        prev_dbid = None
        mod_bs, r_lps = [], []
        for read_dbid, mod_dbid, lp in sorted(pos_lps):
            if prev_dbid != read_dbid and prev_dbid is not None:
                uuid = mods_db.get_uuid(prev_dbid)
                # compute and store log likelihood ratios
                with np.errstate(divide="ignore"):
                    can_lp = np.log1p(-np.exp(r_lps).sum())
                for mod_b, r_lp in zip(mod_bs, r_lps):
                    mod_out_text += rec_tmplt.format(
                        uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b)
                mod_bs, r_lps = [], []
            prev_dbid = read_dbid
            mod_bs.append(mods_db.get_mod_base(mod_dbid))
            r_lps.append(lp)
        uuid = mods_db.get_uuid(prev_dbid)
        # compute and store log likelihood ratios for the final read
        with np.errstate(divide="ignore"):
            can_lp = np.log1p(-np.exp(r_lps).sum())
        for mod_b, r_lp in zip(mod_bs, r_lps):
            mod_out_text += rec_tmplt.format(
                uuid, chrm, str_strand, pos, r_lp, can_lp, mod_b)
        mods_txt_fp.write(mod_out_text)
    mods_txt_fp.close()
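# The per-read text writers above derive the canonical-base log probability
# from the modified-base log probabilities as log(1 - sum(exp(mod_lps))),
# using log1p for numerical stability. A small worked example with made-up
# log probabilities:
import numpy as np

mod_lps = np.array([-2.3, -4.6])  # log probs of two modified-base calls
with np.errstate(divide="ignore"):
    can_lp = np.log1p(-np.exp(mod_lps).sum())
# exp(-2.3) + exp(-4.6) ~= 0.1003 + 0.0101 = 0.1103, so
# can_lp ~= log(0.8897) ~= -0.117. log1p(-x) is more accurate than
# log(1 - x) when x is tiny, and errstate suppresses the divide-by-zero
# warning when the modified probabilities sum to 1 (can_lp = -inf).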
def _get_snp_stats_queue(snp_stats_q, snp_conn, out_dir, ref_names_and_lens,
                         out_suffix, write_vcf_lp):
    agg_snp_fn = mh.get_megalodon_fn(out_dir, mh.SNP_NAME)
    if out_suffix is not None:
        base_fn, fn_ext = os.path.splitext(agg_snp_fn)
        agg_snp_fn = base_fn + '.' + out_suffix + fn_ext
    agg_snp_fp = snps.VcfWriter(
        agg_snp_fn, 'w', ref_names_and_lens=ref_names_and_lens,
        write_vcf_lp=write_vcf_lp)

    while True:
        try:
            snp_var = snp_stats_q.get(block=False)
            if snp_var is None:
                continue
            agg_snp_fp.write_variant(snp_var)
        except queue.Empty:
            if snp_conn.poll():
                break
            sleep(0.001)
            continue

    while not snp_stats_q.empty():
        snp_var = snp_stats_q.get(block=False)
        agg_snp_fp.write_variant(snp_var)
    agg_snp_fp.close()

    return
def report_acc_metrics(res_dir, out_fp):
    try:
        bc_dat = pd.read_csv(
            mh.get_megalodon_fn(res_dir, mh.MAP_SUMM_NAME), sep='\t')
        bc_acc = bc_dat['pct_identity']
        parsim_acc = 100 * (bc_dat['num_match'] - bc_dat['num_ins']) / (
            bc_dat['num_align'] - bc_dat['num_ins'])
        mean_bc_acc = np.mean(bc_acc)
        med_bc_acc = np.median(bc_acc)
        # crude mode computed by rounding to 1 decimal place
        uniq_acc, acc_counts = np.unique(
            np.around(bc_acc, 1), return_counts=True)
        mode_bc_acc = uniq_acc[np.argmax(acc_counts)]
        out_fp.write(
            ('Mapping metrics for {} ({} mapped reads):\n\t'
             'Mean Pct. Identity: {:.4f}\n\t'
             'Median Pct. Identity: {:.4f}\n\t'
             'Mode Pct. Identity: {:.1f}\n').format(
                 res_dir, bc_dat.shape[0], mean_bc_acc, med_bc_acc,
                 mode_bc_acc))
    except FileNotFoundError:
        bc_acc = parsim_acc = None
        if VERBOSE:
            sys.stderr.write(
                'WARNING: Mappings not found for {}\n'.format(res_dir))

    return bc_acc, parsim_acc
def _main(args):
    log_suffix = ('aggregation' if args.output_suffix is None
                  else 'aggregation.' + args.output_suffix)
    logging.init_logger(args.megalodon_directory, out_suffix=log_suffix)
    LOGGER.debug('Command: """' + ' '.join(sys.argv) + '"""')

    if args.mod_aggregate_method == mh.MOD_EM_NAME:
        mod_agg_info = mods.AGG_INFO(mh.MOD_EM_NAME, None)
    elif args.mod_aggregate_method == mh.MOD_BIN_THRESH_NAME:
        mod_agg_info = mods.AGG_INFO(
            mh.MOD_BIN_THRESH_NAME, args.mod_binary_threshold)
    valid_read_ids = mh.parse_read_ids(args.read_ids_filename)
    aggregate.aggregate_stats(
        args.outputs, args.megalodon_directory, args.processes,
        args.write_vcf_log_probs, args.heterozygous_factors,
        variants.HAPLIOD_MODE if args.haploid else variants.DIPLOID_MODE,
        mod_agg_info, args.write_mod_log_probs, args.mod_output_formats,
        args.suppress_progress, valid_read_ids, args.output_suffix,
        args.aggregate_batch_size)

    if mh.VAR_NAME in args.outputs:
        LOGGER.info('Sorting output variant file')
        variant_fn = mh.add_fn_suffix(
            mh.get_megalodon_fn(args.megalodon_directory, mh.VAR_NAME),
            args.output_suffix)
        sort_variant_fn = mh.add_fn_suffix(variant_fn, 'sorted')
        variants.sort_variants(variant_fn, sort_variant_fn)
        LOGGER.info('Indexing output variant file')
        variants.index_variants(sort_variant_fn)
def report_acc_metrics(res_dir, out_fp, samp_lab):
    try:
        bc_data = mapping.parse_map_summary_file(
            mh.get_megalodon_fn(res_dir, mh.MAP_SUMM_NAME))
        bc_acc = np.array([r_data.pct_identity for r_data in bc_data])
        parsim_acc = np.array([
            100 * (r_data.num_match - r_data.num_ins) /
            (r_data.num_align - r_data.num_ins)
            for r_data in bc_data])
        aligned_lens = np.array(
            [r_data.num_align - r_data.num_ins for r_data in bc_data])
        # crude mode computed by rounding to 1 decimal place
        uniq_acc, acc_counts = np.unique(
            np.around(bc_acc, 1), return_counts=True)
        mode_bc_acc = uniq_acc[np.argmax(acc_counts)]
        out_fp.write(ACC_METRICS_TMPLT.format(
            np.median(bc_acc), np.mean(bc_acc), mode_bc_acc, len(bc_data),
            np.max(aligned_lens), np.median(aligned_lens),
            np.mean(aligned_lens), samp_lab))
    except FileNotFoundError:
        bc_acc = parsim_acc = aligned_lens = None
        LOGGER.info("Mappings not found for {}".format(res_dir))

    return bc_acc, parsim_acc, aligned_lens
def _main(args):
    logging.init_logger(log_fn=args.log_filename)
    if args.ground_truth_cov_min < 1:
        LOGGER.warning(
            "--ground-truth-cov-min must be 1 or greater. Setting to 1.")
        args.ground_truth_cov_min = 1
    if args.nanopore_cov_min < 1:
        LOGGER.warning(
            "--nanopore-cov-min must be 1 or greater. Setting to 1.")
        args.nanopore_cov_min = 1

    LOGGER.info("Checking for consistent contig names")
    mod_db_fn = mh.get_megalodon_fn(args.megalodon_results_dir,
                                    mh.PR_MOD_NAME)
    check_matching_attrs(
        args.ground_truth_bed, args.strand_offset, mod_db_fn, args.mod_bases)

    LOGGER.info("Processing batches")
    process_all_batches(
        args.processes, args.batch_size, args.ground_truth_bed,
        args.out_low_coverage_sites, args.out_per_site_mod_thresholds,
        mod_db_fn, args.strand_offset, args.ground_truth_cov_min,
        args.nanopore_cov_min, args.mod_bases, args.valid_sites)
def _get_var_stats_queue(var_stats_q, var_conn, out_dir, ref_names_and_lens,
                         out_suffix, write_vcf_lp):
    agg_var_fn = mh.get_megalodon_fn(out_dir, mh.VAR_NAME)
    if out_suffix is not None:
        base_fn, fn_ext = os.path.splitext(agg_var_fn)
        agg_var_fn = base_fn + '.' + out_suffix + fn_ext
    agg_var_fp = variants.VcfWriter(
        agg_var_fn, 'w', ref_names_and_lens=ref_names_and_lens,
        write_vcf_lp=write_vcf_lp)

    while True:
        try:
            var_var_batch = var_stats_q.get(block=True, timeout=0.01)
            if var_var_batch is None:
                continue
            for var_var in var_var_batch:
                agg_var_fp.write_variant(var_var)
        except queue.Empty:
            if var_conn.poll():
                break
            continue

    while not var_stats_q.empty():
        var_var_batch = var_stats_q.get(block=False)
        for var_var in var_var_batch:
            agg_var_fp.write_variant(var_var)
    agg_var_fp.close()
def parse_control_mods(args, out_fp, all_valid_sites):
    ctrl_acc = ctrl_parsim_acc = ctrl_dat = gt_dat = mod_chrm_sw = None
    if args.control_megalodon_results_dir is not None:
        if VERBOSE:
            sys.stderr.write('Reading control mods data\n')
        ctrl_acc, ctrl_parsim_acc = report_acc_metrics(
            args.control_megalodon_results_dir, out_fp)
        try:
            ctrl_dat = pd.read_csv(
                mh.get_megalodon_fn(args.control_megalodon_results_dir,
                                    mh.PR_MOD_TXT_NAME), sep='\t')
            if all_valid_sites is not None:
                # filter to valid sites
                ctrl_idx = ctrl_dat.set_index(['chrm', 'pos']).index
                ctrl_dat = ctrl_dat[ctrl_idx.isin(all_valid_sites)]
        except FileNotFoundError:
            ctrl_dat = None
    elif args.ground_truth_data is not None:
        if VERBOSE:
            sys.stderr.write('Reading ground truth data\n')
        gt_dat = pd.read_csv(
            args.ground_truth_data, header=None,
            names=['chrm', 'pos', 'is_mod'])
    elif args.mod_chrms_startswith is not None:
        mod_chrm_sw = args.mod_chrms_startswith
    else:
        sys.stderr.write(
            '*' * 20 + ' WARNING: No modified base ground truth provided.\n')
    return ctrl_acc, ctrl_parsim_acc, ctrl_dat, gt_dat, mod_chrm_sw
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)
    logger = logging.get_logger()

    logger.info('Opening new sequence variant statistics database')
    out_vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.output_megalodon_results_dir,
                            mh.PR_VAR_NAME),
        read_only=False,
        loc_index_in_memory=not args.var_locations_on_disk,
        uuid_index_in_memory=True)

    for mega_dir in args.megalodon_results_dirs:
        logger.info(
            'Adding sequence variant statistics from {}'.format(mega_dir))
        # full read only mode with no indices read into memory
        vars_db = variants.VarsDb(
            mh.get_megalodon_fn(mega_dir, mh.PR_VAR_NAME),
            read_only=True, chrm_index_in_memory=False,
            alt_index_in_memory=False, uuid_index_in_memory=False)
        bar = tqdm(desc=mega_dir, total=vars_db.get_num_uniq_stats(),
                   smoothing=0, dynamic_ncols=True)
        for (score, uuid, strand, alt_seq, ref_seq, pos, var_name,
             test_end, test_start, chrm, chrm_len) in vars_db.iter_data():
            chrm_id = out_vars_db.get_chrm_id_or_insert(chrm, chrm_len)
            loc_id = out_vars_db.get_loc_id_or_insert(
                chrm_id, test_start, test_end, pos, ref_seq, var_name)
            alt_id = out_vars_db.get_alt_id_or_insert(alt_seq)
            read_id = out_vars_db.get_read_id_or_insert(uuid)
            out_vars_db.insert_data(score, loc_id, alt_id, read_id)
            bar.update()
        bar.close()

    logger.info('Creating indices and closing database')
    if out_vars_db.chrm_idx_in_mem:
        out_vars_db.create_chrm_index()
    if out_vars_db.loc_idx_in_mem:
        out_vars_db.create_loc_index()
    if out_vars_db.alt_idx_in_mem:
        out_vars_db.create_alt_index()
    out_vars_db.create_data_covering_index()
    out_vars_db.close()
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Opening new modified base statistics database')
    out_mods_db_fn = mh.get_megalodon_fn(args.output_megalodon_results_dir,
                                         mh.PR_MOD_NAME)
    out_mods_db = mods.ModsDb(
        out_mods_db_fn, read_only=False, init_db_tables=True,
        in_mem_chrm_to_dbid=True, in_mem_mod_to_dbid=True,
        in_mem_uuid_to_dbid=True, in_mem_pos_to_dbid=True,
        force_uint32_pos_to_dbid=args.force_uint32_pos_index,
        db_safety=args.database_safety)
    in_mod_db_fns = [mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
                     for mega_dir in args.megalodon_results_dirs]

    LOGGER.info(
        'Merging will proceed in five stages:\n\t1) chromosomes\n\t2) '
        'modified base definitions\n\t3) read identifiers\n\t4) reference '
        'positions\n\t5) modified base statistics')
    insert_chrms(in_mod_db_fns, out_mods_db)
    insert_mods(in_mod_db_fns, out_mods_db)
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    if args.single_process:
        insert_pos(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_pos_mp(in_mod_db_fns, out_mods_db, args.data_batch_size)
    out_mods_db.db.commit()
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, out_mods_db_fn,
            args.data_batch_size, args.max_processes,
            args.force_uint32_pos_index, db_safety=args.database_safety)
    out_mods_db.db.commit()

    LOGGER.info(
        'Creating data covering index for efficient searching by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.db.commit()
    out_mods_db.close()
def _main(args):
    vars_db = variants.VarsDb(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_NAME),
        uuid_strand_index_in_memory=True)
    vars_txt_fp = open(
        mh.get_megalodon_fn(args.megalodon_results_dir, mh.PR_VAR_TXT_NAME)
        if args.out_filename is None else args.out_filename, "w")
    vars_txt_fp.write("\t".join(vars_db.text_field_names) + "\n")
    for (loc_id, loc_chrm, pos, ref_seq, var_name,
         test_start) in tqdm(vars_db.iter_locs(),
                             total=vars_db.get_num_uniq_var_loc(),
                             smoothing=0):
        pr_var_stats = vars_db.get_loc_stats(
            (loc_id, loc_chrm, pos, ref_seq, var_name, test_start))
        alt_type_stats = defaultdict(dict)
        for r_stats in pr_var_stats:
            alt_type_stats[r_stats.read_id][r_stats.alt_seq] = (
                r_stats.score, r_stats.chrm)
        var_out_text = ""
        for read_id, r_var_stats in alt_type_stats.items():
            uuid, strand = vars_db.get_uuid_strand(read_id)
            alt_lps = np.array(list(zip(*r_var_stats.values()))[0])
            with np.errstate(divide="ignore"):
                ref_lp = np.log1p(-np.exp(alt_lps).sum())
            var_out_text += "\n".join(
                ("\t".join("{}" for _ in vars_db.text_field_names)).format(
                    uuid, chrm, strand, pos, ref_lp, alt_lp, ref_seq,
                    alt_seq, var_name)
                for alt_seq, (alt_lp, chrm) in r_var_stats.items()) + "\n"
        vars_txt_fp.write(var_out_text)

    return
def post_process_mapping(out_dir, map_fmt, ref_fn):
    map_bn = mh.get_megalodon_fn(out_dir, mh.MAP_NAME)
    map_fn = map_bn + '.' + map_fmt
    map_sort_fn = map_bn + '.sorted.bam'
    map_p = mp.Process(
        target=mapping.sort_and_index_mapping,
        args=(map_fn, map_sort_fn, ref_fn), daemon=True)
    map_p.start()
    sleep(0.01)
    return map_p
def post_process_whatshap(out_dir, map_fmt, ref_fn):
    whatshap_map_bn = mh.get_megalodon_fn(out_dir, mh.WHATSHAP_MAP_NAME)
    whatshap_map_fn = whatshap_map_bn + '.' + map_fmt
    whatshap_sort_fn = whatshap_map_bn + '.sorted.bam'
    whatshap_p = mp.Process(
        target=mapping.sort_and_index_mapping,
        args=(whatshap_map_fn, whatshap_sort_fn, ref_fn), daemon=True)
    whatshap_p.start()
    sleep(0.01)
    return whatshap_sort_fn, whatshap_p
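# Both post-processing helpers hand sorting/indexing off to a daemonized
# background process and return the process handle, so the caller must join
# before exiting or the daemon is killed mid-write. A sketch of the expected
# call pattern (the directory, format, and reference values are illustrative):
map_p = post_process_mapping('megalodon_results', 'bam', 'reference.fa')
whatshap_fn, whatshap_p = post_process_whatshap(
    'megalodon_results', 'bam', 'reference.fa')
map_p.join()
whatshap_p.join()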
def parse_mod_data(args):
    if VERBOSE:
        sys.stderr.write('Reading megalodon data\n')
    try:
        mod_dat = pd.read_csv(
            mh.get_megalodon_fn(args.megalodon_results_dir,
                                mh.PR_MOD_TXT_NAME), sep='\t')
    except FileNotFoundError:
        sys.stderr.write(
            'ERROR: Must provide a valid Megalodon result directory.\n')
        sys.exit(1)
    return mod_dat
def write_signal_mappings(sig_map_q, sig_map_conn, ref_out_info,
                          aux_failed_q):
    def apply_sig_map_offset(read_mapping):
        """Apply signal mapping shift to center coarse mappings to a
        registered signal based mapping.
        """
        if (ref_out_info.sig_map_offset is not None
                and ref_out_info.sig_map_offset != 0):
            if ref_out_info.sig_map_offset > 0:
                # clip beginning of signal mapping and end of reference to
                # shift signal assignments to the left
                read_mapping[0]["Ref_to_signal"] = read_mapping[0][
                    "Ref_to_signal"][ref_out_info.sig_map_offset:]
                read_mapping[0]["Reference"] = read_mapping[0]["Reference"][
                    :-ref_out_info.sig_map_offset]
            else:
                # clip end of signal mapping and beginning of reference to
                # shift signal assignments to the right
                read_mapping[0]["Ref_to_signal"] = read_mapping[0][
                    "Ref_to_signal"][:ref_out_info.sig_map_offset]
                read_mapping[0]["Reference"] = read_mapping[0]["Reference"][
                    -ref_out_info.sig_map_offset:]
        return read_mapping

    def iter_mappings():
        workers_active = True
        LOGGER.debug("GetterInitComplete")
        while workers_active or not sig_map_q.empty():
            try:
                read_mapping = sig_map_q.get(timeout=0.1)
                yield apply_sig_map_offset(read_mapping)
            except queue.Empty:
                if sig_map_conn.poll():
                    workers_active = False

    try:
        LOGGER.debug("GetterStarting")
        prepare_mapping_funcs.generate_output_from_results(
            iter_mappings(),
            mh.get_megalodon_fn(ref_out_info.out_dir, mh.SIG_MAP_NAME),
            ref_out_info.alphabet_info, verbose=False)
        LOGGER.debug("GetterClosing")
    except Exception as e:
        aux_failed_q.put(
            ("SigMapProcessingError", str(e), traceback.format_exc()))
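# apply_sig_map_offset trims matched prefixes/suffixes so per-base signal
# assignments shift by sig_map_offset positions while the two arrays stay
# consistent in length. A toy illustration of the positive-offset branch
# (the values below are made up, not real mapping data):
ref_to_sig = [0, 10, 25, 40, 60]   # signal start index per reference base
reference = [0, 1, 2, 3]           # encoded reference bases
offset = 1
shifted = {
    "Ref_to_signal": ref_to_sig[offset:],  # [10, 25, 40, 60]
    "Reference": reference[:-offset],      # [0, 1, 2]
}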
def open_alignment_out_file(out_dir, map_fmt, ref_names_and_lens, ref_fn):
    map_fn = mh.get_megalodon_fn(out_dir, mh.MAP_NAME) + '.' + map_fmt
    if map_fmt == 'bam':
        w_mode = 'wb'
    elif map_fmt == 'cram':
        w_mode = 'wc'
    elif map_fmt == 'sam':
        w_mode = 'w'
    else:
        raise mh.MegaError('Invalid mapping output format')
    return pysam.AlignmentFile(
        map_fn, w_mode, reference_names=ref_names_and_lens[0],
        reference_lengths=ref_names_and_lens[1], reference_filename=ref_fn)
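# The returned handle is a standard pysam.AlignmentFile, so records can be
# written with pysam.AlignedSegment as usual. A minimal sketch of that usage;
# the directory, contig, and read values here are illustrative only:
import pysam

names_and_lens = (['chr1'], [248956422])
align_fp = open_alignment_out_file('out_dir', 'bam', names_and_lens, None)
rec = pysam.AlignedSegment()
rec.query_name = 'read_000'
rec.query_sequence = 'ACGTACGT'
rec.flag = 0
rec.reference_id = 0       # index into reference_names
rec.reference_start = 100  # 0-based leftmost position
rec.mapping_quality = 60
rec.cigartuples = [(0, 8)]  # 8M
align_fp.write(rec)
align_fp.close()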
def parse_mod_data(args, out_fp):
    if VERBOSE:
        sys.stderr.write('Reading megalodon data\n')
    mod_acc, parsim_acc = report_acc_metrics(
        args.megalodon_results_dir, out_fp)
    try:
        mod_dat = pd.read_csv(
            mh.get_megalodon_fn(args.megalodon_results_dir,
                                mh.PR_MOD_TXT_NAME), sep='\t')
    except FileNotFoundError:
        mod_dat = None
    return mod_dat, mod_acc, parsim_acc
def _main(args):
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + " ".join(sys.argv) + '"""')

    mods_db_fn = mh.get_megalodon_fn(args.megalodon_directory,
                                     mh.PR_MOD_NAME)
    mods_db = mods.ModsDb(mods_db_fn, read_only=False)
    try:
        mods_db.check_data_covering_index_exists()
        LOGGER.info("Modified bases database index already exists")
    except mh.MegaError:
        LOGGER.info("Creating modified bases database index")
        mods_db.create_data_covering_index()
    LOGGER.debug("Closing database")
    mods_db.close()
def _get_mod_stats_queue(mod_stats_q, mod_conn, out_dir, mod_names,
                         ref_names_and_lens, out_suffix, write_mod_lp,
                         mod_output_fmts):
    def get_mod_site():
        # function for profiling purposes
        return mod_stats_q.get(block=True, timeout=0.01)

    agg_mod_bn = mh.get_megalodon_fn(out_dir, mh.MOD_NAME)
    if out_suffix is not None:
        agg_mod_bn += "." + out_suffix
    agg_mod_fps = []
    if mh.MOD_BEDMETHYL_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModBedMethylWriter(agg_mod_bn, mod_names, "w"))
    if mh.MOD_VCF_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModVcfWriter(
            agg_mod_bn, mod_names, "w",
            ref_names_and_lens=ref_names_and_lens,
            write_mod_lp=write_mod_lp))
    if mh.MOD_WIG_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModWigWriter(agg_mod_bn, mod_names, "w"))

    while True:
        try:
            mod_sites_batch = get_mod_site()
            for mod_site in mod_sites_batch:
                for agg_mod_fp in agg_mod_fps:
                    agg_mod_fp.write_mod_site(mod_site)
        except queue.Empty:
            if mod_conn.poll():
                break
            continue

    while not mod_stats_q.empty():
        mod_sites_batch = mod_stats_q.get(block=False)
        for mod_site in mod_sites_batch:
            for agg_mod_fp in agg_mod_fps:
                agg_mod_fp.write_mod_site(mod_site)
    for agg_mod_fp in agg_mod_fps:
        agg_mod_fp.close()
def open_alignment_out_file(self):
    map_fn = '{}.{}'.format(
        mh.get_megalodon_fn(self.out_dir, mh.MAP_NAME), self.map_fmt)
    w_mode = get_mapping_mode(self.map_fmt)
    try:
        align_file = pysam.AlignmentFile(
            map_fn, w_mode,
            reference_names=self.ref_names_and_lens[0],
            reference_lengths=self.ref_names_and_lens[1],
            reference_filename=self.cram_ref_fn)
    except ValueError:
        LOGGER.error(
            'Failed to open alignment file for writing.\n\t\tFor CRAM '
            'output, if FASTA is compressed ensure it is with bgzip or '
            'if --reference is a minimap2 index see --cram-reference.')
        raise mh.MegaError('Reference loading error.')
    return align_file
def _get_mod_stats_queue(mod_stats_q, mod_conn, out_dir, mod_names,
                         ref_names_and_lens, out_suffix, write_mod_lp,
                         mod_output_fmts):
    def get_mod_site():
        # function for profiling purposes
        return mod_stats_q.get(block=False)

    def do_sleep():
        # function for profiling purposes
        sleep(0.001)
        return

    agg_mod_bn = mh.get_megalodon_fn(out_dir, mh.MOD_NAME)
    if out_suffix is not None:
        agg_mod_bn += '.' + out_suffix
    agg_mod_fps = []
    if mh.MOD_BEDMETHYL_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModBedMethylWriter(agg_mod_bn, mod_names, 'w'))
    if mh.MOD_VCF_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModVcfWriter(
            agg_mod_bn, mod_names, 'w',
            ref_names_and_lens=ref_names_and_lens,
            write_mod_lp=write_mod_lp))
    if mh.MOD_WIG_NAME in mod_output_fmts:
        agg_mod_fps.append(mods.ModWigWriter(agg_mod_bn, mod_names, 'w'))

    while True:
        try:
            mod_site = get_mod_site()
            for agg_mod_fp in agg_mod_fps:
                agg_mod_fp.write_mod_site(mod_site)
        except queue.Empty:
            if mod_conn.poll():
                break
            do_sleep()
            continue

    while not mod_stats_q.empty():
        mod_site = mod_stats_q.get(block=False)
        for agg_mod_fp in agg_mod_fps:
            agg_mod_fp.write_mod_site(mod_site)
    for agg_mod_fp in agg_mod_fps:
        agg_mod_fp.close()
    return
def _main(args):
    mh.mkdir(args.output_megalodon_results_dir, args.overwrite)
    logging.init_logger(args.output_megalodon_results_dir)

    LOGGER.info('Extracting mods and chrms from input databases')
    in_mod_db_fns = [mh.get_megalodon_fn(mega_dir, mh.PR_MOD_NAME)
                     for mega_dir in args.megalodon_results_dirs]
    alphabet, mod_long_names = extract_mods(in_mod_db_fns)
    ref_names_and_lens = extract_chrms(in_mod_db_fns)

    LOGGER.info('Opening new per-read modified base statistics database')
    model_info = backends.DetachedModelInfo(
        alphabet=alphabet, mod_long_names=mod_long_names)
    mods_info = mods.ModInfo(
        model_info, out_dir=args.output_megalodon_results_dir)
    mods.init_mods_db(mods_info, ref_names_and_lens)
    # load uuids in memory in main out db only in single process mode;
    # else worker processes only have to load uuid lookup tables
    out_mods_db = mods.ModsDb(
        mods_info.mods_db_fn, read_only=False,
        in_mem_uuid_to_dbid=args.single_process)

    LOGGER.info('Inserting read UUIDs from input databases')
    if args.single_process:
        insert_reads(in_mod_db_fns, out_mods_db)
    else:
        insert_reads_mp(in_mod_db_fns, out_mods_db)
    # commit so read uuids are available to worker processes
    out_mods_db.commit()

    LOGGER.info('Inserting per-read calls from input databases')
    if args.single_process:
        insert_data(in_mod_db_fns, out_mods_db, args.data_batch_size)
    else:
        insert_data_mp(
            in_mod_db_fns, out_mods_db, mods_info.mods_db_fn,
            args.data_batch_size, args.max_processes)
    out_mods_db.commit()

    LOGGER.info(
        'Creating data covering index for efficient iteration by position')
    out_mods_db.create_data_covering_index()
    out_mods_db.commit()
    out_mods_db.close()
def _main(args):
    raise NotImplementedError(
        'Variant index creation not currently implemented.')
    # unreachable placeholder implementation below
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)
    LOGGER.debug('Command: """' + ' '.join(sys.argv) + '"""')

    vars_db_fn = mh.get_megalodon_fn(args.megalodon_directory,
                                     mh.PR_VAR_NAME)
    vars_db = variants.VarsDb(vars_db_fn, read_only=False)
    try:
        vars_db.check_data_covering_index_exists()
        LOGGER.info('Variants database index already exists')
    except mh.MegaError:
        LOGGER.info('Creating variants database index')
        vars_db.create_data_covering_index()
    LOGGER.debug('Closing database')
    vars_db.close()
def _main(args):
    logging.init_logger(args.megalodon_directory,
                        out_suffix=args.output_suffix)
    # parse motifs
    motifs = parse_motifs(args.motif)
    # open indexed FASTA reference
    ref = pysam.FastaFile(args.reference)

    LOGGER.info('Extracting mods and chrms from input database')
    in_mods_db = mods.ModsDb(
        mh.get_megalodon_fn(args.megalodon_directory, mh.PR_MOD_NAME))
    alphabet, _, mod_long_names = in_mods_db.get_alphabet_info()
    ref_names_and_lens = list(zip(*in_mods_db.iter_chrms()))[1:]
    LOGGER.info('Extracting read uuid table')
    in_uuids = [uuid for _, uuid in in_mods_db.iter_uuids()]

    LOGGER.info('Opening new per-read modified base statistics databases')
    model_info = backends.DetachedModelInfo(
        alphabet=alphabet, mod_long_names=mod_long_names)
    out_mods_dbs = []
    for motif_info in motifs:
        out_dir = '{}.{}_{}'.format(
            args.output_prefix, motif_info.raw_motif,
            motif_info.bases_before)
        mh.mkdir(out_dir, overwrite=False)
        mods_info = mods.ModInfo(model_info, out_dir=out_dir)
        mods.init_mods_db(mods_info, ref_names_and_lens)
        out_mods_dbs.append(
            (mods.ModsDb(mods_info.mods_db_fn, read_only=False), motif_info))
        out_mods_dbs[-1][0].insert_uuids(in_uuids)
        # commit so read uuids are available to worker processes
        out_mods_dbs[-1][0].commit()

    LOGGER.info('Inserting per-read calls from input databases')
    # TODO do this in separate processes
    split_data(in_mods_db, out_mods_dbs, ref)

    LOGGER.info(
        'Creating data covering indices for efficient iteration by position')
    for out_mods_db, _ in out_mods_dbs:
        out_mods_db.create_data_covering_index()
        out_mods_db.commit()
        out_mods_db.close()
        LOGGER.info('Finished indexing {}'.format(out_mods_db.fn))