def parse_inputs(
        samp1_bm_fns, samp2_bm_fns, strand_offset, samp_names, valid_pos_fn,
        out_fp):
    """Load two bedmethyl samples and report their coverage statistics.

    Args:
        samp1_bm_fns: Bedmethyl file(s) for the first sample, passed
            straight to ``mh.parse_bed_methyls``.
        samp2_bm_fns: Bedmethyl file(s) for the second sample.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``. When it is
            not ``None`` the valid-positions BED is parsed with
            ``ignore_strand=True``.
        samp_names: Indexable holding the two sample names (index 0 and 1)
            used to label the summary lines.
        valid_pos_fn: Optional BED input restricting the sites that are
            loaded; ``None`` disables the restriction.
        out_fp: Writable text stream that receives one summary line per
            sample (median, mean and standard deviation of coverage).

    Returns:
        Tuple ``(samp1_cov, samp1_mod_cov, samp1_all_cov, samp2_cov,
        samp2_mod_cov, samp2_all_cov)`` where the ``*_all_cov`` entries are
        NumPy arrays holding every per-site coverage value of the sample.
    """
    # parse valid positions
    if valid_pos_fn is None:
        valid_pos = None
    else:
        valid_pos = mh.parse_beds(
            valid_pos_fn, ignore_strand=strand_offset is not None)

    # parse bed methyl files
    samp1_cov, samp1_mod_cov = mh.parse_bed_methyls(
        samp1_bm_fns, strand_offset=strand_offset, valid_pos=valid_pos)
    samp1_all_cov = np.array([
        site_cov
        for contig_sites in samp1_cov.values()
        for site_cov in contig_sites.values()
    ])
    samp2_cov, samp2_mod_cov = mh.parse_bed_methyls(
        samp2_bm_fns, strand_offset=strand_offset, valid_pos=valid_pos)
    samp2_all_cov = np.array([
        site_cov
        for contig_sites in samp2_cov.values()
        for site_cov in contig_sites.values()
    ])

    # one summary line per sample, first sample first
    summary_tmplt = '{} coverage median: {:.2f} mean: {:.2f} sd: {:.2f}\n'
    for samp_idx, all_cov in enumerate((samp1_all_cov, samp2_all_cov)):
        out_fp.write(summary_tmplt.format(
            samp_names[samp_idx],
            np.median(all_cov),
            np.mean(all_cov),
            np.std(all_cov)))

    return (
        samp1_cov, samp1_mod_cov, samp1_all_cov,
        samp2_cov, samp2_mod_cov, samp2_all_cov)
def parse_inputs(samp1_bm_fns, samp2_bm_fns, strand_offset, samp_names,
                 valid_pos_fn, out_fp):
    """Parse both samples' bedmethyl files and write coverage summaries.

    Args:
        samp1_bm_fns: Bedmethyl file(s) of the first sample.
        samp2_bm_fns: Bedmethyl file(s) of the second sample.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``; a non-``None``
            value also makes the valid-sites BED strand-agnostic.
        samp_names: Indexable with the two sample labels (index 0 and 1).
        valid_pos_fn: Optional valid-sites BED input, or ``None``.
        out_fp: Writable text stream for the two coverage summary lines.

    Returns:
        Six-tuple ``(samp1_cov, samp1_mod_cov, samp1_all_cov, samp2_cov,
        samp2_mod_cov, samp2_all_cov)``; the ``*_all_cov`` items are NumPy
        arrays of all per-site coverage values.
    """
    # parse valid positions
    valid_pos = None
    if valid_pos_fn is not None:
        LOGGER.info("Parsing valid sites bed")
        valid_pos = mh.parse_beds(
            valid_pos_fn,
            ignore_strand=strand_offset is not None,
            show_prog_bar=False,
        )

    def load_sample(bm_fns):
        # Parse one sample and flatten its nested coverage mapping.
        site_cov, site_mod_cov = mh.parse_bed_methyls(
            bm_fns,
            strand_offset=strand_offset,
            valid_pos=valid_pos,
            show_prog_bar=False,
        )
        flat_cov = np.array([
            n_reads
            for contig_cov in site_cov.values()
            for n_reads in contig_cov.values()
        ])
        return site_cov, site_mod_cov, flat_cov

    # parse bed methyl files
    LOGGER.info("Parsing bedmethyl files")
    samp1_cov, samp1_mod_cov, samp1_all_cov = load_sample(samp1_bm_fns)
    samp2_cov, samp2_mod_cov, samp2_all_cov = load_sample(samp2_bm_fns)

    line_fmt = "{} coverage median: {:.2f} mean: {:.2f} sd: {:.2f}\n"
    for name_idx, flat_cov in ((0, samp1_all_cov), (1, samp2_all_cov)):
        out_fp.write(
            line_fmt.format(
                samp_names[name_idx],
                np.median(flat_cov),
                np.mean(flat_cov),
                np.std(flat_cov),
            ))

    return (
        samp1_cov,
        samp1_mod_cov,
        samp1_all_cov,
        samp2_cov,
        samp2_mod_cov,
        samp2_all_cov,
    )
def _main(args):
    """Write a ground-truth CSV of confidently (un)modified sites.

    Reads ``args.bed_methyl_files`` via ``mh.parse_bed_methyls`` and, for
    every site whose coverage is at least ``args.coverage_threshold``,
    computes the percentage of modified coverage. Sites at or below
    ``args.pct_mod_thresholds[0]`` are written with the label ``False``;
    sites at or above ``args.pct_mod_thresholds[1]`` with the label
    ``True``; anything in between is skipped. Each row is
    ``chrom,strand,pos,label``. When ``args.strand_offset`` is not ``None``
    a second row at ``pos + args.strand_offset`` is written right after.

    Args:
        args: Namespace-like object providing ``bed_methyl_files``,
            ``strand_offset``, ``out_csv``, ``coverage_threshold`` and
            ``pct_mod_thresholds``.
    """
    site_cov, site_mod_cov = mh.parse_bed_methyls(
        args.bed_methyl_files, strand_offset=args.strand_offset)

    with open(args.out_csv, 'w') as gt_fp:

        def write_row(chrom, strand, site_pos, label):
            # one CSV record: chrom, strand symbol, position, label
            fields = (chrom, mh.int_strand_to_str(strand), site_pos, label)
            gt_fp.write(','.join(map(str, fields)) + '\n')

        for (chrom, strand), contig_cov in site_cov.items():
            for pos, n_reads in contig_cov.items():
                if cov_too_low(n_reads, args.coverage_threshold):
                    continue
                pct_mod = 100 * site_mod_cov[(chrom, strand)][pos] / n_reads
                # the low threshold is tested first, so it wins if both match
                if pct_mod <= args.pct_mod_thresholds[0]:
                    label = 'False'
                elif pct_mod >= args.pct_mod_thresholds[1]:
                    label = 'True'
                else:
                    continue
                write_row(chrom, strand, pos, label)
                if args.strand_offset is not None:
                    write_row(
                        chrom, strand, pos + args.strand_offset, label)


def cov_too_low(n_reads, threshold):
    """Return True when a site's coverage is strictly below the threshold."""
    return n_reads < threshold
def check_matching_attrs(ground_truth_bed, strand_offset, mod_db_fn,
                         target_mod_bases, limit=10000):
    """Check that a ground-truth bedmethyl file matches a mods database.

    Two checks are made: the first ``limit`` sites of the bedmethyl file
    must share at least one (contig, strand) key with the database, and
    every requested modified base must be present in the database. Finally
    the database is asked to confirm its data covering index exists.

    The database handle is always closed before returning or raising.

    Args:
        ground_truth_bed: Bedmethyl file to validate.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``. When ``None``
            database contigs are compared for strands ``1`` and ``-1``;
            otherwise they are compared with strand ``None``.
        mod_db_fn: Location of the mods database handed to ``mods.ModsDb``.
        target_mod_bases: Iterable of modified-base identifiers that must
            exist in the database.
        limit: Maximum number of bedmethyl sites to parse for the check.

    Raises:
        mh.MegaError: If no contig overlaps between the bedmethyl file and
            the database, or if a target modified base is missing.
    """
    mods_db = mods.ModsDb(mod_db_fn)
    # try/finally so the database is closed on the error paths as well
    try:
        db_strands = (1, -1) if strand_offset is None else (None, )
        db_chrms = {
            (chrm, strand)
            for _, chrm, _ in mods_db.iter_chrms()
            for strand in db_strands
        }
        # only total coverage keys are needed; modified coverage is unused
        cov, _ = mh.parse_bed_methyls(
            [ground_truth_bed, ],
            strand_offset,
            show_prog_bar=False,
            limit=limit)
        if not db_chrms.intersection(cov.keys()):
            LOGGER.error(
                ('Using first {} sites from {}, found zero overlapping ' +
                 'contig/chromosome names with the mod database.').format(
                     limit, ground_truth_bed))
            LOGGER.info('Database contigs/chromosomes: {}'.format(
                ', '.join(map(str, db_chrms))))
            LOGGER.info('BED methyl contigs/chromosomes: {}'.format(
                ', '.join(map(str, list(cov.keys())))))
            raise mh.MegaError('No overlapping contigs found.')

        db_mods = {mod_base for mod_base, _ in mods_db.get_mod_long_names()}
        for tmb in target_mod_bases:
            if tmb not in db_mods:
                raise mh.MegaError(
                    ('Target modified base, {}, not found in mods ' +
                     'database ({}).').format(tmb, ', '.join(db_mods)))
        mods_db.check_data_covering_index_exists()
    finally:
        mods_db.close()
def write_unsorted_merge(in_fns, out_fp, bar):
    """Merge bedmethyl inputs and write the records in sorted order.

    All inputs are loaded with ``mh.parse_bed_methyls``. Contigs are
    emitted in the order defined by ``mh.RefName``; within a contig,
    records are ordered by ``(position, strand)``.

    Args:
        in_fns: Bedmethyl inputs handed to ``mh.parse_bed_methyls``.
        out_fp: Writable text stream receiving one formatted
            ``mods.BEDMETHYL_TMPLT`` line per site.
        bar: Progress object; its ``update()`` method is called as output
            is produced.
    """
    cov, mod_cov = mh.parse_bed_methyls(in_fns)
    contig_names = {contig for contig, _ in cov}
    for ref_name in sorted(mh.RefName(contig) for contig in contig_names):
        # convert back to string after sorting
        chrm = str(ref_name)
        stranded_sites = []
        for strand in (1, -1):
            if (chrm, strand) in cov:
                stranded_sites.extend(
                    (pos, strand) for pos in cov[(chrm, strand)])
        stranded_sites.sort()
        for pos, strand in stranded_sites:
            site_key = (chrm, strand)
            pcov = cov[site_key][pos]
            pct_mod = np.around(mod_cov[site_key][pos] / pcov * 100, 1)
            record = mods.BEDMETHYL_TMPLT.format(
                chrom=chrm,
                pos=pos,
                end=pos + 1,
                strand=mh.int_strand_to_str(strand),
                cov=pcov,
                score=min(int(pcov), 1000),
                perc=pct_mod,
            )
            out_fp.write(record + "\n")
            # NOTE(review): progress is advanced once per written record;
            # confirm this matches the intended granularity of `bar`.
            bar.update()
def parse_mod_sample(bm_files, strand_offset, cov_thresh, samp_name):
    """Load one bedmethyl sample and select its sufficiently covered sites.

    Args:
        bm_files: Bedmethyl input(s) passed to ``mh.parse_bed_methyls``.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``.
        cov_thresh: Minimum coverage (inclusive) for a position to be kept
            as a test site.
        samp_name: Label used in the logged coverage summary.

    Returns:
        ``MOD_SAMPLE(cov, mod_cov, test_sites)`` where ``test_sites`` maps
        every key of ``mod_cov`` to the set of positions whose coverage is
        at least ``cov_thresh``.
    """
    site_cov, site_mod_cov = mh.parse_bed_methyls(
        bm_files, strand_offset=strand_offset)

    # flatten the per-contig mappings into one array of coverage values
    flat_cov = np.array([
        n_reads
        for contig_cov in site_cov.values()
        for n_reads in contig_cov.values()
    ])
    LOGGER.info('{} coverage median: {:.2f} mean: {:.2f}'.format(
        samp_name, np.median(flat_cov), np.mean(flat_cov)))

    test_sites = {
        ctg: {
            pos
            for pos, n_reads in site_cov[ctg].items()
            if n_reads >= cov_thresh
        }
        for ctg in site_mod_cov
    }
    return MOD_SAMPLE(site_cov, site_mod_cov, test_sites)