def parse_inputs(
        samp1_bm_fns, samp2_bm_fns, strand_offset, samp_names, valid_pos_fn,
        out_fp):
    """Parse two bedmethyl samples and report their coverage statistics.

    Args:
        samp1_bm_fns: Bedmethyl file(s) for the first sample, passed straight
            to ``mh.parse_bed_methyls``.
        samp2_bm_fns: Bedmethyl file(s) for the second sample.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``. When it is not
            ``None`` the valid-positions BED is parsed with
            ``ignore_strand=True``.
        samp_names: Indexable holding the two sample names used in the
            summary lines (index 0 and 1).
        valid_pos_fn: Valid-positions BED input for ``mh.parse_beds``, or
            ``None`` to apply no restriction.
        out_fp: Writable text handle that receives one summary line per
            sample.

    Returns:
        A 6-tuple ``(samp1_cov, samp1_mod_cov, samp1_all_cov, samp2_cov,
        samp2_mod_cov, samp2_all_cov)`` where each ``*_all_cov`` is a numpy
        array of every value found in the nested coverage mapping.
    """
    # parse valid positions (only when a file was supplied)
    if valid_pos_fn is None:
        valid_pos = None
    else:
        valid_pos = mh.parse_beds(
            valid_pos_fn, ignore_strand=strand_offset is not None)

    # parse bed methyl files; both samples are fully parsed before any
    # summary line is written
    parsed = []
    for bm_fns in (samp1_bm_fns, samp2_bm_fns):
        cov, mod_cov = mh.parse_bed_methyls(
            bm_fns, strand_offset=strand_offset, valid_pos=valid_pos)
        # flatten the two-level coverage mapping into a single list
        flat_cov = []
        for ctg_cov in cov.values():
            flat_cov.extend(ctg_cov.values())
        parsed.append((cov, mod_cov, np.array(flat_cov)))

    summary_tmplt = '{} coverage median: {:.2f}   mean: {:.2f}  sd: {:.2f}\n'
    for samp_idx, (_, _, all_cov) in enumerate(parsed):
        out_fp.write(summary_tmplt.format(
            samp_names[samp_idx], np.median(all_cov), np.mean(all_cov),
            np.std(all_cov)))

    # concatenate the two per-sample 3-tuples into the flat 6-tuple
    return parsed[0] + parsed[1]
def parse_inputs(samp1_bm_fns, samp2_bm_fns, strand_offset, samp_names,
                 valid_pos_fn, out_fp):
    """Parse two bedmethyl samples and write a coverage summary for each.

    Args:
        samp1_bm_fns: Bedmethyl file(s) for the first sample, passed straight
            to ``mh.parse_bed_methyls``.
        samp2_bm_fns: Bedmethyl file(s) for the second sample.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``. When it is not
            ``None`` the valid-sites BED is parsed with
            ``ignore_strand=True``.
        samp_names: Indexable holding the two sample names used in the
            summary lines (index 0 and 1).
        valid_pos_fn: Valid-sites BED input for ``mh.parse_beds``, or ``None``
            to apply no restriction.
        out_fp: Writable text handle that receives one summary line per
            sample.

    Returns:
        A 6-tuple ``(samp1_cov, samp1_mod_cov, samp1_all_cov, samp2_cov,
        samp2_mod_cov, samp2_all_cov)`` where each ``*_all_cov`` is a numpy
        array of every value found in the nested coverage mapping.
    """
    # parse valid positions
    valid_pos = None
    if valid_pos_fn is not None:
        LOGGER.info("Parsing valid sites bed")
        valid_pos = mh.parse_beds(
            valid_pos_fn,
            ignore_strand=(strand_offset is not None),
            show_prog_bar=False,
        )

    def load_sample(bm_fns):
        # parse one sample and flatten its nested coverage mapping
        cov, mod_cov = mh.parse_bed_methyls(
            bm_fns,
            strand_offset=strand_offset,
            valid_pos=valid_pos,
            show_prog_bar=False,
        )
        flat_cov = []
        for ctg_cov in cov.values():
            flat_cov.extend(ctg_cov.values())
        return cov, mod_cov, np.array(flat_cov)

    def write_summary(samp_name, all_cov):
        # one summary line per sample: median, mean and standard deviation
        out_fp.write(
            "{} coverage median: {:.2f}   mean: {:.2f}  sd: {:.2f}\n".format(
                samp_name,
                np.median(all_cov),
                np.mean(all_cov),
                np.std(all_cov),
            ))

    # parse bed methyl files (both samples before any summary is written)
    LOGGER.info("Parsing bedmethyl files")
    samp1_cov, samp1_mod_cov, samp1_all_cov = load_sample(samp1_bm_fns)
    samp2_cov, samp2_mod_cov, samp2_all_cov = load_sample(samp2_bm_fns)

    write_summary(samp_names[0], samp1_all_cov)
    write_summary(samp_names[1], samp2_all_cov)

    return (
        samp1_cov,
        samp1_mod_cov,
        samp1_all_cov,
        samp2_cov,
        samp2_mod_cov,
        samp2_all_cov,
    )
예제 #3
0
def _main(args):
    """Write a ground-truth CSV of confidently unmodified/modified sites.

    Reads ``args.bed_methyl_files`` via ``mh.parse_bed_methyls`` and, for each
    site with coverage of at least ``args.coverage_threshold``, computes the
    percent modified. Sites at or below ``args.pct_mod_thresholds[0]`` are
    written with the label ``False``; sites at or above
    ``args.pct_mod_thresholds[1]`` are written with the label ``True``; sites
    in between are skipped. Each row is ``chrom,strand,pos,label``.

    When ``args.strand_offset`` is not ``None`` a second row is written for
    every reported site at ``pos + args.strand_offset`` with the same strand
    string and label.

    Args:
        args: Namespace providing ``bed_methyl_files``, ``strand_offset``,
            ``out_csv``, ``coverage_threshold`` and ``pct_mod_thresholds``.
    """
    samp_cov, samp_mod_cov = mh.parse_bed_methyls(
        args.bed_methyl_files, strand_offset=args.strand_offset)
    with open(args.out_csv, 'w') as gt_fp:

        def write_site(chrom, strand, out_pos, label):
            # emit a single CSV row for one site
            fields = (chrom, mh.int_strand_to_str(strand), out_pos, label)
            gt_fp.write(','.join(str(field) for field in fields) + '\n')

        for ctg_key, ctg_cov in samp_cov.items():
            chrom, strand = ctg_key
            ctg_mod_cov = None
            for pos, cov in ctg_cov.items():
                # skip sites without enough coverage to call
                if cov < args.coverage_threshold:
                    continue
                if ctg_mod_cov is None:
                    # look up the contig's modified counts on first use only
                    ctg_mod_cov = samp_mod_cov[ctg_key]
                pct_mod = 100 * ctg_mod_cov[pos] / cov
                if pct_mod <= args.pct_mod_thresholds[0]:
                    label = 'False'
                elif pct_mod >= args.pct_mod_thresholds[1]:
                    label = 'True'
                else:
                    # ambiguous site; excluded from the ground truth
                    continue
                write_site(chrom, strand, pos, label)
                if args.strand_offset is not None:
                    write_site(
                        chrom, strand, pos + args.strand_offset, label)
예제 #4
0
def check_matching_attrs(ground_truth_bed,
                         strand_offset,
                         mod_db_fn,
                         target_mod_bases,
                         limit=10000):
    """Check that a ground-truth bedmethyl file is compatible with a mods DB.

    Two checks are made: that at least one contig/strand key parsed from the
    first ``limit`` sites of ``ground_truth_bed`` also occurs in the database,
    and that every requested target modified base is present in the
    database. Finally ``check_data_covering_index_exists`` is called on the
    database.

    The database handle is closed on every exit path, including when a check
    fails.

    Args:
        ground_truth_bed: Bedmethyl file to validate.
        strand_offset: Passed to ``mh.parse_bed_methyls``. When ``None`` the
            database contigs are paired with strands ``1`` and ``-1``;
            otherwise with a ``None`` strand.
        mod_db_fn: Path/handle used to open ``mods.ModsDb``.
        target_mod_bases: Iterable of modified-base identifiers that must be
            present in the database.
        limit: Forwarded to ``mh.parse_bed_methyls`` as ``limit`` (reported
            in the error message as the number of sites used).

    Raises:
        mh.MegaError: If no contig overlaps between the bedmethyl file and the
            database, or if a target modified base is missing from the
            database.
    """
    mods_db = mods.ModsDb(mod_db_fn)
    try:
        db_strands = (1, -1) if strand_offset is None else (None, )
        # iter_chrms yields 3-tuples; only the middle (name) field is used
        db_chrms = {(chrm, strand)
                    for _, chrm, _ in mods_db.iter_chrms()
                    for strand in db_strands}
        cov, mod_cov = mh.parse_bed_methyls(
            [ground_truth_bed],
            strand_offset,
            show_prog_bar=False,
            limit=limit)
        if not db_chrms.intersection(cov.keys()):
            LOGGER.error(
                ('Using first {} sites from {}, found zero overlapping ' +
                 'contig/chromosome names with the mod database.').format(
                     limit, ground_truth_bed))
            LOGGER.info('Database contigs/chromosomes: {}'.format(
                ', '.join(map(str, db_chrms))))
            LOGGER.info('BED methyl contigs/chromosomes: {}'.format(
                ', '.join(map(str, list(cov.keys())))))
            raise mh.MegaError('No overlapping contigs found.')
        db_mods = {mod_base for mod_base, _ in mods_db.get_mod_long_names()}
        for tmb in target_mod_bases:
            if tmb not in db_mods:
                raise mh.MegaError(
                    ('Target modified base, {}, not found in mods database ' +
                     '({}).').format(tmb, ', '.join(db_mods)))
        mods_db.check_data_covering_index_exists()
    finally:
        # always release the database, even when validation raises
        mods_db.close()
예제 #5
0
def write_unsorted_merge(in_fns, out_fp, bar):
    """Merge bedmethyl inputs and write one record per covered site.

    The inputs are parsed with ``mh.parse_bed_methyls``. Contigs are emitted
    in the order given by sorting ``mh.RefName`` wrappers of their names, and
    within a contig the sites from strands ``1`` and ``-1`` are interleaved
    in sorted ``(pos, strand)`` order.

    Args:
        in_fns: Bedmethyl inputs passed to ``mh.parse_bed_methyls``.
        out_fp: Writable text handle receiving the formatted records.
        bar: Progress object; ``bar.update()`` is called once per record.
    """
    cov, mod_cov = mh.parse_bed_methyls(in_fns)
    ctg_names = {ctg for ctg, _ in cov}
    for ref_name in sorted(mh.RefName(ctg) for ctg in ctg_names):
        # convert back to string after sorting
        chrm = str(ref_name)
        # gather sites from whichever strands are present for this contig
        stranded_sites = sorted(
            (pos, strand)
            for strand in (1, -1)
            if (chrm, strand) in cov
            for pos in cov[(chrm, strand)])
        for pos, strand in stranded_sites:
            ctg_key = (chrm, strand)
            site_cov = cov[ctg_key][pos]
            strand_str = mh.int_strand_to_str(strand)
            # score is the integer coverage capped at 1000
            score = min(int(site_cov), 1000)
            pct_mod = np.around(mod_cov[ctg_key][pos] / site_cov * 100, 1)
            record = mods.BEDMETHYL_TMPLT.format(
                chrom=chrm,
                pos=pos,
                end=pos + 1,
                strand=strand_str,
                cov=site_cov,
                score=score,
                perc=pct_mod,
            )
            out_fp.write(record + "\n")
            bar.update()
예제 #6
0
def parse_mod_sample(bm_files, strand_offset, cov_thresh, samp_name):
    """Parse one sample's bedmethyl files and select sites to test.

    Logs the median and mean coverage over all parsed sites, then keeps, for
    every contig key in the modified-coverage mapping, the positions whose
    coverage is at least ``cov_thresh``.

    Args:
        bm_files: Bedmethyl inputs passed to ``mh.parse_bed_methyls``.
        strand_offset: Forwarded to ``mh.parse_bed_methyls``.
        cov_thresh: Minimum coverage (inclusive) for a position to be kept.
        samp_name: Sample label used in the log message.

    Returns:
        ``MOD_SAMPLE(cov, mod_cov, test_sites)`` where ``test_sites`` maps
        each contig key to the set of positions passing the threshold.
    """
    cov, mod_cov = mh.parse_bed_methyls(bm_files, strand_offset=strand_offset)

    # flatten the nested coverage mapping for the summary statistics
    flat_cov = []
    for ctg_cov in cov.values():
        flat_cov.extend(ctg_cov.values())
    all_cov = np.array(flat_cov)
    LOGGER.info('{} coverage median: {:.2f}   mean: {:.2f}'.format(
        samp_name, np.median(all_cov), np.mean(all_cov)))

    test_sites = {
        ctg: {pos for pos, site_cov in cov[ctg].items()
              if site_cov >= cov_thresh}
        for ctg in mod_cov
    }
    return MOD_SAMPLE(cov, mod_cov, test_sites)