def parse_inputs(
        samp1_bm_fns, samp2_bm_fns, strand_offset, samp_names, valid_pos_fn,
        out_fp):
    """Parse bedmethyl files for two samples and report coverage statistics.

    Args:
        samp1_bm_fns: bedmethyl input passed to ``mh.parse_bed_methyls`` for
            the first sample.
        samp2_bm_fns: bedmethyl input passed to ``mh.parse_bed_methyls`` for
            the second sample.
        strand_offset: forwarded to ``mh.parse_bed_methyls``; when it is not
            ``None`` the valid positions are parsed with
            ``ignore_strand=True``.
        samp_names: indexable holding the labels of the first and second
            sample (items 0 and 1), used in the summary lines.
        valid_pos_fn: input for ``mh.parse_beds`` or ``None`` to skip the
            valid position restriction.
        out_fp: object with a ``write`` method receiving one summary line
            per sample.

    Returns:
        Tuple ``(samp1_cov, samp1_mod_cov, samp1_all_cov, samp2_cov,
        samp2_mod_cov, samp2_all_cov)`` where the ``*_all_cov`` entries are
        numpy arrays of every innermost value of the matching ``*_cov``
        nested mapping.
    """
    def flatten_cov(cov_by_ctg):
        # collapse the two-level mapping into one flat array of values
        flat = []
        for ctg_cov in cov_by_ctg.values():
            flat.extend(ctg_cov.values())
        return np.array(flat)

    # parse valid positions
    if valid_pos_fn is None:
        valid_pos = None
    else:
        valid_pos = mh.parse_beds(
            valid_pos_fn, ignore_strand=strand_offset is not None)

    # parse bed methyl files
    samp1_cov, samp1_mod_cov = mh.parse_bed_methyls(
        samp1_bm_fns, strand_offset=strand_offset, valid_pos=valid_pos)
    samp1_all_cov = flatten_cov(samp1_cov)
    samp2_cov, samp2_mod_cov = mh.parse_bed_methyls(
        samp2_bm_fns, strand_offset=strand_offset, valid_pos=valid_pos)
    samp2_all_cov = flatten_cov(samp2_cov)

    # one summary line per sample, written only after both parsed cleanly
    summary_fmt = '{} coverage median: {:.2f}   mean: {:.2f}  sd: {:.2f}\n'
    for samp_idx, all_cov in enumerate((samp1_all_cov, samp2_all_cov)):
        out_fp.write(summary_fmt.format(
            samp_names[samp_idx], np.median(all_cov), np.mean(all_cov),
            np.std(all_cov)))

    return (samp1_cov, samp1_mod_cov, samp1_all_cov,
            samp2_cov, samp2_mod_cov, samp2_all_cov)
def parse_inputs(samp1_bm_fns, samp2_bm_fns, strand_offset, samp_names,
                 valid_pos_fn, out_fp):
    """Load two bedmethyl samples and write their coverage summaries.

    Args:
        samp1_bm_fns: bedmethyl input for the first sample, handed to
            ``mh.parse_bed_methyls``.
        samp2_bm_fns: bedmethyl input for the second sample, handed to
            ``mh.parse_bed_methyls``.
        strand_offset: forwarded to ``mh.parse_bed_methyls``; a non-``None``
            value also makes the valid sites parse ignore strand.
        samp_names: indexable with the first and second sample labels at
            positions 0 and 1.
        valid_pos_fn: input for ``mh.parse_beds``; ``None`` disables the
            valid position restriction.
        out_fp: object with a ``write`` method; receives one summary line
            (median, mean and standard deviation of coverage) per sample.

    Returns:
        Six-item tuple: coverage, modified coverage and flattened coverage
        array for sample one, followed by the same three items for sample
        two.
    """
    # parse valid positions
    valid_pos = None
    if valid_pos_fn is not None:
        LOGGER.info("Parsing valid sites bed")
        ignore_strand = strand_offset is not None
        valid_pos = mh.parse_beds(
            valid_pos_fn, ignore_strand=ignore_strand, show_prog_bar=False)

    # parse bed methyl files, sample one first and then sample two
    LOGGER.info("Parsing bedmethyl files")
    parsed = []
    for bm_fns in (samp1_bm_fns, samp2_bm_fns):
        cov, mod_cov = mh.parse_bed_methyls(
            bm_fns,
            strand_offset=strand_offset,
            valid_pos=valid_pos,
            show_prog_bar=False,
        )
        all_cov = np.array([
            pos_cov for ctg_cov in cov.values()
            for pos_cov in ctg_cov.values()
        ])
        parsed.append((cov, mod_cov, all_cov))

    # summaries are emitted only once both samples have been parsed
    summary_fmt = "{} coverage median: {:.2f}   mean: {:.2f}  sd: {:.2f}\n"
    for samp_idx, (_, _, all_cov) in enumerate(parsed):
        out_fp.write(summary_fmt.format(
            samp_names[samp_idx],
            np.median(all_cov),
            np.mean(all_cov),
            np.std(all_cov),
        ))

    # concatenating the per-sample triples gives the six-item result
    return parsed[0] + parsed[1]
Пример #3
0
def parse_valid_sites(valid_sites_fns, gt_data_fn, include_strand):
    """Collect labelled site sets, optionally intersected with ground truth.

    Args:
        valid_sites_fns: iterable of ``(label, filename)`` pairs, or
            ``None`` when no valid sites files were supplied.
        gt_data_fn: ground truth input for ``mh.parse_ground_truth_file``,
            or ``None``.
        include_strand: when truthy, sites are ``(chrm, strand, pos)``
            tuples, otherwise ``(chrm, pos)``; also forwarded to the ground
            truth parser.

    Returns:
        ``(valid_sites, vs_labs, ctrl_sites)``:

        * ``(None, None, None)`` when neither input is given or no valid
          sites file could be read.
        * ``([gt_mod_pos], None, [gt_ctrl_pos])`` when only ground truth is
          given.
        * Otherwise one site set and one label per readable valid sites
          file. With ground truth each set is intersected with the modified
          ground truth sites and ``ctrl_sites`` holds the intersections
          with the control ground truth sites; without it ``ctrl_sites`` is
          ``None``.
    """
    have_gt = gt_data_fn is not None
    if valid_sites_fns is None and not have_gt:
        return None, None, None

    # ground truth is parsed first so it can be returned on its own
    if have_gt:
        LOGGER.info("Reading ground truth file")
        gt_mod_pos, gt_ctrl_pos = mh.parse_ground_truth_file(
            gt_data_fn, include_strand=include_strand)
        if valid_sites_fns is None:
            return [gt_mod_pos], None, [gt_ctrl_pos]

    # parse valid sites files and intersect with ground truth (if provided)
    LOGGER.info("Reading valid sites data")
    valid_sites = []
    vs_labs = []
    ctrl_sites = [] if have_gt else None
    for vs_lab, valid_sites_fn in valid_sites_fns:
        try:
            parsed_beds = mh.parse_beds([valid_sites_fn])
        except FileNotFoundError:
            # a missing file is reported and skipped, not treated as fatal
            LOGGER.warning(
                "Could not find valid sites file: {}".format(valid_sites_fn))
            continue

        # flatten {(chrm, strand): positions} into a set of site tuples
        file_sites = set()
        for (chrm, strand), cs_pos in parsed_beds.items():
            for pos in cs_pos:
                if include_strand:
                    file_sites.add((chrm, strand, pos))
                else:
                    file_sites.add((chrm, pos))

        if have_gt:
            ctrl_sites.append(file_sites.intersection(gt_ctrl_pos))
            valid_sites.append(file_sites.intersection(gt_mod_pos))
        else:
            valid_sites.append(file_sites)
        vs_labs.append(vs_lab)

    if not valid_sites:
        return None, None, None

    return valid_sites, vs_labs, ctrl_sites
Пример #4
0
def _best_f1_threshold(precision, recall, thresh):
    """Return the maximal F1 score and the threshold that achieves it.

    ``precision`` and ``recall`` are expected in the layout documented for
    ``sklearn.metrics.precision_recall_curve``: one more entry than
    ``thresh``, the extra final entry having no threshold of its own. F1 is
    evaluated only on the entries that do have a threshold so that the
    selected index is always valid for ``thresh``. Entries where
    ``precision + recall`` is not positive get an F1 of zero instead of
    being dropped, which keeps indices aligned with ``thresh``.
    """
    n_thresh = thresh.shape[0]
    prec = precision[:n_thresh]
    rec = recall[:n_thresh]
    prec_recall_sum = prec + rec
    all_f1 = np.zeros(n_thresh)
    # avoid 0 / 0; undefined F1 values stay at zero
    defined = prec_recall_sum > 0
    all_f1[defined] = (2 * prec[defined] * rec[defined] /
                       prec_recall_sum[defined])
    optim_idx = np.argmax(all_f1)
    return all_f1[optim_idx], thresh[optim_idx]


def compute_val_metrics(mod_samp,
                        ctrl_samp,
                        gt_data,
                        out_fp,
                        pdf_fp,
                        balance_classes,
                        ignore_strand,
                        samp_name='sample',
                        valid_pos_fn=None):
    """Write validation metrics and plots for per-site percent modified.

    Percent modified values (``100 * mod_cov / cov``) are gathered for
    modified and control sites, then the optimal F1 score and threshold,
    average precision and ROC AUC are written to ``out_fp`` and three
    figures (density, precision-recall, ROC) are saved to ``pdf_fp``.

    Args:
        mod_samp: sample with ``cov`` and ``mod_cov`` nested mappings
            (contig key, then position key), a ``test_sites`` mapping and a
            ``_replace`` method (namedtuple-like).
        ctrl_samp: control sample with the same attributes; only used when
            ``gt_data`` is ``None``.
        gt_data: ``None`` or a mapping from contig key to an iterable of
            ``(pos, is_mod)`` pairs. When given, both classes are drawn
            from ``mod_samp`` and sites missing from it are skipped.
        out_fp: object with a ``write`` method receiving the metrics text.
        pdf_fp: object with a ``savefig`` method (presumably a matplotlib
            ``PdfPages`` handle -- confirm against caller).
        balance_classes: when truthy, randomly down-sample the larger class
            (without replacement) to the size of the smaller one.
        ignore_strand: forwarded to ``mh.parse_beds`` for ``valid_pos_fn``.
        samp_name: label used in the metrics text, log messages and titles.
        valid_pos_fn: optional file restricting the test sites; only
            applied when ``gt_data`` is ``None``.

    Returns:
        None. When no sites are available a message is logged and nothing
        is written or plotted.
    """
    # extract ground truth either from mod and control samples or ground truth
    # data
    if gt_data is None:
        if valid_pos_fn is not None:
            valid_pos = mh.parse_beds([valid_pos_fn],
                                      ignore_strand=ignore_strand)
            # keep only test sites that are also valid positions; contigs
            # absent from the valid positions are dropped entirely
            mod_samp = mod_samp._replace(test_sites={
                ctg: valid_pos[ctg].intersection(ctg_sites)
                for ctg, ctg_sites in mod_samp.test_sites.items()
                if ctg in valid_pos
            })
            ctrl_samp = ctrl_samp._replace(test_sites={
                ctg: valid_pos[ctg].intersection(ctg_sites)
                for ctg, ctg_sites in ctrl_samp.test_sites.items()
                if ctg in valid_pos
            })
        mod_pct_mod = np.array([
            100 * mod_samp.mod_cov[ctg][pos] / mod_samp.cov[ctg][pos]
            for ctg, ctg_poss in mod_samp.test_sites.items()
            for pos in ctg_poss
        ])
        ctrl_pct_mod = np.array([
            100 * ctrl_samp.mod_cov[ctg][pos] / ctrl_samp.cov[ctg][pos]
            for ctg, ctg_poss in ctrl_samp.test_sites.items()
            for pos in ctg_poss
        ])
    else:
        mod_pct_mod, ctrl_pct_mod = [], []
        for ctg, pos_is_mod in gt_data.items():
            try:
                ctg_cov = mod_samp.cov[ctg]
                ctg_mod_cov = mod_samp.mod_cov[ctg]
            except KeyError:
                # contig has no coverage in this sample
                continue
            for pos, site_is_mod in pos_is_mod:
                try:
                    pos_cov = ctg_cov[pos]
                    pos_mod_cov = ctg_mod_cov[pos]
                except KeyError:
                    # ground truth site not covered in this sample
                    continue
                if site_is_mod:
                    mod_pct_mod.append(100 * pos_mod_cov / pos_cov)
                else:
                    ctrl_pct_mod.append(100 * pos_mod_cov / pos_cov)
        mod_pct_mod = np.array(mod_pct_mod)
        ctrl_pct_mod = np.array(ctrl_pct_mod)

    if balance_classes:
        if mod_pct_mod.shape[0] > ctrl_pct_mod.shape[0]:
            mod_pct_mod = np.random.choice(mod_pct_mod,
                                           ctrl_pct_mod.shape[0],
                                           replace=False)
        elif mod_pct_mod.shape[0] < ctrl_pct_mod.shape[0]:
            ctrl_pct_mod = np.random.choice(ctrl_pct_mod,
                                            mod_pct_mod.shape[0],
                                            replace=False)
    all_pct_mod = np.concatenate([mod_pct_mod, ctrl_pct_mod])
    if all_pct_mod.shape[0] == 0:
        LOGGER.info(
            'Skipping "{}". No valid sites available.'.format(samp_name))
        return
    # labels follow the concatenation order: modified sites, then controls
    is_mod = np.repeat((1, 0), (mod_pct_mod.shape[0], ctrl_pct_mod.shape[0]))

    precision, recall, thresh = precision_recall_curve(is_mod, all_pct_mod)
    optim_f1, optim_thresh = _best_f1_threshold(precision, recall, thresh)
    avg_prcn = average_precision_score(is_mod, all_pct_mod)

    fpr, tpr, _ = roc_curve(is_mod, all_pct_mod)
    roc_auc = auc(fpr, tpr)

    out_fp.write(
        MOD_VAL_METRICS_TMPLT.format(optim_f1, optim_thresh, avg_prcn, roc_auc,
                                     mod_pct_mod.shape[0],
                                     ctrl_pct_mod.shape[0], samp_name))

    LOGGER.info('Plotting {}'.format(samp_name))
    kde_series = ((mod_pct_mod, 'Yes'), (ctrl_pct_mod, 'No'))
    plt.figure(figsize=(11, 7))
    try:
        for pct_mod, label in kde_series:
            sns.kdeplot(pct_mod,
                        shade=True,
                        bw_adjust=MOD_BANDWIDTH,
                        gridsize=MOD_GRIDSIZE,
                        label=label)
    except AttributeError:
        # fall back to the `bw` keyword, presumably for seaborn releases
        # that do not accept `bw_adjust` -- confirm supported versions
        for pct_mod, label in kde_series:
            sns.kdeplot(pct_mod,
                        shade=True,
                        bw=MOD_BANDWIDTH2,
                        gridsize=MOD_GRIDSIZE,
                        label=label)
    plt.legend(prop={'size': 16}, title='Is Modified?')
    plt.xlabel('Percent Modified')
    plt.ylabel('Density')
    plt.title(samp_name)
    pdf_fp.savefig(bbox_inches='tight')
    plt.close()

    plt.figure(figsize=(8, 7))
    plt.step(recall, precision, where='post')
    plt.ylim([-0.05, 1.05])
    plt.xlim([-0.05, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(('{}   Precision-Recall curve: AP={:0.2f}').format(
        samp_name, avg_prcn))
    pdf_fp.savefig(bbox_inches='tight')
    plt.close()

    plt.figure(figsize=(8, 7))
    plt.plot(fpr, tpr)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(('{}   ROC curve: auc={:0.2f}').format(samp_name, roc_auc))
    pdf_fp.savefig(bbox_inches='tight')
    plt.close()