예제 #1
0
def region_p(fpvals, fregions, col_num, nsims, step, mlog=False, z=False):
    # just use 2 for col_num, but dont need the p from regions.

    if(sum(1 for _ in open(fregions) if _[0] != "#") == 0):
        print >>sys.stderr, "no regions in %s" % (fregions, )
        sys.exit()

    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)
    region_info = _get_ps_in_regions(fregions, fpvals, col_num)

    acfs = _gen_acf(region_info, (fpvals,), col_num, step, mlog=mlog)
    process.join()
    total_coverage = total_coverage_sync.value

    # regions first and then create ACF for the longest one.
    print >>sys.stderr, "%i bases used as coverage for sidak correction" % \
                                (total_coverage)
    sample_distribution = np.array([b["p"] for b in bediter(fpvals,
                                                                col_num)])
    for region_line, region_len, prows in region_info:
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print >>sys.stderr,("bad region", region_line)
            continue

        # calculate the SLK for the region.

        if z:
            region_slk = z_score_combine(ps, sigma)
        else:
            region_slk = stouffer_liptak(ps, sigma)

        if not region_slk["OK"]:
            print >>sys.stderr, "problem with:", region_slk, ps

        slk_p = region_slk["p"]

        sidak_slk_p = sidak(slk_p, region_len, total_coverage)

        result = [region_line, slk_p, sidak_slk_p]

        # corroborate those with p-values < 0.1 by simulation
        #"""
        if nsims > 0:

            # adjust nsims so it's an adjusted p-value.
            q_nsims = int(0.5 + total_coverage / float(region_len))
            assert sample_distribution is not None
            # trim sigma because we may have trimmed the ps above.
            sim_p = sl_sim(sigma, ps, q_nsims, sample_distribution)
            result.append(sim_p)
        else:
            result.append("NA")
        #"""
        #result.append("NA")
        yield result
예제 #2
0
def region_p(fpvals, fregions, col_num, step, z=True):
    # just use 2 for col_num, but dont need the p from regions.

    tree = read_regions(fregions)
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)

    region_info = _get_ps_in_regions(tree, fpvals, col_num)

    acfs = _gen_acf(region_info, (fpvals, ), col_num, step)
    process.join()
    total_coverage = total_coverage_sync.value

    # regions first and then create ACF for the longest one.
    print("%i bases used as coverage for sidak correction" % \
                                (total_coverage), file=sys.stderr)
    sample_distribution = np.array([b["p"] for b in bediter(fpvals, col_num)])

    combine = z_score_combine if z else stouffer_liptak
    for region, prows in region_info.items():
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print("bad region", region, file=sys.stderr)
            continue

        # calculate the SLK for the region.
        region_slk = combine(ps, sigma)
        if not region_slk["OK"]:
            print("problem with:", region_slk, ps, file=sys.stderr)

        slk_p = region_slk["p"]

        sidak_slk_p = sidak(slk_p,
                            int(region[2]) - int(region[1]), total_coverage)

        result = ["\t".join(region), slk_p, sidak_slk_p, "NA"]
        yield result
예제 #3
0
def region_p(fpvals, fregions, col_num, step, z=True):
    # just use 2 for col_num, but dont need the p from regions.

    tree = read_regions(fregions)
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)

    region_info = _get_ps_in_regions(tree, fpvals, col_num)

    acfs = _gen_acf(region_info, (fpvals,), col_num, step)
    process.join()
    total_coverage = total_coverage_sync.value

    # regions first and then create ACF for the longest one.
    print >>sys.stderr, "%i bases used as coverage for sidak correction" % \
                                (total_coverage)
    sample_distribution = np.array([b["p"] for b in bediter(fpvals,
                                                                col_num)])

    combine = z_score_combine if z else stouffer_liptak
    for region, prows in region_info.iteritems():
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print >>sys.stderr,("bad region", region)
            continue

        # calculate the SLK for the region.
        region_slk = combine(ps, sigma)
        if not region_slk["OK"]:
            print >>sys.stderr, "problem with:", region_slk, ps

        slk_p = region_slk["p"]

        sidak_slk_p = sidak(slk_p, int(region[2]) - int(region[1]), total_coverage)

        result = ["\t".join(region), slk_p, sidak_slk_p, "NA"]
        yield result