def _qvality(fbed_file, col_num, col_null):
    """Yield (qvalue, PEP, raw_line) for every data row of `fbed_file`.

    Observed p-values come from column `col_num`, null scores from column
    `col_null`; both are handed to the external `qvality` estimator and the
    resulting triples are paired with the raw lines of the file.
    """
    from qvality import qvality
    observed = [row['p'] for row in bediter(fbed_file, col_num)]
    null_scores = [row['p'] for row in bediter(fbed_file, col_null)]
    stream = ts.nopen(fbed_file)
    drop_header(stream)
    # qvality yields (pval, PEP, qval) in input order; walk it in lockstep
    # with the file so each estimate is attached to its source line.
    paired = izip(qvality(observed, null_scores, r=None), stream)
    for (pval, pep, qval), line in paired:
        yield qval, pep, line
def _qvality(fbed_file, col_num, col_null):
    """Yield (qval, PEP, raw_line) for each data row of `fbed_file`.

    P-values are read from column `col_num`, null scores from `col_null`;
    both are passed to the external `qvality` estimator.
    """
    # NOTE(review): exact duplicate of the `_qvality` definition earlier in
    # this file; at import time this later definition shadows the earlier one.
    from qvality import qvality
    ps = [b['p'] for b in bediter(fbed_file, col_num)]
    nulls = [b['p'] for b in bediter(fbed_file, col_null)]
    fh = ts.nopen(fbed_file)
    drop_header(fh)
    # qvality yields (pval, pep, qval) in file order; pair with raw lines.
    for (pval, pep, qval), l in izip(qvality(ps, nulls, r=None), fh):
        yield qval, pep, l
def obs_fdr(fbed_file, col_num, col_null=None):
    """Yield (qvalue, raw_line) for every data row of `fbed_file`.

    The observed p-values come from column `col_num`. When `col_null` is
    given, the null distribution is read from that column; otherwise a
    uniform null (Benjamini-Hochberg style) is used.
    """
    pvals = [row['p'] for row in bediter(fbed_file, col_num)]
    if col_null is not None:
        nulls = [row['p'] for row in bediter(fbed_file, col_null)]
    else:
        # Benjamini-Hochberg: uniform expected quantiles 1/n, 2/n, ..., 1.
        n = len(pvals)
        nulls = np.arange(1, n + 1, dtype=np.float64) / float(n)
    stream = ts.nopen(fbed_file)
    drop_header(stream)
    for qval, line in izip(relative_fdr(pvals, nulls), stream):
        yield qval, line
def obs_fdr(fbed_file, col_num, col_null=None):
    """Yield (qvalue, raw_line) for each data row of `fbed_file`.

    Null p-values come from column `col_null` when given, otherwise a
    uniform (Benjamini-Hochberg) null is generated.
    """
    # NOTE(review): exact duplicate of the `obs_fdr` definition earlier in
    # this file; the later definition shadows the earlier one.
    ps = [b['p'] for b in bediter(fbed_file, col_num)]
    if col_null is None:
        # Benjamini-Hochberg: uniform expected quantiles 1/n .. 1.
        nulls = np.arange(1, len(ps) + 1, dtype=np.float64) / float(len(ps))
    else:
        nulls = [b['p'] for b in bediter(fbed_file, col_null)]
    fh = ts.nopen(fbed_file)
    drop_header(fh)
    for qval, l in izip(relative_fdr(ps, nulls), fh):
        yield qval, l
def _get_ps_in_regions(fregions, fpvals, col_num):
    """ find the pvalues associated with each region """
    # Single linear sweep: regions are sorted here by (chrom, start); the
    # p-value rows are assumed already sorted the same way -- TODO confirm
    # `fpvals` is sorted by (chrom, position).
    region_info = []
    # trailing None is a sentinel marking exhaustion of the p-value iterator.
    piter = chain(bediter(fpvals, col_num), [None])
    prow = piter.next()
    nr = 0
    for rchrom, rstart, rend, region_line in sorted(gen_regions(fregions),
                                                    key=itemgetter(0, 1)):
        prows = []
        nr += 1
        # grab the p-values in the bed file that are within the current region
        # NOTE(review): if `prow` is None when this condition is evaluated
        # (empty input, or p-values exhausted by a previous region), the
        # subscription raises TypeError.
        while (prow["chrom"] != rchrom or prow["start"] < rstart):
            #while (prow["chrom"] != rchrom or prow["end"] < rstart):
            prow = piter.next()
            if prow is None: break
        # collect rows while the row's (chrom, end) is still inside the region
        #while prow is not None and (rchrom, rend) > (prow["chrom"], prow["start"]):
        while prow is not None and (rchrom, rend) >= (prow["chrom"], prow["end"]):
            prows.append(prow)
            prow = piter.next()
            if prow is None: break
        if not prows:
            # region had no overlapping measurements; it is still recorded
            # below (with an empty list) so the count assertion holds.
            print >>sys.stderr, "missed,:", prows, (region_line)
        region_len = rend - rstart
        # store a copy so the `del` below cannot affect the saved list.
        region_info.append((region_line, region_len, prows[:]))
        del prows
    # every region seen must have produced exactly one entry.
    assert nr == len(region_info), (nr, len(region_info))
    return region_info
def region_p(fpvals, fregions, col_num, nsims, step, mlog=False, z=False):
    """Yield [region_line, slk_p, sidak_p, sim_p_or_NA] for each region.

    Combines the p-values (column `col_num` of `fpvals`) that fall inside
    each region of `fregions` with Stouffer-Liptak (or z-score combination
    when `z` is True), Sidak-corrects using the total covered bases, and
    optionally corroborates by simulation when `nsims` > 0.
    """
    # just use 2 for col_num, but dont need the p from regions.
    # bail out early when the regions file has no non-comment lines.
    if(sum(1 for _ in open(fregions) if _[0] != "#") == 0):
        print >>sys.stderr, "no regions in %s" % (fregions, )
        sys.exit()
    # total coverage is computed in a background process while the region
    # and ACF work proceeds; joined below before the value is read.
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)
    region_info = _get_ps_in_regions(fregions, fpvals, col_num)
    acfs = _gen_acf(region_info, (fpvals,), col_num, step, mlog=mlog)
    process.join()
    total_coverage = total_coverage_sync.value
    # regions first and then create ACF for the longest one.
    print >>sys.stderr, "%i bases used as coverage for sidak correction" % \
        (total_coverage)
    # empirical distribution used to draw simulated p-value sets below.
    sample_distribution = np.array([b["p"] for b in bediter(fpvals, col_num)])
    for region_line, region_len, prows in region_info:
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print >>sys.stderr,("bad region", region_line)
            continue
        # calculate the SLK for the region.
        if z:
            region_slk = z_score_combine(ps, sigma)
        else:
            region_slk = stouffer_liptak(ps, sigma)
        if not region_slk["OK"]:
            # combination reported a problem; the p-value is used anyway.
            print >>sys.stderr, "problem with:", region_slk, ps
        slk_p = region_slk["p"]
        sidak_slk_p = sidak(slk_p, region_len, total_coverage)
        result = [region_line, slk_p, sidak_slk_p]
        # corroborate those with p-values < 0.1 by simulation
        #"""
        if nsims > 0:
            # adjust nsims so it's an adjusted p-value.
            q_nsims = int(0.5 + total_coverage / float(region_len))
            assert sample_distribution is not None
            # trim sigma because we may have trimmed the ps above.
            sim_p = sl_sim(sigma, ps, q_nsims, sample_distribution)
            result.append(sim_p)
        else:
            result.append("NA")
        #"""
        #result.append("NA")
        yield result
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True): """ calculate the correlation of the numbers in `col_num0` from the bed files in `fnames` at various lags. The lags are specified by distance. Partial autocorrelation may be calculated as well. Since the bed files may be very large, this attempts to be as memory efficient as possible while still being very fast for a pure python implementation. """ # reversing allows optimization below. imap = get_map() arg_list = [] # chaining for fname in fnames: # groupby chromosome. arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \ chromlist in \ groupby(bediter(fname, col_num0), lambda a: a["chrom"]))) unmerged_acfs = [] # separated by chrom. need to merge later. for chrom_acf in imap(_acf_by_chrom, arg_list): unmerged_acfs.append(chrom_acf) acfs = merge_acfs(unmerged_acfs) acf_res = {} xs = np.array([], dtype='f') ys = np.array([], dtype='f') # iterate over it backwards and remove to reduce memory. while len(acfs): lmin, lmax, xys = acfs.pop() if partial: xs, ys = np.array(xys["x"]), np.array(xys["y"]) else: # add the inner layers as we move out. xs = np.hstack((xs, xys["x"])) ys = np.hstack((ys, xys["y"])) if len(xs) == 0: print >>sys.stderr, "no values found at lag: %i-%i. skipping" \ % (lmin, lmax) continue if mlog: xs[xs == 0] = 1e-12 ys[ys == 0] = 1e-12 xs, ys = -np.log10(xs), -np.log10(ys) #slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys) # NOTE: using pearson correlation, which assumes normality. # could switch to spearman as below. corr, p_val = ss.spearmanr(xs, ys) if simple: acf_res[(lmin, lmax)] = corr else: acf_res[(lmin, lmax)] = (corr, len(xs), p_val) return sorted(acf_res.items())
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=False): """ calculate the correlation of the numbers in `col_num0` from the bed files in `fnames` at various lags. The lags are specified by distance. Partial autocorrelation may be calculated as well. Since the bed files may be very large, this attempts to be as memory efficient as possible while still being very fast for a pure python implementation. """ # reversing allows optimization below. imap = get_map() arg_list = [] # chaining for fname in fnames: # groupby chromosome. arg_list = chain(arg_list, ((list(chromlist), lags) for chrom, \ chromlist in \ groupby(bediter(fname, col_num0), lambda a: a["chrom"]))) unmerged_acfs = [] # separated by chrom. need to merge later. for chrom_acf in imap(_acf_by_chrom, arg_list): unmerged_acfs.append(chrom_acf) acfs = merge_acfs(unmerged_acfs) acf_res = {} xs = np.array([], dtype='f') ys = np.array([], dtype='f') # iterate over it backwards and remove to reduce memory. while len(acfs): lmin, lmax, xys = acfs.pop() if partial: xs, ys = np.array(xys["x"]), np.array(xys["y"]) else: # add the inner layers as we move out. xs = np.hstack((xs, xys["x"])) ys = np.hstack((ys, xys["y"])) if len(xs) == 0: print >>sys.stderr, "no values found at lag: %i-%i. skipping" \ % (lmin, lmax) continue if mlog: xs[xs == 0] = 0.5 * xs[xs > 0].min() ys[ys == 0] = 0.5 * ys[ys > 0].min() xs, ys = -np.log10(xs), -np.log10(ys) slope, intercept, corr, p_val, stderr = ss.linregress(xs, ys) # NOTE: using pearson correlation, which assumes normality. # could switch to spearman as below. #corr, p_val = ss.spearmanr(xs, ys) if simple: acf_res[(lmin, lmax)] = corr else: acf_res[(lmin, lmax)] = (corr, len(xs), p_val) return sorted(acf_res.items())
def get_total_coverage(fpvals, col_num, out_val):
    """ Calculate total bases of coverage in `fpvals`.
    Used for the sidak correction.

    Writes the result to `out_val.value` (a shared/synchronized value so
    this can run in a separate process).
    """
    total_coverage = 0
    for key, chrom_iter in groupby(bediter(fpvals, col_num),
                                   itemgetter('chrom')):
        # Merge sorted half-open [start, end) intervals and sum their
        # lengths. This equals len(union of set(range(start, end))) that the
        # previous implementation built, but in O(n log n) time and O(n)
        # memory instead of one set entry per covered base.
        # Intervals with end <= start covered nothing before and are skipped.
        spans = sorted((f['start'], f['end']) for f in chrom_iter
                       if f['end'] > f['start'])
        cur_s = cur_e = None
        for s, e in spans:
            if cur_e is None or s > cur_e:
                # disjoint from the current run: flush and start a new one.
                if cur_e is not None:
                    total_coverage += cur_e - cur_s
                cur_s, cur_e = s, e
            elif e > cur_e:
                # overlapping/adjacent: extend the current run.
                cur_e = e
        if cur_e is not None:
            total_coverage += cur_e - cur_s
    out_val.value = total_coverage
def _get_ps_in_regions(tree, fpvals, col_num):
    """ find the pvalues associated with each region

    `tree` maps chromosome -> interval structure supporting
    .find((start, end)); each region's last element is the original field
    tuple used as the key. Returns defaultdict(region_tuple -> [rows]).
    """
    region_info = defaultdict(list)
    for row in bediter(fpvals, col_num):
        # every region overlapping this row's interval collects the row.
        for region in tree[row['chrom']].find((row['start'], row['end'])):
            # (removed: an unused `region_len` local was computed here)
            region_tup = tuple(region[-1])
            region_info[region_tup].append(row)
    # hoisted: the tree size was previously summed twice.
    n_tree_regions = sum(len(v) for v in tree.values())
    # regions without any overlapping measurement never enter region_info.
    assert n_tree_regions >= len(region_info)
    if n_tree_regions > len(region_info):
        sys.stderr.write("# note: not all regions contained measurements\n")
    return region_info
def _get_ps_in_regions(tree, fpvals, col_num):
    """ find the pvalues associated with each region

    Associates each p-value row from `fpvals` (column `col_num`) with every
    region in `tree` (chrom -> interval structure with .find()) that
    overlaps it. Returns defaultdict(region_tuple -> [rows]).
    """
    # NOTE(review): duplicate of the `_get_ps_in_regions` tree-based
    # definition earlier in this file.
    region_info = defaultdict(list)
    for row in bediter(fpvals, col_num):
        for region in tree[row['chrom']].find((row['start'], row['end'])):
            # (removed: an unused `region_len` local was computed here)
            region_tup = tuple(region[-1])
            region_info[region_tup].append(row)
    # compute the tree size once (was previously summed twice).
    n_regions = sum(len(v) for v in tree.values())
    assert n_regions >= len(region_info)
    if n_regions > len(region_info):
        sys.stderr.write("# note: not all regions contained measurements\n")
    return region_info
def adjust_pvals(fnames, col_num0, acfs, z=True): lag_max = acfs[-1][0][1] # parallelize if multiprocesing is installed. imap = get_map() arg_iter = [] for fname in fnames: # 9e-17 seems to be limit of precision for cholesky. arg_iter = chain(arg_iter, ((list(chromlist), lag_max, acfs, z) \ for key, chromlist in groupby(bediter(fname, col_num0, 9e-117), itemgetter("chrom")))) for chrom, results in imap(_slk_chrom, arg_iter): yield chrom, results
def stepsize(bed_files, col):
    """Estimate the probe spacing in `bed_files`.

    Returns the median distance between consecutive 'start' values within a
    chromosome, rounded up to the nearest 10.

    Raises ValueError when no consecutive-start distances can be collected
    (e.g. every chromosome has fewer than 3 rows).
    """
    D1 = []
    for bed_file in bed_files:
        for _, chromlist in groupby(bediter(bed_file, col),
                                    itemgetter('chrom')):
            L = list(chromlist)
            last_start = 0
            for i, ibed in enumerate(L):
                # rows must be sorted by start within each chromosome.
                assert ibed['start'] >= last_start
                # fix: last_start was never updated, so the assert above was
                # effectively a no-op.
                last_start = ibed['start']
                # look around ibed. nearest could be up or down-stream
                # fix: use >= so a 1-row chromosome cannot index L[1]
                # (previously raised IndexError); len >= 2 behaves as before.
                if i + 2 >= len(L):
                    break
                D1.append(L[i + 1]['start'] - ibed['start'])
    if not D1:
        # previously surfaced as int(nan) -> ValueError; fail clearly instead.
        raise ValueError("stepsize: no consecutive intervals found in input")
    # round up to the nearest 10
    return int(round(np.median(D1) + 5, -1))
def adjust_pvals(fnames, col_num0, acfs, stringent=False):
    """Yield SLK-adjusted p-value rows for every chromosome of every file
    in `fnames`, using the autocorrelations in `acfs`."""
    # the widest lag bound bounds how far each correction looks.
    lag_max = acfs[-1][0][1]
    # parallelize if multiprocesing is installed.
    mapper = get_map()
    per_file = []
    for fname in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        grouped = groupby(bediter(fname, col_num0, 9e-17),
                          itemgetter("chrom"))
        per_file.append((list(chromlist), lag_max, acfs, stringent)
                        for key, chromlist in grouped)
    for chrom_results in mapper(_slk_chrom, chain(*per_file)):
        for row in chrom_results:
            yield row
def stepsize(bed_files, col):
    """Estimate the step size between measurements in `bed_files`: the
    median distance between consecutive starts within a chromosome,
    rounded up to the nearest 10.

    Raises ValueError when no distances can be collected.
    """
    # NOTE(review): duplicate of the `stepsize` defined earlier in this file.
    diffs = []
    for bed_file in bed_files:
        for _, chromlist in groupby(bediter(bed_file, col),
                                    itemgetter('chrom')):
            rows = list(chromlist)
            last_start = 0
            for i, ibed in enumerate(rows):
                # input must be sorted by start within each chromosome.
                assert ibed['start'] >= last_start
                # fix: previously never updated, making the assert a no-op.
                last_start = ibed['start']
                # look around ibed. nearest could be up or down-stream
                # fix: >= guards 1-row chromosomes (previously IndexError).
                if i + 2 >= len(rows):
                    break
                diffs.append(rows[i + 1]['start'] - ibed['start'])
    if not diffs:
        # previously died later as int(nan) -> ValueError; be explicit.
        raise ValueError("stepsize: no consecutive intervals found in input")
    # round up to the nearest 10
    return int(round(np.median(diffs) + 5, -1))
def get_total_coverage(fpvals, col_num, step, out_val):
    """ Calculate total bases of coverage in `fpvals`.
    Used for the sidak correction.

    Zero-length features (start == end) are counted as covering one base.
    `step` is accepted for interface compatibility; only the commented-out
    alternative below would use it. The result is written to
    `out_val.value` (shared value; this may run in a separate process).
    """
    total_coverage = 0
    for key, chrom_iter in groupby(bediter(fpvals, col_num),
                                   itemgetter('chrom')):
        spans = []
        for feat in chrom_iter:
            s, e = feat['start'], feat['end']
            if s == e:
                e += 1  # zero-length feature still covers a single base
            #e = max(e, s + step)
            if e > s:  # e < s would have covered nothing before; skip it
                spans.append((s, e))
        # Merge the sorted half-open [s, e) intervals and sum their lengths:
        # equal to the per-base set union the previous implementation built,
        # but O(n log n) time / O(n) memory instead of a set entry per base.
        spans.sort()
        cur_s = cur_e = None
        for s, e in spans:
            if cur_e is None or s > cur_e:
                # disjoint: flush the finished run, start a new one.
                if cur_e is not None:
                    total_coverage += cur_e - cur_s
                cur_s, cur_e = s, e
            elif e > cur_e:
                # overlapping/adjacent: extend the run.
                cur_e = e
        if cur_e is not None:
            total_coverage += cur_e - cur_s
    out_val.value = total_coverage
def adjust_pvals(fnames, col_num0, acfs, stringent=False):
    """Yield SLK-adjusted p-value rows for every chromosome of each file
    in `fnames`, using the autocorrelations in `acfs`."""
    lag_max = acfs[-1][0][1]
    # use the shared helper -- consistent with the other adjust_pvals
    # definitions in this file -- instead of constructing a local Pool that
    # was never closed/joined (worker-process leak).
    imap = get_map()
    arg_iter = []
    for fname in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        arg_iter = chain(arg_iter, ((list(chromlist), lag_max, acfs, stringent)
            for key, chromlist in groupby(bediter(fname, col_num0, 9e-17),
                itemgetter("chrom"))))
    for results in imap(_slk_chrom, arg_iter):
        for r in results:
            yield r
def region_p(fpvals, fregions, col_num, step, z=True):
    """Yield [region_str, slk_p, sidak_p, "NA"] for each region with data.

    Combines the p-values (column `col_num` of `fpvals`) inside each region
    of `fregions` with z-score combination (default) or Stouffer-Liptak,
    then Sidak-corrects using the total covered bases.
    """
    # just use 2 for col_num, but dont need the p from regions.
    tree = read_regions(fregions)
    # total coverage runs in a background process while regions/ACFs are
    # prepared; joined below before the value is read.
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)
    region_info = _get_ps_in_regions(tree, fpvals, col_num)
    acfs = _gen_acf(region_info, (fpvals, ), col_num, step)
    process.join()
    total_coverage = total_coverage_sync.value
    print("%i bases used as coverage for sidak correction" % \
        (total_coverage), file=sys.stderr)
    # removed: `sample_distribution` was built here with a full extra pass
    # over `fpvals` but was never used in this version of the function.
    combine = z_score_combine if z else stouffer_liptak
    for region, prows in region_info.items():
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print("bad region", region, file=sys.stderr)
            continue
        # calculate the SLK for the region.
        region_slk = combine(ps, sigma)
        if not region_slk["OK"]:
            # combination reported a problem; the p-value is used anyway.
            print("problem with:", region_slk, ps, file=sys.stderr)
        slk_p = region_slk["p"]
        sidak_slk_p = sidak(slk_p, int(region[2]) - int(region[1]),
                            total_coverage)
        result = ["\t".join(region), slk_p, sidak_slk_p, "NA"]
        yield result
def region_p(fpvals, fregions, col_num, step, z=True):
    """Yield [region_str, slk_p, sidak_p, "NA"] for each region with data.

    Python 2 twin of the print-function version of this function earlier
    in the file (same logic; `iteritems` and `print >>` here).
    """
    # just use 2 for col_num, but dont need the p from regions.
    tree = read_regions(fregions)
    # coverage runs in a background process; joined before the value is read.
    process, total_coverage_sync = _get_total_coverage(fpvals, col_num, step)
    region_info = _get_ps_in_regions(tree, fpvals, col_num)
    acfs = _gen_acf(region_info, (fpvals,), col_num, step)
    process.join()
    total_coverage = total_coverage_sync.value
    # regions first and then create ACF for the longest one.
    print >>sys.stderr, "%i bases used as coverage for sidak correction" % \
        (total_coverage)
    # NOTE(review): `sample_distribution` is computed with a full extra pass
    # over `fpvals` but is never used in this version of the function.
    sample_distribution = np.array([b["p"] for b in bediter(fpvals, col_num)])
    combine = z_score_combine if z else stouffer_liptak
    for region, prows in region_info.iteritems():
        # gen_sigma expects a list of bed dicts.
        sigma = gen_sigma_matrix(prows, acfs)
        ps = np.array([prow["p"] for prow in prows])
        if ps.shape[0] == 0:
            print >>sys.stderr,("bad region", region)
            continue
        # calculate the SLK for the region.
        region_slk = combine(ps, sigma)
        if not region_slk["OK"]:
            # combination reported a problem; the p-value is used anyway.
            print >>sys.stderr, "problem with:", region_slk, ps
        slk_p = region_slk["p"]
        sidak_slk_p = sidak(slk_p, int(region[2]) - int(region[1]),
            total_coverage)
        result = ["\t".join(region), slk_p, sidak_slk_p, "NA"]
        yield result