def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=False):
    """
    Calculate the correlation of the numbers in `col_num0` from the bed
    files in `fnames` at various lags. The lags are specified by distance.
    Partial autocorrelation may be calculated as well.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Arguments:
        fnames   -- iterable of file names; each is handed to `bediter`.
        lags     -- lag specification, forwarded untouched to
                    `_acf_by_chrom` together with each chromosome's rows.
        col_num0 -- column selector forwarded to `bediter`
                    (presumably a 0-based column index -- confirm there).
        partial  -- if True, every lag bin is correlated using only its own
                    x/y pairs. If False, the pairs of all bins processed
                    before it are included as well.
        simple   -- if True, the value stored per bin is the correlation
                    only; otherwise it is `(corr, n_pairs, p_value)`.
        mlog     -- if True, correlate -log10 of the values. Zeros are first
                    replaced by half of the smallest positive value so the
                    logarithm stays finite.

    Returns a list of `((lmin, lmax), value)` tuples sorted by lag bin.
    Bins without any pairs are reported on stderr and left out.

    NOTE: with `mlog`, a bin whose values contain no strictly positive
    number makes numpy's `min()` fail on an empty selection.
    """
    imap = get_map()

    arg_list = []  # chaining
    for fname in fnames:
        # groupby chromosome. The groupby() call is the outermost iterable
        # of the generator expression, so `fname` is bound right here.
        by_chrom = groupby(bediter(fname, col_num0), lambda a: a["chrom"])
        arg_list = chain(arg_list,
                         ((list(chromlist), lags)
                          for chrom, chromlist in by_chrom))

    # separated by chrom. need to merge later.
    unmerged_acfs = list(imap(_acf_by_chrom, arg_list))
    acfs = merge_acfs(unmerged_acfs)

    acf_res = {}
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # iterate over it backwards and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # add the inner layers as we move out.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            sys.stderr.write("no values found at lag: %i-%i. skipping\n"
                             % (lmin, lmax))
            continue

        cxs, cys = xs, ys
        if mlog:
            if not partial:
                # xs/ys keep accumulating raw values across lags, so the
                # transform must not overwrite them: otherwise the next
                # lag would stack raw values onto -log10 values.
                cxs, cys = xs.copy(), ys.copy()
            cxs[cxs == 0] = 0.5 * cxs[cxs > 0].min()
            cys[cys == 0] = 0.5 * cys[cys > 0].min()
            cxs, cys = -np.log10(cxs), -np.log10(cys)

        # NOTE: using pearson correlation, which assumes normality.
        # ss.spearmanr(cxs, cys) would be the rank-based alternative.
        _slope, _intercept, corr, p_val, _stderr = ss.linregress(cxs, cys)

        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
def acf(fnames, lags, col_num0, partial=True, simple=False, mlog=True):
    """
    Calculate the correlation of the numbers in `col_num0` from the bed
    files in `fnames` at various lags. The lags are specified by distance.
    Partial autocorrelation may be calculated as well.

    Since the bed files may be very large, this attempts to be as memory
    efficient as possible while still being very fast for a pure python
    implementation.

    Arguments:
        fnames   -- iterable of file names; each is handed to `bediter`.
        lags     -- lag specification, forwarded untouched to
                    `_acf_by_chrom` together with each chromosome's rows.
        col_num0 -- column selector forwarded to `bediter`
                    (presumably a 0-based column index -- confirm there).
        partial  -- if True, every lag bin is correlated using only its own
                    x/y pairs. If False, the pairs of all bins processed
                    before it are included as well.
        simple   -- if True, the value stored per bin is the correlation
                    only; otherwise it is `(corr, n_pairs, p_value)`.
        mlog     -- if True (the default), correlate -log10 of the values.
                    Zeros are first replaced by 1e-12 so the logarithm
                    stays finite.

    Returns a list of `((lmin, lmax), value)` tuples sorted by lag bin.
    Bins without any pairs are reported on stderr and left out.
    """
    imap = get_map()

    arg_list = []  # chaining
    for fname in fnames:
        # groupby chromosome. The groupby() call is the outermost iterable
        # of the generator expression, so `fname` is bound right here.
        by_chrom = groupby(bediter(fname, col_num0), lambda a: a["chrom"])
        arg_list = chain(arg_list,
                         ((list(chromlist), lags)
                          for chrom, chromlist in by_chrom))

    # separated by chrom. need to merge later.
    unmerged_acfs = list(imap(_acf_by_chrom, arg_list))
    acfs = merge_acfs(unmerged_acfs)

    acf_res = {}
    xs = np.array([], dtype='f')
    ys = np.array([], dtype='f')
    # iterate over it backwards and remove to reduce memory.
    while len(acfs):
        lmin, lmax, xys = acfs.pop()
        if partial:
            xs, ys = np.array(xys["x"]), np.array(xys["y"])
        else:
            # add the inner layers as we move out.
            xs = np.hstack((xs, xys["x"]))
            ys = np.hstack((ys, xys["y"]))
        if len(xs) == 0:
            sys.stderr.write("no values found at lag: %i-%i. skipping\n"
                             % (lmin, lmax))
            continue

        cxs, cys = xs, ys
        if mlog:
            if not partial:
                # xs/ys keep accumulating raw values across lags, so the
                # transform must not overwrite them: otherwise the next
                # lag would stack raw values onto -log10 values.
                cxs, cys = xs.copy(), ys.copy()
            cxs[cxs == 0] = 1e-12
            cys[cys == 0] = 1e-12
            cxs, cys = -np.log10(cxs), -np.log10(cys)

        # NOTE: using spearman rank correlation. ss.linregress(cxs, cys)
        # would give the pearson correlation, which assumes normality.
        corr, p_val = ss.spearmanr(cxs, cys)

        if simple:
            acf_res[(lmin, lmax)] = corr
        else:
            acf_res[(lmin, lmax)] = (corr, len(xs), p_val)
    return sorted(acf_res.items())
def adjust_pvals(fnames, col_num0, acfs, stringent=False):
    """
    Lazily yield every item produced by `_slk_chrom` for each chromosome
    of each file in `fnames`.

    Each file is read with `bediter`, split into per-chromosome runs on the
    "chrom" key, and every run is sent to `_slk_chrom` as the tuple
    `(rows, lag_max, acfs, stringent)`. The per-chromosome results are
    flattened into a single stream.

    Arguments:
        fnames    -- iterable of file names handed to `bediter`.
        col_num0  -- column selector forwarded to `bediter`.
        acfs      -- sequence whose last entry looks like
                     `((lmin, lmax), ...)`; its `lmax` becomes `lag_max`.
        stringent -- flag forwarded unchanged to `_slk_chrom`.
    """
    # second element of the key of the last entry in `acfs`.
    max_lag = acfs[-1][0][1]

    # parallelize if multiprocessing is installed.
    mapper = get_map()

    per_file_jobs = []
    for path in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        chrom_runs = groupby(bediter(path, col_num0, 9e-17),
                             itemgetter("chrom"))
        per_file_jobs.append((list(rows), max_lag, acfs, stringent)
                             for _chrom, rows in chrom_runs)

    jobs = chain.from_iterable(per_file_jobs)
    for item in chain.from_iterable(mapper(_slk_chrom, jobs)):
        yield item
def adjust_pvals(fnames, col_num0, acfs, z=True):
    """
    Lazily yield `(chrom, results)` pairs, one per chromosome of each file
    in `fnames`, as produced by `_slk_chrom`.

    Each file is read with `bediter`, split into per-chromosome runs on the
    "chrom" key, and every run is sent to `_slk_chrom` as the tuple
    `(rows, lag_max, acfs, z)`.

    Arguments:
        fnames   -- iterable of file names handed to `bediter`.
        col_num0 -- column selector forwarded to `bediter`.
        acfs     -- sequence whose last entry looks like
                    `((lmin, lmax), ...)`; its `lmax` becomes `lag_max`.
        z        -- flag forwarded unchanged to `_slk_chrom`.
    """
    # second element of the key of the last entry in `acfs`.
    max_lag = acfs[-1][0][1]

    # parallelize if multiprocessing is installed.
    mapper = get_map()

    per_file_jobs = []
    for path in fnames:
        # 9e-17 seems to be limit of precision for cholesky.
        # NOTE(review): the value actually passed below is 9e-117, which
        # does not match the 9e-17 quoted above -- confirm which one is
        # intended before touching it.
        chrom_runs = groupby(bediter(path, col_num0, 9e-117),
                             itemgetter("chrom"))
        per_file_jobs.append((list(rows), max_lag, acfs, z)
                             for _chrom, rows in chrom_runs)

    jobs = chain.from_iterable(per_file_jobs)
    for chrom_result in mapper(_slk_chrom, jobs):
        chrom, results = chrom_result
        yield chrom, results