def get_pvalue_nb(X, Lam, a, seed=123):
    # Randomized (mid) p-values for counts X under a negative binomial model
    # with mean Lam and size parameter a.
    np.random.seed(seed)
    (n, p) = X.shape
    probs = Lam / (a + Lam)
    C = np.random.uniform(size=(n, p))
    pval = C * nbinom.cdf(X - 1, a, 1 - probs) + (1 - C) * nbinom.cdf(X, a, 1 - probs)
    return pval
def plot_pval_nb(counts, Lam, a, seed=123, title="", outfile="", save=True):
    np.random.seed(seed)
    (n, p) = counts.shape
    probs = Lam / (a + Lam)
    C = np.random.uniform(size=(n, p))
    pval = C * nbinom.cdf(counts - 1, a, 1 - probs) + (1 - C) * nbinom.cdf(counts, a, 1 - probs)
    plt.hist(pval.flatten(), bins=np.linspace(0, 1, 100))
    plt.title(title)
    if save:
        plt.savefig(outfile)
    # else:
    #     plt.show()
    plt.close()
def getNBPValue(mean0, var0, mean1, lower=False, log=False):
    """
    Use the negative binomial distribution to calculate p-values.
    Reference:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html#scipy.stats.nbinom
    """
    from scipy.stats import nbinom
    n = len(mean0)
    nb_p = [mean0[i] / var0[i] for i in range(n)]  # consistent with R
    nb_n0 = [mean0[i] * mean0[i] / (var0[i] - mean0[i]) for i in range(n)]
    nb_n = [(lambda t: t if t >= 1 else 1)(x) for x in nb_n0]
    if lower == True:
        if log == False:
            nb_p_low = nbinom.cdf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.logcdf(mean1, nb_n, nb_p)
        return list(nb_p_low)
    else:
        if log == False:
            nb_p_low = nbinom.sf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.logsf(mean1, nb_n, nb_p)
        return list(nb_p_low)
def calc_coverage_threshold(cov_dict):
    '''
    calculate minimum coverage threshold for each key in cov_dict.
    see end of 'alternative parameterization' section of Negative binomial page
    and scipy negative binomial documentation for details of calculation.
    '''
    threshold_dict = {}
    for g in cov_dict:
        mean = float(cov_dict[g]['mean'])
        var = float(cov_dict[g]['variance'])
        q = (var - mean) / var
        n = mean ** 2 / (var - mean)
        p = 1 - q
        ## assert that I did the math correctly.
        assert(isclose(nbinom.mean(n, p), mean))
        assert(isclose(nbinom.var(n, p), var))
        ## find the integer threshold that includes ~95% of REL606 distribution,
        ## excluding 5% on the left hand side.
        my_threshold = nbinom.ppf(0.05, n, p)
        my_threshold_p = nbinom.cdf(my_threshold, n, p)
        threshold_dict[g] = {'threshold': str(my_threshold),
                             'threshold_p': str(my_threshold_p)}
    return threshold_dict
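A quick standalone sanity check (not part of the original function; the numbers are illustrative) of the mean/variance to (n, p) conversion used above, under scipy's parameterisation where mean = n(1-p)/p and var = n(1-p)/p**2:

from scipy.stats import nbinom

mean, var = 20.0, 50.0            # hypothetical per-gene coverage moments
n = mean ** 2 / (var - mean)      # size parameter
p = mean / var                    # success probability, equivalent to 1 - (var - mean) / var
assert abs(nbinom.mean(n, p) - mean) < 1e-9
assert abs(nbinom.var(n, p) - var) < 1e-9
print(nbinom.ppf(0.05, n, p))     # coverage threshold excluding the lower 5%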
def getloglikelihood3(kmat, mu_estimate, alpha, sumup=False, log=True):
    '''
    Get the log likelihood estimation of NB, using the current estimation of beta
    '''
    if kmat.shape[0] != mu_estimate.shape[0]:
        raise ValueError(
            'Count table dimension is not the same as mu vector dimension.')
    alpha = np.matrix(alpha).reshape(mu_estimate.shape[0], mu_estimate.shape[1])
    kmat_r = np.round(kmat)
    mu_sq = np.multiply(mu_estimate, mu_estimate)
    var_vec = mu_estimate + np.multiply(alpha, mu_sq)
    nb_p = np.divide(mu_estimate, var_vec)
    nb_r = np.divide(mu_sq, var_vec - mu_estimate)
    p = nbinom.cdf(kmat, nb_r, nb_p)
    p = np.where(p < 0.5, p, 1 - p)
    if log:
        # logp = nbinom.logcdf(kmat_r, nb_r, nb_p)
        logp = np.log(p)
    else:
        logp = p
        # logp = nbinom.cdf(kmat, nb_r, nb_p)
    # if np.isnan(np.sum(logp)):
    #     raise ValueError('nan values for log likelihood!')
    logp = np.where(np.isnan(logp), 0, logp)
    if sumup:
        return np.sum(logp)
    else:
        return logp
def threshold_n_binom(params, p_value, thresh_range=None):
    """
    Determine a p-value threshold for a composite negative binomial and
    lognormal distribution based only on the value of the negative binomial.

    :param tuple params: Tuple of parameters for a combined
        negative/binomial (see :func:`~n_binom_plus_log_normal`)
    :param float p_value: P-value cut-off
    :param list thresh_range: Possible values to consider as a cut
        off (default is 0-500).
    :returns: Position above which the integral of the negative
        binomial is equal to the P-value cut-off.
    """
    if thresh_range is None:
        thresh_range = list(range(500))
    bin_n, bin_p, nm_delta, nm_scale, size = params
    bin_mean, bin_var = nbinom.stats(bin_n, bin_p)
    cumulative_dist = nbinom.cdf(thresh_range, bin_n, bin_p)
    prob_dist = sum_to_1(un_cumulative(cumulative_dist))
    index = bisect_left(prob_dist[::-1], p_value)
    return thresh_range[::-1][index]
def nbinom_cdf(c, mu, alpha):
    """
    Re-parameterised scipy negative binomial

    :param c: observed count
    :param mu: expected count
    :param alpha: dispersion (alpha = 1/r)
    :return: cumulative probability
    """
    return nbinom.cdf(c, *convert_params(mu, alpha))
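convert_params is a helper defined elsewhere in the originating project; a minimal sketch of what such a conversion typically looks like, assuming the NB2 relationship var = mu + alpha * mu**2 (the name and body below are illustrative, not the project's own code):

def convert_params_sketch(mu, alpha):
    # Hypothetical stand-in for convert_params: map (mu, alpha) to scipy's (n, p),
    # assuming var = mu + alpha * mu**2, so n = 1 / alpha and p = n / (n + mu).
    n = 1.0 / alpha
    p = n / (n + mu)
    return n, p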
def nb_cpf(signal_vec):
    # Fit a negative binomial to the signal by the method of moments and
    # return upper-tail probabilities (1 - CDF) for each observation.
    sig_mean = np.mean(signal_vec)
    sig_var = np.var(signal_vec)
    sig_prob = sig_mean / sig_var
    # clip the probability parameter to avoid degenerate fits
    if sig_prob < 0.1:
        sig_prob = 0.1
    elif sig_prob > 0.9:
        sig_prob = 0.9
    sig_size = sig_mean * sig_prob / (1 - sig_prob)
    nbp = 1 - nbinom.cdf(signal_vec, sig_size, sig_prob)
    return nbp
def compute_pred_split(self, spec):
    tags = [self.sp, self.eg, self.bh, self.bd, self.ab, self.dam]
    params = []
    for arr in self.p_outs:
        params.append(np.mean(arr))
    denoms = []
    r_loc = []
    for ind in range(len(self.pred_data[spec][tags[0]])):
        denom = 1.
        for i in range(6):
            denom += self.pred_data[spec][tags[i]][ind] * params[i] / self.rescale_rat[i]
        r_loc.append(params[6] / denom)
        # if denoms > 1:
        #     denom = 1
        denoms.append((params[6] * (1 - params[7]) / params[7]) * 1. / denom)
    lls = nbinom.cdf(np.median(self.pred_data[spec][self.sc]), n=r_loc, p=params[7])
    ll0 = nbinom.cdf(np.median(self.pred_data[spec][self.sc]),
                     n=self.baseroot[0], p=self.baseroot[1])
    llall = np.sum(nbinom.logpmf(self.pred_data[spec][self.sc], n=r_loc, p=params[7]))
    llallnull = np.sum(nbinom.logpmf(self.pred_data[spec][self.sc],
                                     n=self.baseroot[0], p=self.baseroot[1]))
    return (denoms, lls, ll0, np.median(self.pred_data[spec][self.sc]), llall, llallnull)
async def challenge(
    self,
    bot,
    event: Message,
    successes: int,
    chance: str,
):
    if successes < SUCCESSES_MIN or successes > SUCCESSES_MAX:
        # "Please enter a success count between SUCCESSES_MIN and SUCCESSES_MAX."
        await bot.say(
            event.channel,
            f'성공횟수는 {SUCCESSES_MIN}회 이상,'
            f' {SUCCESSES_MAX:,}회 이하로 입력해주세요!',
        )
        return
    try:
        if chance.endswith('%'):
            p = Decimal(chance[:-1]) / 100
        else:
            p = Decimal(chance)
    except InvalidOperation:
        # "Please enter a valid probability."
        await bot.say(event.channel, '정상적인 확률을 입력해주세요!')
        return
    if p < CHANCE_MIN or p > CHANCE_MAX:
        # "The probability must be between CHANCE_MIN and CHANCE_MAX."
        await bot.say(
            event.channel,
            f'확률값은 {to_percent(CHANCE_MIN)}% 이상,'
            f' {to_percent(CHANCE_MAX)}% 이하로 입력해주세요!',
        )
        return
    if p / successes < CHANCE_MIN:
        # "The success count is too large for the given probability."
        await bot.say(event.channel, '입력하신 확률값에 비해 성공 횟수가 너무 많아요!')
        return
    counts = {
        int(math.ceil(nbinom.ppf(float(q), successes, float(p))))
        for q in filter(lambda x: x >= p, CHANCES + [p])
    }
    results = [
        (x, Decimal(str(nbinom.cdf(x, successes, float(p)))))
        for x in sorted(counts)
    ]
    # "- If you try {tries+successes:,} times, you will reach the target number
    #  of successes with {percent}% probability."
    text = '\n'.join(
        f'- {tries+successes:,}번 시도하시면 {to_percent(ch, D001)}% 확률로'
        f' 목표 횟수만큼 성공할 수 있어요!'
        for tries, ch in results
    )
    # "To succeed {successes:,} times at a {percent}% chance, here is how many
    #  attempts you will need!"
    await bot.say(
        event.channel,
        f'{to_percent(p)}% 확률의 도전을 {successes:,}번'
        f' 성공시키려면 몇 회의 도전이 필요한지 알려드릴게요!\n{text}',
    )
def plot_pval_nb_vs_counts(counts, log, Lam, a, seed=123, title="", outfile="", save=True):
    np.random.seed(seed)
    (n, p) = counts.shape
    probs = Lam / (a + Lam)
    C = np.random.uniform(size=(n, p))
    pval = C * nbinom.cdf(counts - 1, a, 1 - probs) + (1 - C) * nbinom.cdf(counts, a, 1 - probs)
    X = np.log10(counts + 1) if log else counts
    xlabel = "log10(counts + 1)" if log else "counts"
    plt.scatter(X.flatten(), pval.flatten())
    plt.xlabel(xlabel)
    plt.title(title)
    if save:
        plt.savefig(outfile)
    else:
        plt.show()
    plt.close()
def _ll_nbt(y, X, beta, alph, C=0):
    r'''
    Negative Binomial (truncated)

    Truncated densities for count models (Cameron & Trivedi, 2005, 680):

    .. math::

        f(y|\beta, y \geq C+1) = \frac{f(y|\beta)}{1-F(C|\beta)}
    '''
    Q = 0
    mu = np.exp(np.dot(X, beta))
    size = 1 / alph * mu ** Q
    prob = size / (size + mu)
    ll = nbinom.logpmf(y, size, prob) - np.log(1 - nbinom.cdf(C, size, prob))
    return ll
def cumulative_neg_binom(x, n, p):
    """
    Get the cumulative probability distribution for a negative binomial,
    over an array of points x that are log-scaled.

    :param x: Points at which to calculate probability density
    :type x: :class:`~numpy.ndarray`
    :param int n: Number of trials (see :data:`~scipy.stats.nbinom`)
    :param int p: Probability of success (see :data:`~scipy.stats.nbinom`)
    :returns: Cumulative probability distribution over x
    """
    # x is in log, so transform it back
    x = list(10 ** x[1:])
    # Add the point 0.0
    x = [0.0] + x
    return nbinom.cdf(x, n, p)
def negbinom_test(x, mu, theta, offset):
    """
    Test with the negative binomial distribution.

    Convert mu and theta to scipy parameters n and p:
        p = 1 / (theta * mu + 1)
        n = mu * p / (1 - p)

    Args:
        x (float): observed number of mutations (or gmean).
        mu (float): predicted number of mutations (mean of the negative
            binomial distribution).
        theta (float): dispersion parameter of the negative binomial
            distribution.

    Returns:
        float: upper-tail p-value from the NB CDF, pval = 1 - F(x).
    """
    if offset == 1:  # element with 0 bp
        return 1
    p = 1 / (theta * mu + 1)
    n = mu * p / (1 - p)
    pval = 1 - nbinom.cdf(x, n, p, loc=1)
    return pval
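A quick check (not from the original code; the numbers are illustrative) that the mu/theta conversion in the docstring yields the intended moments under scipy's parameterisation, assuming var = mu + theta * mu**2:

from scipy.stats import nbinom

mu, theta = 4.0, 0.3
p = 1 / (theta * mu + 1)
n = mu * p / (1 - p)                 # equals 1 / theta
assert abs(nbinom.mean(n, p) - mu) < 1e-9
assert abs(nbinom.var(n, p) - (mu + theta * mu ** 2)) < 1e-9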
def negbin_cdf(series):
    '''
    This function takes a np.array and returns a negative binomial CDF
    for overdispersed count data, i.e. the probability that x is less
    than or equal to each observed value.
    '''
    series = series.tolist()
    y = np.array([series])
    y = y.flatten()
    # create intercept to fit a model with intercept
    intercept = np.ones(len(y))
    # fit negative binomial
    m1 = sm.NegativeBinomial(y, intercept, loglike_method='nb2').fit()
    # retrieve mu
    mu = np.exp(m1.params[0])
    # retrieve alpha
    alpha = m1.params[1]
    # set Q to zero for nb2 method, Q to 1 for nb1 method
    Q = 0
    # derive size
    size = 1. / alpha * mu ** Q
    # derive prob
    prob = size / (size + mu)
    return nbinom.cdf(y, n=size, p=prob)
def _calculate_pvalues(x, r, p, mean, length):
    results = np.empty(length)
    for i in range(length):
        results[i] = nbinom.cdf(x[i], r[i], p[i])
    return results
def test_cdf(self):
    n, p = sm.distributions.zinegbin.convert_params(1, 0.9, 1)
    nbinom_cdf = nbinom.cdf(2, n, p)
    zinbinom_cdf = sm.distributions.zinegbin.cdf(2, 1, 0.9, 2, 0)
    assert_allclose(nbinom_cdf, zinbinom_cdf, rtol=1e-12, atol=1e-12)
def test_cdf_p2(self):
    n, p = sm.distributions.zinegbin.convert_params(30, 0.1, 2)
    nbinom_cdf = nbinom.cdf(10, n, p)
    zinbinom_cdf = sm.distributions.zinegbin.cdf(10, 30, 0.1, 2, 0)
    assert_allclose(nbinom_cdf, zinbinom_cdf, rtol=1e-12, atol=1e-12)
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(5)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(5)
fig.suptitle("Distribución Geometrica")
plt.show()

# NEGATIVE BINOMIAL DISTRIBUTION
from scipy.stats import nbinom

nbinom.pmf(k=5, n=2, p=0.1)
nbinom.pmf(k=5, n=2, p=0.1, loc=0)
nbinom.cdf(k=4, n=2, p=0.1)
1 - nbinom.cdf(k=4, n=2, p=0.1)
nbinom.rvs(n=2, p=0.1, size=100)
params = nbinom.stats(n=2, p=0.1, moments='mv')
'E(X) = {} y Var(X) = {}'.format(params[0], params[1])

n, p = 10, 0.25
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
fig = plt.figure(figsize=(5, 2.7))
ax = fig.add_subplot(1, 2, 1)
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label="nbinom pmf")
ax.vlines(x, 0, nbinom.pmf(x, n, p), color="b", lw=5, alpha=0.5)
def cdf(self, x: float) -> float:
    k = int(x)
    return float(nbinom.cdf(k, self.r, self.p))
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label='nbinom pmf')
ax.vlines(x, 0, nbinom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:
rv = nbinom(n, p)
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, label='frozen pmf')
ax.legend(loc='best', frameon=False)
plt.show()

# Check accuracy of ``cdf`` and ``ppf``:
prob = nbinom.cdf(x, n, p)
np.allclose(x, nbinom.ppf(prob, n, p))
# True

# Generate random numbers:
r = nbinom.rvs(n, p, size=1000)
def Cumulative_den_fun(self, x, n, p):
    # The original call was nbinom.cdf(x, p), which omits the required size
    # parameter; nbinom.cdf expects (k, n, p), so n is added to the signature.
    cdf = nbinom.cdf(x, n, p)
    return cdf
def cdf(self, k=0):
    return nbinom.cdf(k, self.r, self.p)
def _cdf(self, x, n, p):
    k = floor(x)
    if k == 0:
        return 0.0
    else:
        return (nbinom.cdf(x, n, p) - nbinom.pmf(0, n, p)) / nbinom.sf(0, n, p)
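The expression above follows from conditioning on X >= 1: F_trunc(x) = (F(x) - P(X=0)) / (1 - P(X=0)). A small standalone check of that identity (not part of the original class; parameters are illustrative):

import numpy as np
from scipy.stats import nbinom

n, p = 3, 0.4
x = 5
p0 = nbinom.pmf(0, n, p)
lhs = (nbinom.cdf(x, n, p) - p0) / (1 - p0)
# direct computation by renormalising the pmf over k >= 1
rhs = sum(nbinom.pmf(k, n, p) for k in range(1, x + 1)) / (1 - p0)
assert np.isclose(lhs, rhs)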
[m, p] = [5, 0.35]
N = [5, 10, 100, 1000, 10**5]
fig = plt.figure()
for n in N:
    print("n = ", n)
    for i in range(5):
        P = np.zeros(n)
        maxx = 0
        for iteration in range(n):
            p_it = 0
            counter = 0
            while counter < m:
                xi = np.random.rand()
                if xi <= p:
                    counter += 1
                else:
                    p_it += 1
            P[iteration] = p_it
            if np.max(P) > maxx:
                maxx = np.max(P)
        print(P)
        x, y = data_ECDF(P)
        plt.step([0, *x, 1.1 * x[-1]], [0, *y, 1],
                 label="NB ECDF{h}".format(h=i + 1), where='post')
    xx = range(maxx.astype(int))
    plt.step(xx, nbinom.cdf(xx, m, p), color='k', lw=3, label="NB CDF")
    plt.legend(loc='lower right', frameon=False)
    plt.show()
def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, minInputQuantile, mode=1):
    '''
    Args:
        prefix: prefix for the output
        bedFile: bin in BED format
        bctFile: fragment insert coverage in BCT format (made from procBam.py)
        covFile: covariates in TSV format
        bwFile: fragment coverage in BigWig format
        chromSize: chromosome sizes
        threshold: threshold to call peak
        minInputQuantile: minimum input coverage

    Returns:
        peak files (original peaks, merged peaks, and centered final peaks)
    '''
    print("[%s] Calling peaks" % (timestamp()))

    ### load data
    print("[%s] Loading fragment coverages, and covariates" % (timestamp()))
    bct = np.loadtxt(bctFile, ndmin=2)  # 0=input, 1=output, 2=normalized input
    cov = np.loadtxt(covFile, ndmin=2)  # 3:=cov

    ### scale covariates to have mean 0 and sd 1
    cov_scaled = preprocessing.scale(cov, axis=0)

    ### merge data
    mat = np.concatenate((bct[:, [1, 0, 2]], cov_scaled), axis=1)  # 0=output, 1=input, 2=normalized input, 3:=cov
    del bct, cov, cov_scaled

    ### non sliding bins
    nonSliding = np.zeros(mat.shape[0], dtype=bool)  ### initialize with False
    with open(bedFile, "r") as bed:
        lastchr, lastbin = "", 0
        for i, bin in enumerate(bed.readlines()):
            if bin.split("\t")[0] != lastchr:
                lastchr = bin.split("\t")[0]
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True
            elif int(bin.split("\t")[1]) >= lastbin:
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True

    ### remove bins with input count of zero (i.e., untested region) OR extreme values (top 1%, i.e., sequencing artifacts)
    # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.01)
    # maxInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.99)
    minInput = 0
    maxInput = np.quantile(mat[:, 1], 0.99)
    nonZeroInput = (mat[:, 1] > minInput) & (mat[:, 1] < maxInput)
    # minOutput = np.quantile(mat[:, 0], 0.01)
    # maxOutput = np.quantile(mat[:, 0], 0.99)
    # nonZeroInput = (mat[:, 1] > 0) & (mat[:, 0] > minOutput) & (mat[:, 0] < maxOutput)

    ### remove bins with normalized input count of zero (i.e., untested region) OR below "minimum threshold" defined by minInputQuantile
    # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], float(minInputQuantile))
    # print("[%s] Minimum Input Coverage: %f" % (timestamp(), minInput))
    # nonZeroInput = mat[:, 1] > minInput

    ### calculate fold change
    fc = np.zeros(mat.shape[0])
    fc[mat[:, 1] > 0] = mat[mat[:, 1] > 0, 0] / (mat[mat[:, 1] > 0, 2])

    minOutputThreshold = 0.9
    testOutput = mat[:, 0] > np.quantile(mat[:, 0], float(minOutputThreshold))
    # minFC = fc > 1

    ### filtering bins
    print("[%s] Before filtering: %s" % (timestamp(), mat.shape[0]))
    print("[%s] Removing %i bins with insufficient input coverage" % (timestamp(), sum(np.invert(nonZeroInput))))
    print("[%s] Bins with sufficient input coverage: %s" % (timestamp(), mat[nonZeroInput, :].shape[0]))
    print("[%s] Removing %i sliding bins" % (timestamp(), sum(np.invert(nonSliding))))
    print("[%s] Bins with non-sliding window: %s" % (timestamp(), mat[nonSliding, :].shape[0]))
    print("[%s] After filtering: %s" % (timestamp(), mat[nonZeroInput & nonSliding, :].shape[0]))

    ### mode 2 uses "input" as offset variable
    if int(mode) == 2:
        print("[%s] Running Mode 2" % (timestamp()))

        ### remove input
        mat = np.delete(mat, 1, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat.shape[1] - 1)]
        df = pd.DataFrame(mat[nonZeroInput & nonSliding, :], columns=["y", "exposure"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y", "exposure"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson(),
                         offset=np.log(df["exposure"])).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0),
                        offset=np.log(df["exposure"])).fit(start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % (
            timestamp(), mat[nonZeroInput & testOutput, :].shape[0]))
        df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y", "exposure"] + x)
        y_hat = model.predict(df, offset=np.log(df["exposure"]))

    ### mode 1 uses "input" as covariate (default):
    else:
        print("[%s] Running Mode 1" % (timestamp()))

        ### remove normalized input
        mat = np.delete(mat, 2, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat.shape[1])]
        df = pd.DataFrame(mat[nonZeroInput & nonSliding, :], columns=["y"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson()).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0)).fit(
            start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % (
            timestamp(), mat[nonZeroInput & testOutput, :].shape[0]))
        df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y"] + x)
        y_hat = model.predict(df)

    ### calculate P-value
    print("[%s] Calculating P-value" % (timestamp()))
    theta_hat = np.repeat(th, len(y_hat))
    prob = th / (th + y_hat)  ### prob=theta/(theta+mu)
    pval = 1 - nbinom.cdf(mat[nonZeroInput & testOutput, 0] - 1, n=theta_hat, p=prob)
    del mat

    ### multiple testing correction
    print("[%s] Multiple testing correction" % (timestamp()))
    _, pval_adj, _, _ = multi.multipletests(pval, method="fdr_bh")
    p_score = -np.log10(pval)
    q_score = -np.log10(pval_adj)

    ### output peak
    with open(prefix + ".peak.bed", "w") as out:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))):
                if pval[i] <= float(threshold):
                    out.write("%s\t%.3f\t%.3f\t%.3f\t%.5e\t%.5e\n" % (
                        bin.strip(), fc[nonZeroInput & testOutput][i], p_score[i], q_score[i],
                        pval[i], pval_adj[i]))

    ### output p-val track
    print("[%s] Generating P-value bedGraph" % (timestamp()))
    with open(prefix + ".pval.bdg", "w") as out:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))):
                out.write("%s\t%.3f\n" % (bin.strip(), abs(p_score[i])))
    del p_score, q_score, pval, pval_adj

    ### make bigWig track
    print("[%s] Making BigWig tracks" % (timestamp()))
    make_bigwig(prefix=prefix, bedFile=bedFile, bctFile=bctFile, chromSize=chromSize,
                bedGraphFile=prefix + ".pval.bdg")
    safe_remove(prefix + ".pval.bdg")

    ### merge peak
    print("[%s] Merge peaks" % (timestamp()))
    pybedtools.BedTool(prefix + ".peak.bed").merge(
        c=[4, 5, 6, 7, 8], o=["max", "max", "max", "min", "min"]).saveas(prefix + ".peak.merged.bed")

    ### center merged peak
    print("[%s] Finalizing peaks" % (timestamp()))
    center_peak(bwFile=bwFile, peakFile=prefix + ".peak.merged.bed",
                centeredPeakFile=prefix + ".peak.final.bed")

    print("[%s] Done" % (timestamp()))
def call_peak(prefix, bedFile, bctFile, chromSize, bwFile, covFile=None, threshold=0.05,
              mode=1, minCoverage=10, extQuantile=1e-5):
    '''
    calls peak

    Args:
        (Required)
        prefix: prefix for the output
        bedFile: bin in BED format
        bctFile: fragment insert coverage in BCT format (made from procBam.py)
        chromSize: chromosome sizes
        bwFile: fragment coverage in BigWig format

        (Optional)
        covFile: covariates in TSV format
        threshold: threshold to call peak
        mode: 1 - using input as covariate
              2 - using input as offset
        minCoverage: minimum coverage required for peak
        extQuantile: for removing genomic bins having extreme quantile (sequencing artifact)

    Returns:
        peak files (original peaks and center-merged final peaks)
        peak format: chr, start, end, foldChg, inputCt, outputCt, pval, qval
    '''
    print("[%s] Calling peaks" % (timestamp()))

    ### load data
    print("[%s] Loading fragment coverages" % (timestamp()))
    bct = np.loadtxt(bctFile, ndmin=2)  # BCT 0=input, 1=output, 2=normalized input

    if covFile:
        print("[%s] Loading covariates" % (timestamp()))
        cov = np.loadtxt(covFile, ndmin=2)  # COV 3:=cov

        ### scale covariates to have mean 0 and sd 1
        cov_scaled = preprocessing.scale(cov, axis=0)

        ### merge data
        mat = np.concatenate((bct[:, [1, 0, 2]], cov_scaled), axis=1)  # MAT 0=output, 1=input, 2=normalized input, 3:=cov
        del cov, cov_scaled
    else:
        print("[%s] Running without covariates" % (timestamp()))

        ### merge data
        mat = bct[:, [1, 0, 2]]  # MAT 0=output, 1=input, 2=normalized input
    del bct

    bct_o = mat[:, 0]
    bct_i = mat[:, 1]
    bct_n = mat[:, 2]

    ### non sliding bins
    nonSliding = np.zeros(mat.shape[0], dtype=bool)  ### initialize with False
    with open(bedFile, "r") as bed:
        lastchr, lastbin = "", 0
        for i, bin in enumerate(bed.readlines()):
            if bin.split("\t")[0] != lastchr:
                lastchr = bin.split("\t")[0]
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True
            elif int(bin.split("\t")[1]) >= lastbin:
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True

    ### remove bins with input count of zero (i.e., untested region) OR extreme values (top 0.001%, i.e., sequencing artifacts)
    minInput = minCoverage
    maxInput = np.quantile(bct_i[bct_i > 0], (1 - extQuantile))
    # print(minInput, maxInput)
    minOutput = minCoverage
    maxOutput = np.quantile(bct_o[bct_o > 0], (1 - extQuantile))
    # print(minOutput, maxOutput)

    ### calculate fold change
    fc = np.zeros(mat.shape[0])
    fc[bct_n > 0] = bct_o[bct_n > 0] / (bct_n[bct_n > 0])  ### fc = output / normalized_input

    ### train / test genomic bin ###
    trainingBin = (bct_i > minInput) & (bct_i < maxInput) & (bct_o > minOutput) & (bct_o < maxOutput) & nonSliding
    testingBin = (bct_i > minInput) & (bct_i < maxInput) & (fc > 1.5)  ### bins w/ FC > 1.5 are tested for statistical significance

    ### filtering bins
    print("[%s] Total genomic bins: %s" % (timestamp(), '{:,}'.format(mat.shape[0])))
    print("[%s] Total non-overlapping genomic bins: %s" % (timestamp(), '{:,}'.format(sum(nonSliding))))
    print("[%s] Removing bins with input counts larger than %s for training and testing" % (timestamp(), '{:,}'.format(maxInput)))
    print("[%s] Removing bins with output counts larger than %s for training" % (timestamp(), '{:,}'.format(maxOutput)))
    print("[%s] Total genomic bins used for training: %s" % (timestamp(), '{:,}'.format(mat[trainingBin, :].shape[0])))
    print("[%s] Total genomic bins used for testing: %s" % (timestamp(), '{:,}'.format(mat[testingBin, :].shape[0])))

    ### mode 2 uses "input" as offset variable
    if int(mode) == 2:
        print("[%s] Running Mode 2" % (timestamp()))

        ### remove input
        mat_model = np.delete(mat, 1, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat_model.shape[1] - 1)]
        df = pd.DataFrame(mat_model[trainingBin, :], columns=["y", "exposure"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y", "exposure"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson(),
                         offset=np.log(df["exposure"])).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat_model[trainingBin, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0),
                        offset=np.log(df["exposure"])).fit(start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat_model[trainingBin, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts" % (timestamp()))
        df = pd.DataFrame(mat_model[testingBin, :], columns=["y", "exposure"] + x)
        y_hat = model.predict(df, offset=np.log(df["exposure"]))

    ### mode 1 uses "input" as covariate (default):
    else:
        print("[%s] Running Mode 1" % (timestamp()))

        ### remove normalized input
        mat_model = np.delete(mat, 2, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat_model.shape[1])]
        df = pd.DataFrame(mat_model[trainingBin, :], columns=["y"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson()).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat_model[trainingBin, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df,
                        family=sm.families.NegativeBinomial(alpha=1 / th0)).fit(start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat_model[trainingBin, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts" % (timestamp()))
        df = pd.DataFrame(mat_model[testingBin, :], columns=["y"] + x)
        y_hat = model.predict(df)

    ### calculate P-value
    print("[%s] Calculating P-value" % (timestamp()))
    theta_hat = np.repeat(th, len(y_hat))
    prob = th / (th + y_hat)  ### prob=theta/(theta+mu)
    pval = 1 - nbinom.cdf(k=mat_model[testingBin, 0] - 1, n=theta_hat, p=prob)

    ### multiple testing correction
    print("[%s] Multiple testing correction" % (timestamp()))
    _, qval, _, _ = multi.multipletests(pval, method="fdr_bh")
    nlog10pval = -np.log10(pval)
    nlog10qval = -np.log10(qval)

    ### output initial peaks
    print("[%s] Output initial peaks" % (timestamp()))
    with open(prefix + ".peak.bed", "w") as out:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), testingBin))):
                if qval[i] <= float(threshold):
                    ### chr, start, end, foldChg, inputCov, outputCov, -log10(pval), -log10(qval)
                    out.write("%s\t%.3f\t%i\t%i\t%.3f\t%.3f\n" % (
                        bin.strip(), fc[testingBin][i], mat[testingBin, 1][i], mat[testingBin, 0][i],
                        nlog10pval[i], nlog10qval[i]))

    ### generate various bdg tracks
    print("[%s] Generating bedGraph files" % (timestamp()))
    with open(prefix + ".pval.bdg", "w") as fp, open(prefix + ".qval.bdg", "w") as fq, open(prefix + ".fc.bdg", "w") as ff:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(bed.readlines()):
                ff.write("%s\t%.3f\n" % (bin.strip(), fc[i]))
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), testingBin))):
                fp.write("%s\t%.3f\n" % (bin.strip(), nlog10pval[i]))
                fq.write("%s\t%.3f\n" % (bin.strip(), nlog10qval[i]))
    del mat, mat_model, nlog10pval, nlog10qval, pval, qval

    ### make bigWig track
    print("[%s] Making BigWig tracks" % (timestamp()))
    with open(bedFile) as bed:
        c1, s1, e1 = bed.readline().strip().split('\t')
        c2, s2, e2 = bed.readline().strip().split('\t')
        w = int(e1) - int(s1)
        s = int(s2) - int(s1)
    bdg2bw(bdgFile=prefix + ".fc.bdg", bwFile=prefix + ".fc.bw", chromSize=chromSize, window=w, step=s)
    safe_remove(prefix + ".fc.bdg")
    bdg2bw(bdgFile=prefix + ".pval.bdg", bwFile=prefix + ".pval.bw", chromSize=chromSize, window=w, step=s)
    safe_remove(prefix + ".pval.bdg")
    bdg2bw(bdgFile=prefix + ".qval.bdg", bwFile=prefix + ".qval.bw", chromSize=chromSize, window=w, step=s)
    safe_remove(prefix + ".qval.bdg")

    ### center peak
    print("[%s] Center peaks" % (timestamp()))
    center_peak(bwFile=bwFile, peakFile=prefix + ".peak.bed",
                centeredPeakFile=prefix + ".peak.centered.bed")

    ### merge peak
    print("[%s] Merge peaks" % (timestamp()))
    pybedtools.BedTool(prefix + ".peak.centered.bed").merge(
        c=[4, 6, 7, 8], o=["max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed")

    ### finalize peak
    print("[%s] Finalize peaks" % (timestamp()))
    peak = pd.read_csv(prefix + ".peak.centered.merged.bed", sep='\t', header=None)
    peak.columns = ['chr', 'start', 'end', 'fc', 'cov', 'nlog10pval', 'nlog10qval']
    peak['idx'] = peak.index
    peak['strand'] = "."
    peak['score'] = [min(int(round(x)), 1000) for x in peak['fc'] * 100]
    peak = peak.sort_values(by=['fc'], ascending=False)
    peak['name'] = ['peak_' + str(x) for x in peak.reset_index().index + 1]
    peak = peak.sort_values(by=['idx'])
    final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'cov', 'nlog10pval', 'nlog10qval']]
    final.to_csv(prefix + '.peak.final.bed', sep='\t', float_format='%.3f', index=False, header=False)

    ### remove intermediate peak files
    safe_remove(prefix + ".peak.centered.bed")
    safe_remove(prefix + ".peak.centered.merged.bed")

    print("[%s] Done" % (timestamp()))
def _cdf(self, x, mu, alpha, p, w):
    s, p = self.convert_params(mu, alpha, p)
    # construct cdf from standard negative binomial cdf
    # and the w inflation of zero
    return w + nbinom.cdf(x, s, p) * (1 - w)
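The zero-inflated CDF above mixes a point mass at zero (weight w) with a standard negative binomial. A quick standalone check of that mixture identity (not part of the original class; parameters are illustrative):

import numpy as np
from scipy.stats import nbinom

n, p, w = 4, 0.3, 0.2
x = 6
zi_cdf = w + (1 - w) * nbinom.cdf(x, n, p)
# direct mixture: P(X <= x) = w * 1 + (1 - w) * F_nb(x) for x >= 0
direct = w * 1.0 + (1 - w) * sum(nbinom.pmf(k, n, p) for k in range(x + 1))
assert np.isclose(zi_cdf, direct)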
def nbp_bg_adj(sample_sig_file, input_sig_file, outputname):
    ### read input files
    sample = read2d_array(sample_sig_file, float)
    background = read2d_array(input_sig_file, float)

    ### set threshold to ignore
    thesh = 0

    ### get sample mean & var
    sample_non0 = sample[sample > thesh]
    sample_mean = np.mean(sample_non0)
    sample_var = np.var(sample_non0)

    ### get negative binomial parameters from sample track regions
    sample_prob = sample_mean / sample_var
    if sample_prob < 0.1:
        sample_prob = 0.1
    if sample_prob >= 0.9:
        sample_prob = 0.9

    ### get size parameter for negative binomial distribution p-value (1st round)
    sample_size = sample_mean * sample_prob / (1 - sample_prob)

    ### get background mean & var
    background_non0 = background[background > thesh]
    bg_mean = np.mean(background_non0)
    bg_var = np.var(background_non0)
    print('check input track overdispersion in background regions, var/mean=' + str(round(bg_var / bg_mean, 3)))
    print(sample_prob)
    print(sample_size)
    print(len(background_non0))
    print(bg_mean)
    print(bg_var)

    ### 1st round negative binomial p-value
    i = 0
    nb_pval_list = np.empty((0,), float)
    print(nb_pval_list.shape)
    for sig in sample:
        if i % 10000 == 0:
            print(i)
        i = i + 1
        nb_pval_tmp = np.array(1 - nbinom.cdf(sig, sample_size, sample_prob, loc=0))
        nb_pval_list = np.concatenate((nb_pval_list, nb_pval_tmp))

    ############### second round
    ### get sample bg regions
    sample_bg = sample[nb_pval_list >= 0.001, ]
    sample_bg_non0 = sample_bg[sample_bg > thesh]
    sample_bg_mean = np.mean(sample_bg_non0)
    sample_bg_var = np.var(sample_bg_non0)
    print('check signal track overdispersion in background regions, var/mean=' + str(round(sample_bg_var / sample_bg_mean, 3)))
    print(sample_bg_mean)
    print(sample_bg_var)
    print(len(sample_bg_non0))

    ### get negative binomial parameters from signal track bg regions
    sample_bg_prob = sample_bg_mean / sample_bg_var
    if sample_bg_prob < 0.1:
        sample_bg_prob = 0.1
    if sample_bg_prob >= 0.9:
        sample_bg_prob = 0.9

    ### get size parameter for negative binomial distribution p-value (2nd round)
    sample_bg_size = sample_bg_mean * sample_bg_prob / (1 - sample_bg_prob)

    ### get background bg regions
    background_bg = background[nb_pval_list >= 0.001, ]
    background_bg_non0 = background_bg[background_bg > thesh]
    background_bg_mean = np.mean(background_bg_non0)
    background_bg_var = np.var(background_bg_non0)
    print('check input track overdispersion in background regions, var/mean=' + str(round(background_bg_var / background_bg_mean, 3)))
    # the following diagnostics originally referenced undefined names
    # (sig_bg_prob, sig_bg_size, input_bg_non0, input_bg_mean, inpy_bg_var);
    # print the corresponding defined variables instead
    print(sample_bg_prob)
    print(sample_bg_size)
    print(len(background_bg_non0))
    print(background_bg_mean)
    print(background_bg_var)

    ### 2nd round negative binomial p-value
    i = 0
    nb_pval_list = np.empty((0,), float)
    for sig, bg in zip(sample, background):
        if i % 100000 == 0:
            print(i)
        i = i + 1
        nb_pval_tmp = np.array(1 - nbinom.cdf(sig, sample_bg_size * (bg + 1) / (background_bg_mean + 1),
                                              sample_bg_prob, loc=0))
        nb_pval_list = np.concatenate((nb_pval_list, nb_pval_tmp))

    ### convert to np array
    nb_pval_list = -np.log10(nb_pval_list)
    nb_pval_list = nb_pval_list.reshape(nb_pval_list.shape[0], 1)

    ### write output
    write2d_array(nb_pval_list, outputname + '.nbp_2r.txt')

    ### info vector
    mvsp = open(outputname + '.mvsp.txt', 'w')
    mvsp.write(str(sample_bg_mean) + '\t')
    mvsp.write(str(sample_bg_var) + '\t')
    mvsp.write(str(sample_bg_size) + '\t')
    mvsp.write(str(sample_bg_prob) + '\n')
    mvsp.close()