Example #1
def get_pvalue_nb(X, Lam, a, seed=123):
    np.random.seed(seed)
    (n, p) = X.shape
    probs = Lam / (a + Lam)
    C = np.random.uniform(size=(n, p))
    pval = C * nbinom.cdf(X - 1, a, 1 - probs) + (1 - C) * nbinom.cdf(
        X, a, 1 - probs)
    return pval
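A quick way to sanity-check this randomized p-value construction (not part of the original example): draws from the matching NB null, X ~ NB(a, a / (a + Lam)), have mean Lam, so the resulting p-values should be approximately Uniform(0, 1). The sizes and values below are illustrative.

import numpy as np
from scipy.stats import nbinom

n_obs, p_dim, a = 2000, 3, 5.0
Lam = np.full((n_obs, p_dim), 10.0)
X = nbinom.rvs(a, a / (a + Lam))      # null draws with mean Lam
pv = get_pvalue_nb(X, Lam, a)
print(pv.mean(), pv.std())            # expect ~0.5 and ~0.289 (= sqrt(1/12))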
Example #2
def plot_pval_nb(counts, Lam, a, seed=123, title="", outfile="", save=True):
    np.random.seed(seed)
    (n, p) = counts.shape
    probs = Lam / (a + Lam)
    C = np.random.uniform(size=(n, p))
    pval = C * nbinom.cdf(counts - 1, a, 1 - probs) + (1 - C) * nbinom.cdf(
        counts, a, 1 - probs)
    plt.hist(pval.flatten(), bins=np.linspace(0, 1, 100))
    plt.title(title)
    if save:
        plt.savefig(outfile)
    else:
        plt.show()
    plt.close()
Example #3
def getNBPValue(mean0, var0, mean1, lower=False, log=False):
    """
  Use negative binomial to calculate p-value
  Reference:
  http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html#scipy.stats.nbinom
  """
    from scipy.stats import nbinom
    n = len(mean0)
    nb_p = [mean0[i] / var0[i] for i in range(n)]
    # consistent with R
    nb_n0 = [mean0[i] * mean0[i] / (var0[i] - mean0[i]) for i in range(n)]
    nb_n = [max(t, 1) for t in nb_n0]  # clamp the size parameter at a minimum of 1
    #
    if lower == True:
        if log == False:
            nb_p_low = nbinom.cdf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.logcdf(mean1, nb_n, nb_p)
        return list(nb_p_low)
    else:
        if log == False:
            nb_p_low = nbinom.sf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.logsf(mean1, nb_n, nb_p)
        return list(nb_p_low)
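The (mean, variance) parameterisation above maps to scipy's (n, p) with n = mean**2 / (var - mean) and p = mean / var; a quick illustrative check that the moments round-trip:

from scipy.stats import nbinom

m, v = 10.0, 25.0                            # requires var > mean (overdispersion)
n, p = m * m / (v - m), m / v
print(nbinom.mean(n, p), nbinom.var(n, p))   # -> 10.0 25.0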
Example #4
def calc_coverage_threshold(cov_dict):
    '''
    calculate minimum coverage threshold for each key in cov_dict.
    see end of 'alternative parameterization' section of Negative binomial page
    and scipy negative binomial documentation for details of calculation.
    '''
    threshold_dict = {}
    for g in cov_dict:
        mean = float(cov_dict[g]['mean'])
        var = float(cov_dict[g]['variance'])
        q = (var-mean)/var
        n = mean**2/(var-mean)
        p = 1 - q

        ## assert that I did the math correctly.
        assert(isclose(nbinom.mean(n,p), mean))
        assert(isclose(nbinom.var(n,p), var))

        ## find the integer threshold that includes ~95% of REL606 distribution,
        ## excluding 5% on the left hand side.
        my_threshold = nbinom.ppf(0.05,n,p)
        my_threshold_p = nbinom.cdf(my_threshold,n,p)
        threshold_dict[g] = {'threshold':str(my_threshold),
                             'threshold_p':str(my_threshold_p)}
    return threshold_dict
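Hypothetical usage, with the cov_dict layout inferred from the loop above (the function also assumes `from math import isclose` and `from scipy.stats import nbinom` are in scope):

from math import isclose
from scipy.stats import nbinom

cov_dict = {'genomeA': {'mean': 40.0, 'variance': 160.0}}
print(calc_coverage_threshold(cov_dict))
# prints the ~5% left-tail coverage cutoff and its exact CDF value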
Example #5
def getloglikelihood3(kmat, mu_estimate, alpha, sumup=False, log=True):
    '''
    Get the log likelihood estimation of NB, using the current estimation of beta
    '''
    if kmat.shape[0] != mu_estimate.shape[0]:
        raise ValueError(
            'Count table dimension is not the same as mu vector dimension.')
    alpha = np.matrix(alpha).reshape(mu_estimate.shape[0],
                                     mu_estimate.shape[1])
    mu_sq = np.multiply(mu_estimate, mu_estimate)
    var_vec = mu_estimate + np.multiply(alpha, mu_sq)
    nb_p = np.divide(mu_estimate, var_vec)
    nb_r = np.divide(mu_sq, var_vec - mu_estimate)
    p = nbinom.cdf(kmat, nb_r, nb_p)
    p = np.where(p < 0.5, p, 1 - p)
    if log:
        logp = np.log(p)
    else:
        logp = p
    if np.isnan(np.sum(logp)):
        # replace NaN entries instead of raising an error
        logp = np.where(np.isnan(logp), 0, logp)
    if sumup:
        return np.sum(logp)
    else:
        return logp
Example #7
def threshold_n_binom(params, p_value, thresh_range=None):
    """
    Determine a p-value threshold for a composite negative binomial
    and lognormal distribution based only on the value of the negative
    binomial.

    :param tuple params: Tuple of parameters for a combined \
            negative/binomial (see :func:`~n_binom_plus_log_normal`)
    :param float p_value: P-value cut-off
    :param list thresh_range: Possible values to consider as a cut \
            off (default is 0-500).
    :returns: Position above which the integral of the negative \
            binomial is equal to the P-value cut-off.
    """

    if thresh_range is None:
        thresh_range = list(range(500))

    bin_n, bin_p, nm_delta, nm_scale, size = params

    bin_mean, bin_var = nbinom.stats(bin_n, bin_p)

    cumulative_dist = nbinom.cdf(thresh_range, bin_n, bin_p)

    prob_dist = sum_to_1(un_cumulative(cumulative_dist))
    index = bisect_left(prob_dist[::-1], p_value)
    return thresh_range[::-1][index]
Example #8
File: hiczin.py Project: cerebis/HiCzin
def nbinom_cdf(c, mu, alpha):
    """
    Re-parameterised scipy negative binomial

    :param c: observed count
    :param mu: expected count
    :param alpha: dispersion (alpha = 1/r)
    :return: cumulative probability
    """
    return nbinom.cdf(c, *convert_params(mu, alpha))
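`convert_params` is not shown in this example; a minimal sketch consistent with the docstring (dispersion alpha = 1/r, mu the expected count) might look like:

def convert_params(mu, alpha):
    # map (mu, alpha) to scipy's (n, p); mean = n * (1 - p) / p = mu
    r = 1.0 / alpha
    p = r / (r + mu)
    return r, p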
Example #9
def nb_cpf(signal_vec):
	sig_mean = np.mean(signal_vec)
	sig_var = np.var(signal_vec)
	sig_prob = sig_mean / sig_var
	if sig_prob < 0.1:
		sig_prob = 0.1
	elif sig_prob > 0.9:
		sig_prob = 0.9
	sig_size = sig_mean * sig_prob / (1-sig_prob)
	nbp = 1-nbinom.cdf(signal_vec, sig_size, sig_prob)
	return nbp
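The moment-matching above sets prob = mean / var and size = mean * prob / (1 - prob), i.e. size = mean**2 / (var - mean); a quick illustrative check:

from scipy.stats import nbinom

m, v = 6.0, 15.0
prob = m / v
size = m * prob / (1 - prob)
print(nbinom.stats(size, prob, moments='mv'))   # -> mean 6.0, variance 15.0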
Example #10
    def compute_pred_split(self, spec):

        tags = [self.sp, self.eg, self.bh, self.bd, self.ab, self.dam]

        params = []

        for arr in self.p_outs:
            params.append(np.mean(arr))

        denoms = []
        r_loc = []

        for ind in range(len(self.pred_data[spec][tags[0]])):
            denom = 1.
            for i in range(6):
                denom += self.pred_data[spec][
                    tags[i]][ind] * params[i] / self.rescale_rat[i]
            r_loc.append(params[6] / denom)
            #if denoms > 1:
            #    denom = 1
            denoms.append(
                (params[6] * (1 - params[7]) / params[7]) * 1. / denom)

        lls = nbinom.cdf(np.median(self.pred_data[spec][self.sc]),
                         n=r_loc,
                         p=params[7])
        ll0 = nbinom.cdf(np.median(self.pred_data[spec][self.sc]),
                         n=self.baseroot[0],
                         p=self.baseroot[1])

        llall = np.sum(
            nbinom.logpmf(self.pred_data[spec][self.sc], n=r_loc, p=params[7]))
        llallnull = np.sum(
            nbinom.logpmf(self.pred_data[spec][self.sc],
                          n=self.baseroot[0],
                          p=self.baseroot[1]))

        return (denoms, lls, ll0, np.median(self.pred_data[spec][self.sc]),
                llall, llallnull)
Example #11
File: gacha.py Project: iCodeIN/yui
 async def challenge(
     self,
     bot,
     event: Message,
     successes: int,
     chance: str,
 ):
     if successes < SUCCESSES_MIN or successes > SUCCESSES_MAX:
         await bot.say(
             event.channel,
             f'성공횟수는 {SUCCESSES_MIN}회 이상,'
             f' {SUCCESSES_MAX:,}회 이하로 입력해주세요!',
         )
         return
     try:
         if chance.endswith('%'):
             p = Decimal(chance[:-1]) / 100
         else:
             p = Decimal(chance)
     except InvalidOperation:
         await bot.say(event.channel, '정상적인 확률을 입력해주세요!')
         return
     if p < CHANCE_MIN or p > CHANCE_MAX:
         await bot.say(
             event.channel,
             f'확률값은 {to_percent(CHANCE_MIN)}% 이상,'
             f' {to_percent(CHANCE_MAX)}% 이하로 입력해주세요!',
         )
         return
     if p / successes < CHANCE_MIN:
         await bot.say(event.channel, '입력하신 확률값에 비해 성공 횟수가 너무 많아요!')
         return
     counts = {
         int(math.ceil(nbinom.ppf(float(q), successes, float(p))))
         for q in filter(lambda x: x >= p, CHANCES + [p])
     }
     results = [
         (x, Decimal(str(nbinom.cdf(x, successes, float(p)))))
         for x in sorted(counts)
     ]
     text = '\n'.join(
         f'- {tries+successes:,}번 시도하시면 {to_percent(ch, D001)}% 확률로'
         f' 목표 횟수만큼 성공할 수 있어요!'
         for tries, ch in results
     )
     await bot.say(
         event.channel,
         f'{to_percent(p)}% 확률의 도전을 {successes:,}번'
         f' 성공시키려면 몇 회의 도전이 필요한지 알려드릴게요!\n{text}',
     )
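A side note on the scipy convention used above (illustrative): `nbinom` counts failures before the required number of successes, so the total number of attempts is failures + successes, which is why the message sums `tries + successes`.

from scipy.stats import nbinom

successes, p = 3, 0.25
failures = int(nbinom.ppf(0.95, successes, p))
print(failures + successes)   # attempts needed for a >= 95% chance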
Example #12
def plot_pval_nb_vs_counts(counts,
                           log,
                           Lam,
                           a,
                           seed=123,
                           title="",
                           outfile="",
                           save=True):
    np.random.seed(seed)
    (n, p) = counts.shape
    probs = Lam / (a + Lam)
    C = np.random.uniform(size=(n, p))
    pval = C * nbinom.cdf(counts - 1, a, 1 - probs) + (1 - C) * nbinom.cdf(
        counts, a, 1 - probs)
    X = np.log10(counts + 1) if log else counts
    xlabel = "log10(counts + 1)" if log else "counts"
    plt.scatter(X.flatten(), pval.flatten())
    plt.xlabel(xlabel)
    plt.title(title)
    if save:
        plt.savefig(outfile)
    else:
        plt.show()
    plt.close()
Example #13
def _ll_nbt(y, X, beta, alph, C=0):
    r'''
    Negative Binomial (truncated)

    Truncated densities for count models (Cameron & Trivedi, 2005, 680):

    .. math::

        f(y|\beta, y \geq C+1) = \frac{f(y|\beta)}{1-F(C|\beta)}
    '''
    Q = 0
    mu = np.exp(np.dot(X, beta))
    size = 1/alph*mu**Q
    prob = size/(size+mu)
    ll = nbinom.logpmf(y, size, prob) - np.log(1 - nbinom.cdf(C, size, prob))
    return ll
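An illustrative call with made-up shapes (intercept-only design, zero truncation at C = 0):

import numpy as np
from scipy.stats import nbinom

y = np.array([1, 2, 3, 5, 8])          # strictly positive counts
X = np.ones((len(y), 1))               # intercept-only design matrix
beta, alph = np.array([1.0]), 0.5
print(_ll_nbt(y, X, beta, alph, C=0))  # per-observation log-likelihoods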
Example #15
def cumulative_neg_binom(x, n, p):
    """
    Get the cumulative probability distribution for a negative
    binomial, over an array of points x that are log-scaled.

    :param x: Points at which to calculate probability density
    :type x: :class:`~numpy.ndarray`
    :param int n: Number of trials (see :data:`~scipy.stats.nbinom`)
    :param int p: Probability of success (see :data:`~scipy.stats.nbinom`)
    :returns: Cumulative probability distribution over x
    """

    # x is in log, so transform it back
    x = list(10**x[1:])

    # Add the point 0.0
    x = [0.0] + x

    return nbinom.cdf(x, n, p)
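Illustrative usage: x is expected in log10 space, as the body's 10**x back-transform shows.

import numpy as np
from scipy.stats import nbinom

x_log = np.linspace(0, 3, 50)          # log10 of the evaluation points
cdf = cumulative_neg_binom(x_log, 10, 0.25)
print(cdf[0], cdf[-1])                 # starts at F(0), approaches 1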
Example #16
def negbinom_test(x, mu, theta, offset):
    """ Test with negative binomial distribution

    Convert mu and theta to scipy parameters n and p:

    p = 1 / (theta * mu + 1)
    n = mu * p / (1 - p)

    Args:
        x (float): observed number of mutations (or gmean).
        mu (float): predicted number of mutations (mean of negative binomial distribution).
        theta (float): dispersion parameter of negative binomial distribution.
        offset (float): length offset; offset == 1 marks a 0 bp element, for
            which the test returns a p-value of 1.

    Returns:
        float: upper-tail p-value P(X >= x) from the NB CDF (computed with loc=1).

    """
    if offset == 1:  # element with 0 bp
        return 1
    p = 1 / (theta * mu + 1)
    n = mu * p / (1 - p)
    pval = 1 - nbinom.cdf(x, n, p, loc=1)
    return pval
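A quick check of the (mu, theta) -> (n, p) mapping used above (illustrative): the resulting distribution has mean mu and variance mu + theta * mu**2.

from scipy.stats import nbinom

mu, theta = 4.0, 0.5
p = 1 / (theta * mu + 1)
n = mu * p / (1 - p)
print(nbinom.stats(n, p, moments='mv'))   # -> mean 4.0, variance 12.0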
Example #17
def negbin_cdf(series):
    '''
    This function takes a np.array and returns a negative
    binomial CDF for overdispersed count data.
    # prob. that x is less than or equal to val.
    '''
    series = series.tolist()
    y = np.array([series])
    y = y.flatten()
    # create intercept to fit a model with intercept
    intercept = np.ones(len(y))
    # fit negative binomial
    m1 = sm.NegativeBinomial(y, intercept, loglike_method='nb2').fit()
    # retrieve mu
    mu = np.exp(m1.params[0])
    # retrieve alpha
    alpha = m1.params[1]
    # set Q to zero for nb2 method, Q to 1 for nb1 method
    Q = 0
    # derive size
    size = 1. / alpha * mu**Q
    # derive prob
    prob = size / (size + mu)
    return nbinom.cdf(y, n=size, p=prob)
Example #19
def _calculate_pvalues(x, r, p, mean, length):
    results = np.empty(length)
    for i in range(length):
        results[i] = nbinom.cdf(x[i], r[i], p[i])
    return results
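Since nbinom.cdf broadcasts over array arguments, the loop above can be replaced by a single vectorised call (equivalent output, shown for illustration):

import numpy as np
from scipy.stats import nbinom

x = np.array([3, 5, 7])
r = np.array([2.0, 2.5, 3.0])
p = np.array([0.4, 0.4, 0.4])
print(nbinom.cdf(x, r, p))   # same values the loop would produce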
Example #20
 def test_cdf(self):
     n, p = sm.distributions.zinegbin.convert_params(1, 0.9, 1)
     nbinom_cdf = nbinom.cdf(2, n, p)
     zinbinom_cdf = sm.distributions.zinegbin.cdf(2, 1, 0.9, 2, 0)
     assert_allclose(nbinom_cdf, zinbinom_cdf, rtol=1e-12, atol=1e-12)
Example #21
 def test_cdf_p2(self):
     n, p = sm.distributions.zinegbin.convert_params(30, 0.1, 2)
     nbinom_cdf = nbinom.cdf(10, n, p)
     zinbinom_cdf = sm.distributions.zinegbin.cdf(10, 30, 0.1, 2, 0)
     assert_allclose(nbinom_cdf, zinbinom_cdf, rtol=1e-12, atol=1e-12)
Example #22
for tick in ax.xaxis.get_major_ticks():
    tick.label.set_fontsize(5)
for tick in ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(5)
fig.suptitle("Distribución Geometrica")
plt.show()

# NEGATIVE BINOMIAL DISTRIBUTION

from scipy.stats import nbinom

nbinom.pmf(k=5, n=2, p=0.1)

nbinom.pmf(k=5, n=2, p=0.1, loc=0)

nbinom.cdf(k=4, n=2, p=0.1)

1 - nbinom.cdf(k=4, n=2, p=0.1)

nbinom.rvs(n=2, p=0.1, size=100)

params = nbinom.stats(n=2, p=0.1, moments='mv')

'E(X) = {} y Var(X) = {}'.format(params[0], params[1])

n, p = 10, 0.25
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
fig = plt.figure(figsize=(5, 2.7))
ax = fig.add_subplot(1, 2, 1)
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label="nbinom pmf")
ax.vlines(x, 0, nbinom.pmf(x, n, p), color="b", lw=5, alpha=0.5)
Example #23
 def cdf(self, x: float) -> float:
     k = int(x)
     return float(nbinom.cdf(k, self.r, self.p))
Example #24
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label='nbinom pmf')
ax.vlines(x, 0, nbinom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:

rv = nbinom(n, p)
ax.vlines(x,
          0,
          rv.pmf(x),
          colors='k',
          linestyles='-',
          lw=1,
          label='frozen pmf')
ax.legend(loc='best', frameon=False)
plt.show()

# Check accuracy of ``cdf`` and ``ppf``:

prob = nbinom.cdf(x, n, p)
np.allclose(x, nbinom.ppf(prob, n, p))
# True

# Generate random numbers:

r = nbinom.rvs(n, p, size=1000)
Example #25
 def Cumulative_den_fun(self, x, n, p):
     # nbinom.cdf needs both shape parameters (k, n, p); the original
     # two-argument call nbinom.cdf(x, p) would raise a TypeError
     cdf = nbinom.cdf(x, n, p)
     return cdf
Example #26
 def cdf(self, k=0):
     return nbinom.cdf(k, self.r, self.p)
Example #27
 def _cdf(self, x, n, p):
     k = floor(x)
     if k == 0:
         return 0.0
     else:
         return (nbinom.cdf(x, n, p) - nbinom.pmf(0, n, p)) / nbinom.sf(0, n, p) 
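A sanity check of the zero-truncation above (illustrative): removing the k = 0 mass and renormalising still yields a proper CDF that tends to 1 in the right tail.

from scipy.stats import nbinom

n, p = 3, 0.4
x = 200   # far in the right tail
print((nbinom.cdf(x, n, p) - nbinom.pmf(0, n, p)) / nbinom.sf(0, n, p))   # ~1.0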
Example #28
[m, p] = [5, 0.35]
N = [5, 10, 100, 1000, 10**5]
fig = plt.figure()

for n in N:
    print("n = ", n)
    maxx = 0
    for i in range(5):
        P = np.zeros(n)
        for iteration in range(n):
            p_it = 0
            counter = 0
            while counter < m:
                xi = np.random.rand()
                if xi <= p:
                    counter += 1
                else:
                    p_it += 1
            P[iteration] = p_it
        if np.max(P) > maxx:
            maxx = np.max(P)
        print(P)
        x, y = data_ECDF(P)
        plt.step([0, *x, 1.1 * x[-1]], [0, *y, 1],
                 label="NB ECDF{h}".format(h=i + 1),
                 where='post')
    xx = range(maxx.astype(int))
    plt.step(xx, nbinom.cdf(xx, m, p), color='k', lw=3, label="NB CDF")
    plt.legend(loc='lower right', frameon=False)
    plt.show()
Example #29
def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, minInputQuantile, mode=1):
    '''

    Args:
        prefix: prefix for the output
        bedFile: bin in BED format
        bctFile: fragment insert coverage in BCT format (made from procBam.py)
        covFile: covariates in TSV format
        bwFile: fragment coverage in BigWig format
        chromSize: chromosome sizes
        threshold: threshold to call peak
        minInputQuantile: minimum input coverage

    Returns:
        peak files (original peaks, merged peaks, and centered final peaks)

    '''
    print("[%s] Calling peaks" % (timestamp()))

    ### load data
    print("[%s] Loading fragment coverages, and covariates" % (timestamp()))
    bct = np.loadtxt(bctFile, ndmin=2)  # 0=input, 1=output, 2=normalized input
    cov = np.loadtxt(covFile, ndmin=2)  # 3:=cov

    ### scale covariates to have mean 0 and sd 1
    cov_scaled = preprocessing.scale(cov, axis=0)

    ### merge data
    mat = np.concatenate((bct[:, [1, 0, 2]], cov_scaled), axis=1)  # 0=output, 1=input, 2=normalized input, 3:=cov
    del bct, cov, cov_scaled

    ### non sliding bins
    nonSliding = np.zeros(mat.shape[0], dtype=bool)  ### initialize with False
    with open(bedFile, "r") as bed:
        lastchr, lastbin = "", 0
        for i, bin in enumerate(bed.readlines()):
            if bin.split("\t")[0] != lastchr:
                lastchr = bin.split("\t")[0]
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True
            elif int(bin.split("\t")[1]) >= lastbin:
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True

    ### remove bins with input count of zero (i.e., untested region) OR extreme values (top 1%, i.e., sequencing artifacts)
    # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.01)
    # maxInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.99)
    minInput = 0
    maxInput = np.quantile(mat[:, 1], 0.99)
    nonZeroInput = (mat[:, 1] > minInput) & (mat[:, 1] < maxInput)

    # minOutput = np.quantile(mat[:, 0], 0.01)
    # maxOutput = np.quantile(mat[:, 0], 0.99)
    # nonZeroInput = (mat[:, 1] > 0) & (mat[:, 0] > minOutput) & (mat[:, 0] < maxOutput)

    ### remove bins with normalized input count of zero (i.e., untested region) OR below "minimum threshold" defined by minInputQuantile
    # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], float(minInputQuantile))
    # print("[%s] Minimum Input Coverage: %f" % (timestamp(), minInput))
    # nonZeroInput = mat[:, 1] > minInput

    ### calculate fold change
    fc = np.zeros(mat.shape[0])
    fc[mat[:, 1] > 0] = mat[mat[:, 1] > 0, 0] / (mat[mat[:, 1] > 0, 2])

    minOutputThreshold = 0.9
    testOutput = mat[:, 0] > np.quantile(mat[:, 0], float(minOutputThreshold))
    # minFC = fc > 1

    ### filtering bins
    print("[%s] Before filtering: %s" % (timestamp(), mat.shape[0]))

    print("[%s] Removing %i bins with insufficient input coverage" % (timestamp(), sum(np.invert(nonZeroInput))))
    print("[%s] Bins with sufficient input coverage: %s" % (timestamp(), mat[nonZeroInput, :].shape[0]))

    print("[%s] Removing %i sliding bins" % (timestamp(), sum(np.invert(nonSliding))))
    print("[%s] Bins with non-sliding window: %s" % (timestamp(), mat[nonSliding, :].shape[0]))

    print("[%s] After filtering: %s" % (timestamp(), mat[nonZeroInput & nonSliding, :].shape[0]))

    ### mode 2 uses "input" as offset variable
    if int(mode) == 2:
        print("[%s] Running Mode 2" % (timestamp()))
        ### remove input
        mat = np.delete(mat, 1, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat.shape[1] - 1)]
        df = pd.DataFrame(mat[nonZeroInput & nonSliding, :], columns=["y", "exposure"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y", "exposure"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson(), offset=np.log(df["exposure"])).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0),
                        offset=np.log(df["exposure"])).fit(start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % (
            timestamp(), mat[nonZeroInput & testOutput, :].shape[0]))
        df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y", "exposure"] + x)
        y_hat = model.predict(df, offset=np.log(df["exposure"]))

    ### mode 1 uses "input" as covariate (default):
    else:
        print("[%s] Running Mode 1" % (timestamp()))
        ### remove normalized input
        mat = np.delete(mat, 2, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat.shape[1])]
        df = pd.DataFrame(mat[nonZeroInput & nonSliding, :], columns=["y"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson()).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0)).fit(
            start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat[nonZeroInput & nonSliding, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % (
            timestamp(), mat[nonZeroInput & testOutput, :].shape[0]))
        df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y"] + x)
        y_hat = model.predict(df)

    ### calculate P-value
    print("[%s] Calculating P-value" % (timestamp()))
    theta_hat = np.repeat(th, len(y_hat))
    prob = th / (th + y_hat)  ### prob=theta/(theta+mu)
    pval = 1 - nbinom.cdf(mat[nonZeroInput & testOutput, 0] - 1, n=theta_hat, p=prob)
    del mat

    ### multiple testing correction
    print("[%s] Multiple testing correction" % (timestamp()))
    _, pval_adj, _, _ = multi.multipletests(pval, method="fdr_bh")

    p_score = -np.log10(pval)
    q_score = -np.log10(pval_adj)

    ### output peak
    with open(prefix + ".peak.bed", "w") as out:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))):
                if pval[i] <= float(threshold):
                    out.write("%s\t%.3f\t%.3f\t%.3f\t%.5e\t%.5e\n" % (
                        bin.strip(), fc[nonZeroInput & testOutput][i], p_score[i], q_score[i], pval[i], pval_adj[i]))

    ### output p-val track
    print("[%s] Generating P-value bedGraph" % (timestamp()))
    with open(prefix + ".pval.bdg", "w") as out:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))):
                out.write("%s\t%.3f\n" % (bin.strip(), abs(p_score[i])))
    del p_score, q_score, pval, pval_adj

    ### make bigWig track
    print("[%s] Making BigWig tracks" % (timestamp()))
    make_bigwig(prefix=prefix, bedFile=bedFile, bctFile=bctFile, chromSize=chromSize, bedGraphFile=prefix + ".pval.bdg")
    safe_remove(prefix + ".pval.bdg")

    ### merge peak
    print("[%s] Merge peaks" % (timestamp()))
    pybedtools.BedTool(prefix + ".peak.bed").merge(c=[4, 5, 6, 7, 8], o=["max", "max", "max", "min", "min"]).saveas(
        prefix + ".peak.merged.bed")

    ### center merged peak
    print("[%s] Finalizing peaks" % (timestamp()))
    center_peak(bwFile=bwFile,
                peakFile=prefix + ".peak.merged.bed",
                centeredPeakFile=prefix + ".peak.final.bed")

    print("[%s] Done" % (timestamp()))
Example #30
def call_peak(prefix, bedFile, bctFile, chromSize, bwFile, covFile=None, threshold=0.05, mode=1, minCoverage=10, extQuantile=1e-5):
    '''

    calls peak

    Args:
        (Required)
        prefix: prefix for the output
        bedFile: bin in BED format
        bctFile: fragment insert coverage in BCT format (made from procBam.py)
        chromSize: chromosome sizes
        bwFile: fragment coverage in BigWig format

        (Optional)
        covFile: covariates in TSV format
        threshold: threshold to call peak
        mode: 1 - using input as covariate 2 - using input as offset
        minCoverage: minimum coverage required for peak
        extQuantile: for removing genomic bins having extreme quantile (sequencing artifact)

    Returns:
        peak files (original peaks and center-merged final peaks)
        peak format: chr, start, end, foldChg, inputCt, outputCt, pval, qval

    '''
    print("[%s] Calling peaks" % (timestamp()))

    ### load data
    print("[%s] Loading fragment coverages" % (timestamp()))
    bct = np.loadtxt(bctFile, ndmin=2)  # BCT 0=input, 1=output, 2=normalized input

    if covFile:
        print("[%s] Loading covariates" % (timestamp()))
        cov = np.loadtxt(covFile, ndmin=2)  # COV 3:=cov

        ### scale covariates to have mean 0 and sd 1
        cov_scaled = preprocessing.scale(cov, axis=0)

        ### merge data
        mat = np.concatenate((bct[:, [1, 0, 2]], cov_scaled), axis=1)  # MAT 0=output, 1=input, 2=normalized input, 3:=cov

        del cov, cov_scaled
    else:
        print("[%s] Running without covariates" % (timestamp()))
        ### merge data
        mat = bct[:, [1, 0, 2]]  # MAT 0=output, 1=input, 2=normalized input

    del bct

    bct_o = mat[:, 0]
    bct_i = mat[:, 1]
    bct_n = mat[:, 2]

    ### non sliding bins
    nonSliding = np.zeros(mat.shape[0], dtype=bool)  ### initialize with False
    with open(bedFile, "r") as bed:
        lastchr, lastbin = "", 0
        for i, bin in enumerate(bed.readlines()):
            if bin.split("\t")[0] != lastchr:
                lastchr = bin.split("\t")[0]
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True
            elif int(bin.split("\t")[1]) >= lastbin:
                lastbin = int(bin.split("\t")[2])
                nonSliding[i] = True

    ### remove bins with input count of zero (i.e., untested region) OR extreme values (top 0.001%, i.e., sequencing artifacts)

    minInput = minCoverage
    maxInput = np.quantile(bct_i[bct_i > 0], (1 - extQuantile))
    # print(minInput, maxInput)

    minOutput = minCoverage
    maxOutput = np.quantile(bct_o[bct_o > 0], (1 - extQuantile))
    # print(minOutput, maxOutput)

    ### calculate fold change
    fc = np.zeros(mat.shape[0])
    fc[bct_n > 0] = bct_o[bct_n > 0] / (bct_n[bct_n > 0])  ### fc = output / normalized_input

    ### train / test genomic bin ###
    trainingBin = (bct_i > minInput) & (bct_i < maxInput) & (bct_o > minOutput) & (bct_o < maxOutput) & nonSliding
    testingBin = (bct_i > minInput) & (bct_i < maxInput) & (fc > 1.5)  ### bins w/ FC > 1.5 are tested for statistical significance

    ### filtering bins
    print("[%s] Total genomic bins: %s" % (timestamp(), '{:,}'.format(mat.shape[0])))
    print("[%s] Total non-overlapping genomic bins: %s" % (timestamp(), '{:,}'.format(sum(nonSliding))))

    print("[%s] Removing bins with input counts larger than %s for training and testing" % (timestamp(), '{:,}'.format(maxInput)))
    print("[%s] Removing bins with output counts larger than %s for training" % (timestamp(), '{:,}'.format(maxOutput)))

    print("[%s] Total genomic bins used for training: %s" % (timestamp(), '{:,}'.format(mat[trainingBin, :].shape[0])))
    print("[%s] Total genomic bins used for testing: %s" % (timestamp(), '{:,}'.format(mat[testingBin, :].shape[0])))

    ### mode 2 uses "input" as offset variable
    if int(mode) == 2:
        print("[%s] Running Mode 2" % (timestamp()))

        ### remove input
        mat_model = np.delete(mat, 1, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat_model.shape[1] - 1)]
        df = pd.DataFrame(mat_model[trainingBin, :], columns=["y", "exposure"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y", "exposure"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson(), offset=np.log(df["exposure"])).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat_model[trainingBin, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0), offset=np.log(df["exposure"])).fit(start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat_model[trainingBin, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts" % (timestamp()))

        df = pd.DataFrame(mat_model[testingBin, :], columns=["y", "exposure"] + x)
        y_hat = model.predict(df, offset=np.log(df["exposure"]))

    ### mode 1 uses "input" as covariate (default):
    else:
        print("[%s] Running Mode 1" % (timestamp()))

        ### remove normalized input
        mat_model = np.delete(mat, 2, 1)

        ### formula
        x = ["x" + str(i) for i in range(1, mat_model.shape[1])]
        df = pd.DataFrame(mat_model[trainingBin, :], columns=["y"] + x)
        formula = "y~" + "+".join(df.columns.difference(["y"]))
        print("[%s] Fit using formula: %s" % (timestamp(), formula))

        ### Initial parameter estimation using Poisson regression
        # print("[%s] Initial estimate" % (timestamp()))
        model0 = smf.glm(formula, data=df, family=sm.families.Poisson()).fit()
        # print model0.summary()

        ### Estimate theta
        th0, _ = theta(mat_model[trainingBin, :][:, 0], model0.mu)
        print("[%s] Initial estimate of theta is %f" % (timestamp(), th0))

        ### re-estimate beta with theta
        # print("[%s] Re-estimate of beta" % (timestamp()))
        model = smf.glm(formula, data=df, family=sm.families.NegativeBinomial(alpha=1 / th0)).fit(start_params=model0.params)
        # print model.summary()

        ### Re-estimate theta
        th, _ = theta(mat_model[trainingBin, :][:, 0], model.mu)
        print("[%s] Re-estimate of theta is %f" % (timestamp(), th))

        ### predict
        print("[%s] Predicting expected counts" % (timestamp()))
        df = pd.DataFrame(mat_model[testingBin, :], columns=["y"] + x)
        y_hat = model.predict(df)

    ### calculate P-value
    print("[%s] Calculating P-value" % (timestamp()))
    theta_hat = np.repeat(th, len(y_hat))
    prob = th / (th + y_hat)  ### prob=theta/(theta+mu)
    pval = 1 - nbinom.cdf(k=mat_model[testingBin, 0] - 1, n=theta_hat, p=prob)

    ### multiple testing correction
    print("[%s] Multiple testing correction" % (timestamp()))
    _, qval, _, _ = multi.multipletests(pval, method="fdr_bh")

    nlog10pval = -np.log10(pval)
    nlog10qval = -np.log10(qval)

    ### output initial peaks
    print("[%s] Output initial peaks" % (timestamp()))
    with open(prefix + ".peak.bed", "w") as out:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), testingBin))):
                if qval[i] <= float(threshold):
                    ### chr, start, end, foldChg, inputCov, outputCov, -log10(pval), -log10(qval)
                    out.write("%s\t%.3f\t%i\t%i\t%.3f\t%.3f\n" % (bin.strip(), fc[testingBin][i], mat[testingBin, 1][i], mat[testingBin, 0][i], nlog10pval[i], nlog10qval[i]))

    ### generate various bdg tracks
    print("[%s] Generating bedGraph files" % (timestamp()))
    with open(prefix + ".pval.bdg", "w") as fp, open(prefix + ".qval.bdg", "w") as fq, open(prefix + ".fc.bdg", "w") as ff:
        with open(bedFile, "r") as bed:
            for i, bin in enumerate(bed.readlines()):
                ff.write("%s\t%.3f\n" % (bin.strip(), fc[i]))

        with open(bedFile, "r") as bed:
            for i, bin in enumerate(list(compress(bed.readlines(), testingBin))):
                fp.write("%s\t%.3f\n" % (bin.strip(), nlog10pval[i]))
                fq.write("%s\t%.3f\n" % (bin.strip(), nlog10qval[i]))

    del mat, mat_model, nlog10pval, nlog10qval, pval, qval

    ### make bigWig track
    print("[%s] Making BigWig tracks" % (timestamp()))

    with open(bedFile) as bed:
        c1, s1, e1 = bed.readline().strip().split('\t')
        c2, s2, e2 = bed.readline().strip().split('\t')

    w = int(e1) - int(s1)
    s = int(s2) - int(s1)

    bdg2bw(bdgFile=prefix + ".fc.bdg", bwFile=prefix + ".fc.bw", chromSize=chromSize, window=w, step=s)
    safe_remove(prefix + ".fc.bdg")

    bdg2bw(bdgFile=prefix + ".pval.bdg", bwFile=prefix + ".pval.bw", chromSize=chromSize, window=w, step=s)
    safe_remove(prefix + ".pval.bdg")

    bdg2bw(bdgFile=prefix + ".qval.bdg", bwFile=prefix + ".qval.bw", chromSize=chromSize, window=w, step=s)
    safe_remove(prefix + ".qval.bdg")

    ### center peak
    print("[%s] Center peaks" % (timestamp()))
    center_peak(bwFile=bwFile, peakFile=prefix + ".peak.bed", centeredPeakFile=prefix + ".peak.centered.bed")

    ### merge peak
    print("[%s] Merge peaks" % (timestamp()))
    pybedtools.BedTool(prefix + ".peak.centered.bed").merge(c=[4, 6, 7, 8], o=["max", "max", "max", "max"]).saveas(prefix + ".peak.centered.merged.bed")

    ### finalize peak
    print("[%s] Finalize peaks" % (timestamp()))
    peak = pd.read_csv(prefix + ".peak.centered.merged.bed", sep='\t', header=None)
    peak.columns = ['chr', 'start', 'end', 'fc', 'cov', 'nlog10pval', 'nlog10qval']
    peak['idx'] = peak.index
    peak['strand'] = "."
    peak['score'] = [min(int(round(x)), 1000) for x in peak['fc'] * 100]
    peak = peak.sort_values(by=['fc'], ascending=False)
    peak['name'] = ['peak_' + str(x) for x in peak.reset_index().index + 1]
    peak = peak.sort_values(by=['idx'])
    final = peak[['chr', 'start', 'end', 'name', 'score', 'strand', 'fc', 'cov', 'nlog10pval', 'nlog10qval']]
    final.to_csv(prefix + '.peak.final.bed', sep='\t', float_format='%.3f', index=False, header=False)

    ### remove intermediate peak files
    safe_remove(prefix + ".peak.centered.bed")
    safe_remove(prefix + ".peak.centered.merged.bed")

    print("[%s] Done" % (timestamp()))
Example #31
 def _cdf(self, x, mu, alpha, p, w):
     s, p = self.convert_params(mu, alpha, p)
     # construct cdf from standard negative binomial cdf
     # and the w inflation of zero
     return w + nbinom.cdf(x, s, p) * (1 - w)
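The mixture above puts weight w on a point mass at zero and 1 - w on the NB; an illustrative check that the zero-inflated CDF stays between w and 1:

from scipy.stats import nbinom

w, n, p = 0.2, 3, 0.4
for x in (0, 5, 50):
    print(w + (1 - w) * nbinom.cdf(x, n, p))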
Example #32
def nbp_bg_adj(sample_sig_file, input_sig_file, outputname):
    ### read input files
    sample = read2d_array(sample_sig_file, float)
    background = read2d_array(input_sig_file, float)

    ### set threshold to ignore
    thresh = 0

    ### get sample mean & var
    sample_non0 = sample[sample > thresh]
    sample_mean = np.mean(sample_non0)
    sample_var = np.var(sample_non0)

    ### get negative binomial parameters from sample track regions
    sample_prob = sample_mean / sample_var
    if sample_prob < 0.1:
        sample_prob = 0.1
    if sample_prob >= 0.9:
        sample_prob = 0.9

    ### get size parameter for negative binomial distribution p-value (1st round)
    sample_size = sample_mean * sample_prob / (1 - sample_prob)

    ### get background mean & var
    background_non0 = background[background > thresh]
    bg_mean = np.mean(background_non0)
    bg_var = np.var(background_non0)

    print('check input track overdispersion in background regions, var/mean=' +
          str(round(bg_var / bg_mean, 3)))
    print(sample_prob)
    print(sample_size)
    print(len(background_non0))

    print(bg_mean)
    print(bg_var)

    ### 1st round negative binomial p-value
    i = 0
    nb_pval_list = np.empty((0, ), float)
    print(nb_pval_list.shape)
    for sig in sample:
        if i % 10000 == 0:
            print(i)
        i = i + 1
        nb_pval_tmp = np.array(
            1 - nbinom.cdf(sig, sample_size, sample_prob, loc=0))
        nb_pval_list = np.concatenate((nb_pval_list, nb_pval_tmp))

    ############### second round
    ### get sample bg regions
    sample_bg = sample[nb_pval_list >= 0.001, ]
    sample_bg_non0 = sample_bg[sample_bg > thresh]
    sample_bg_mean = np.mean(sample_bg_non0)
    sample_bg_var = np.var(sample_bg_non0)

    print(
        'check signal track overdispersion in background regions, var/mean=' +
        str(round(sample_bg_var / sample_bg_mean, 3)))
    print(sample_bg_mean)
    print(sample_bg_var)
    print(len(sample_bg_non0))

    ### get negative binomial parameters from signal track bg regions
    sample_bg_prob = sample_bg_mean / sample_bg_var
    if sample_bg_prob < 0.1:
        sample_bg_prob = 0.1

    if sample_bg_prob >= 0.9:
        sample_bg_prob = 0.9

    ### get size parameter for negative binomial distribution p-value (2nd round)
    sample_bg_size = sample_bg_mean * sample_bg_prob / (1 - sample_bg_prob)

    ### get background bg regions
    background_bg = background[nb_pval_list >= 0.001, ]
    background_bg_non0 = background_bg[background_bg > thresh]
    background_bg_mean = np.mean(background_bg_non0)
    background_bg_var = np.var(background_bg_non0)

    print('check input track overdispersion in background regions, var/mean=' +
          str(round(background_bg_var / background_bg_mean, 3)))
    print(sample_bg_prob)
    print(sample_bg_size)
    print(len(background_bg_non0))

    print(background_bg_mean)
    print(background_bg_var)

    ### 2nd round negative binomial p-value
    i = 0
    nb_pval_list = np.empty((0, ), float)
    for sig, bg in zip(sample, background):
        if i % 100000 == 0:
            print(i)
        i = i + 1
        nb_pval_tmp = np.array(1 - nbinom.cdf(sig,
                                              sample_bg_size * (bg + 1) /
                                              (background_bg_mean + 1),
                                              sample_bg_prob,
                                              loc=0))
        nb_pval_list = np.concatenate((nb_pval_list, nb_pval_tmp))

    ### convert to np array
    nb_pval_list = -np.log10(nb_pval_list)
    nb_pval_list = nb_pval_list.reshape(nb_pval_list.shape[0], 1)

    ### write output
    write2d_array(nb_pval_list, outputname + '.nbp_2r.txt')

    ### info vector
    mvsp = open(outputname + '.mvsp.txt', 'w')
    mvsp.write(str(sample_bg_mean) + '\t')
    mvsp.write(str(sample_bg_var) + '\t')
    mvsp.write(str(sample_bg_size) + '\t')
    mvsp.write(str(sample_bg_prob) + '\n')
    mvsp.close()