Example No. 1
def Glower(theta,y,n,j):
    y=np.asarray(y).copy()
    n=np.asarray(n).copy()
    
    if(j==0): return (binom.sf(k=y[j]-1,n=n[j],p=theta))
    
    return (binom.sf(k=y[j],n=n[j],p=theta)+binom.pmf(k=y[j],n=n[j],p=theta)*Glower(theta=theta,y=y,n=n,j=j-1))
def binom_test_v2(x, n=None, p=0.5, alternative='two-sided'):
    n = np.int_(n)
    if (p > 1.0) or (p < 0.0):
        raise ValueError("p must be in range [0,1]")

    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError(
            "alternative not recognized should be 'two-sided', 'less' or 'greater'"
        )
    if alternative == 'less':
        pval = binom.cdf(x, n, p)
        return pval
    if alternative == 'greater':
        pval = binom.sf(x - 1, n, p)
        return pval
    d = binom.pmf(x, n, p)
    rerr = 1 + 1e-7
    a_fn = lambda x1: binom.pmf(x1, n, p)
    if x == p * n:
        pval = 1.
    elif x < p * n:
        y = n - binary_search(a_fn, d * rerr, np.ceil(p * n), n) + 1
        pval = (binom.cdf(x, n, p) + binom.sf(n - y, n, p))
    else:
        y = binary_search(a_fn, d * rerr, 0, np.floor(p * n) + 1, True) + 1
        pval = (binom.cdf(y - 1, n, p) + binom.sf(x - 1, n, p))
    return min(1.0, pval)
Example No. 3
 def alpha_on_determinist_compound_closed_form(lmb=10.0,t1=10,\
                                             t2=10,l=3,verbose=False):
     alpha_hats = np.arange(0.00001, 1.0, 0.01)
     #alpha_hats = np.array([0.05])
     p = t2 / (t1 + t2)
     alphas = np.zeros(len(alpha_hats))
     k = int(lmb * (t1 + t2))
     alpha_dels = np.ones(len(alpha_hats))
     total_pois_mass = 0.0
     #TODO: Replace this with other condition.
     while sum(alpha_dels) > 1e-7 * len(alpha_dels):
         isfs = binom.isf(alpha_hats, k * l, p)
         cdfs = binom.sf((isfs / l).astype(int), k, p)
         pmf = poisson.pmf(k, lmb * (t1 + t2))
         total_pois_mass += pmf
         alpha_dels = pmf * cdfs
         alphas += alpha_dels
         if verbose and (k - int(lmb * (t1 + t2))) % 100 == 0:
             print("k="+str(k-int(lmb*(t1+t2))) + " alpha_dels sum: "\
                 + str(sum(alpha_dels)))
         k += 1
     if verbose:
         print("Completed first loop")
     k = int(lmb * (t1 + t2)) - 1
     while k >= 0:
         isfs = binom.isf(alpha_hats, k * l, p)
         cdfs = binom.sf((isfs / l).astype(int), k, p)
         pmf = poisson.pmf(k, lmb * (t1 + t2))
         total_pois_mass += pmf
         alpha_dels = pmf * cdfs
         if np.isnan(sum(alpha_dels)):
             print(k)
         alphas += alpha_dels
         k -= 1
     return alphas, alpha_hats, total_pois_mass
Example No. 4
def expctd_cond_gr_m(m, n, p):
    if m > int(n / 2):
        return sum(binom.pmf(np.arange(m+1,n+1),n,p)\
            /binom.sf(m,n,p)*np.arange(m+1,n+1))
    else:
        return n*p/binom.sf(m,n,p)-binom.cdf(m,n,p)\
                /binom.sf(m,n,p)*expctd_cond_leq_m(m,n,p)
def checktheory(thres, n, ne, p, te, s):
    # calculate failure probability for a certain number of ones in se
    pfaildict = {}
    for se in range(0, n + 1):
        tmp = [binom.sf((thres + i + se) / 2, se, p=0.5) * s[i] for i in s]
        tmp2 = [binom.sf((thres + 1 + i + se) / 2, se, p=0.5) * s[i]
                for i in s]
        pfail = 1.5 * sum(tmp) + 0.5 * sum(tmp2)
        pfaildict[se] = pfail

    # set everything to zero
    fail = 0
    fail2 = {}
    for te1 in te:
        fail2[te1] = 0

    # loop over all norm values
    for l1, l2 in tqdm(itertools.combinations_with_replacement(range(0, n + 1), 2), leave=False, total=n * (n + 1) / 2):
        # probability of a certain norm
        pl1 = binom.pmf(l1, n=n, p=p)
        pl2 = binom.pmf(l2, n=n, p=p)
        pl = pl1 * pl2
        if l1 != l2:
            pl *= 2

        # skip if probability is too small
        if pl < 2**-200:
            continue

        # calculate the probability of a failure
        failtmp = 0
        # loop over all possible number of nonzero elements in se
        for se1 in range(max(0, l1 + l2 - n), min(l1, l2) + 1):
            # probability of number of nonzero elements in se
            pse = hypergeom.pmf(k=se1, M=n, n=l1, N=l2)
            # probability of failure for a certain se1
            pfail = pfaildict[se1]
            # weighted average share
            failtmp += pse * pfail

        # for new model, take error correction into account
        fail += pl * failtmp
        for te1 in te:
            fail2[te1] += pl * LACprob(failtmp, ne, te1)

    new = []
    old = []
    for te1 in te:
        # for old model, take error correction into account
        old.append(LACprob(fail, ne, te1))
        new.append(fail2[te1])

    return new, old
def checktheory(thres, n, ne, p, te, s):
    # calculate failure probability for a certain number of ones in se
    pfaildict = {}

    for se in range(0, n + 1):
        tmp_0 = [binom.sf((thres + i + se) / 2, se, p=0.5) * s[i] for i in s]
        tmp_1_1 = [
            binom.sf((thres + i + se + 1) / 2, se, p=0.5) * s[i] for i in s
        ]
        tmp_1_2 = [
            binom.sf((thres + i + se - 1) / 2, se, p=0.5) * s[i] for i in s
        ]
        pfail = sum(tmp_0) + (sum(tmp_1_1) + sum(tmp_1_2)) * 0.5
        pfaildict[se] = pfail

    # set everything to zero
    fail = 0

    # loop over all norm values
    for l1, l2 in tqdm(itertools.combinations_with_replacement(
            range(0, n + 1), 2),
                       leave=False,
                       total=n * (n + 1) / 2):
        # probability of a certain norm
        # pl = P[||s||2] * P[||c||2]
        pl1 = binom.pmf(l1, n=n, p=p)
        pl2 = binom.pmf(l2, n=n, p=p)
        pl = pl1 * pl2
        if l1 != l2:
            pl *= 2

        # skip if probability is too small
        if pl < 2**-100:  ## 200
            continue

        # calculate the probability of a failure
        failtmp = 0
        # loop over all possible number of nonzero elements in se
        for se1 in range(max(0, l1 + l2 - n), min(l1, l2) + 1):
            # probability of number of nonzero elements in se
            pse = hypergeom.pmf(k=se1, M=n, n=l1, N=l2)
            # probability of failure for a certain se1
            pfail = pfaildict[se1]
            # weighted average share
            failtmp += pse * pfail * 0.5  # failtmp = pb

        # for new model, take error correction into account
        fail += pl * Binom_prob(failtmp, ne,
                                te)  # Binom_prob : 1 - Binom(d, lm, pb)

    return fail
Example No. 7
def pbinom(x, size=1, prob=0.5, lowertail=True, log=False):
    """
    ============================================================================
                                                                        pbinom()
    ============================================================================
    The cumulative distribution function for the binomial distribution.
    You provide a value along the binomial distribution (e.g. x=3) or an array of
    values, and it returns what proportion of values lie below it (the cumulative probability).

    Alternatively, if you select lowertail=False, it returns the proportion of
    values that are above it.

    USAGE:
    dbinom(x, size, prob=0.5, log=False)
    pbinom(x, size, prob=0.5, lowertail=True, log=False)
    qbinom(q, size, prob=0.5, lowertail=True)
    rbinom(n=1, size=1, prob=0.5)

    :param x:       int. or array of ints. The values along the distribution.
    :param size:    int. Number of trials
    :param prob:    float. Probability of a success
    :param lowertail: bool. are you interested in what proportion of values lie
                     beneath x?
    :param log:     bool. take the log?
    :return:        an array of cumulative probabilities corresponding to the values in x
    ============================================================================
    """
    if lowertail and not log:
        return binom.cdf(x, n=size, p=prob)
    elif not lowertail and not log:
        return binom.sf(x, n=size, p=prob)
    elif lowertail and log:
        return binom.logcdf(x, n=size, p=prob)
    else:
        return binom.logsf(x, n=size, p=prob)
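
A minimal usage sketch (not part of the original snippet), assuming scipy.stats.binom is imported as in the function body: the lower-tail and upper-tail calls for the same x partition the probability mass, so they sum to 1.

# Hypothetical usage of pbinom(); assumes `from scipy.stats import binom`.
lower = pbinom(3, size=10, prob=0.5)                    # P(X <= 3)
upper = pbinom(3, size=10, prob=0.5, lowertail=False)   # P(X > 3)
assert abs(lower + upper - 1.0) < 1e-12                 # the two tails cover all outcomes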
Example No. 8
def calc_sf_all(v, n, p, prev_best_score=False):
    sf_values = -np.log10(binom.sf(v - 1, n, p))
    sf_values[np.isnan(sf_values)] = 0
    sf_values[np.isinf(sf_values)] = (prev_best_score
                                      if prev_best_score is not False else
                                      max(sf_values[~np.isinf(sf_values)]) * 2)
    return sf_values
Example No. 9
def check(N, p):
    global numfails, numchecks, mu, sigma2
    H = NeuronGroup(1, 'v:1', threshold='False', name='H')
    G = NeuronGroup(N, 'v:1', threshold='False', name='G')
    S = Synapses(H, G, on_pre='v+=w', name='S')
    S.connect(p=p)
    m = len(S)
    low, high = binom.interval(alpha, N, p)
    if p==0:
        low = high = 0
    elif p==1:
        low = high = N
    else:
        i = diff(S.j[:])
        i = i[i<isi_max[p]]
        b = bincount(i, minlength=isi_max[p])[:isi_max[p]]
        if b[0]:
            print 'Major error: repeated indices for N=%d, p=%.3f' % (N, p)
            raise ValueError("Repeated indices")
        isi[p] += b
        num_isi[p] += sum(b)
    q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
    mu += q
    sigma2 += q*(1-q)
    numchecks += 1
    if m<low or m>high:
        numfails += 1
        return True
    else:
        return False
Example No. 10
def gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, decisions):
  '''
    Filter treatment mutations that can be explained by control freq.
    In practice, this step is most effective for control mutations
    with relatively high frequency => relatively high variance

    Considers all events that occur (fq > 0%) in both control and treatment data
  '''
  fpr_threshold_try1 = 0.10
  for jdx, ref_nt in enumerate(seq):
    c_tot = sum(c[jdx])
    t_tot = sum(t[jdx])
    for kdx in range(len(t[jdx])):
      if kdx == nt_to_idx[ref_nt] or t[jdx][kdx] == 0:
        continue

      c_fq = c[jdx][kdx] / c_tot
      t_fq = t[jdx][kdx] / t_tot
      pval = binom.sf(t[jdx][kdx] - 1, t_tot, c_fq)

      if c_fq > 0:
        decisions['obs_nt'].append(nts[kdx])
        decisions['ref_nt'].append(ref_nt)
        decisions['c_fq'].append(c_fq)
        decisions['c_ct'].append(c[jdx][kdx])
        decisions['t_fq'].append(t_fq)
        decisions['t_ct'].append(t[jdx][kdx])
        decisions['c_tot'].append(c_tot)
        decisions['t_tot'].append(t_tot)
        decisions['idx'].append(jdx)
        decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
        decisions['pval'].append(pval)
        decisions['nm'].append(nm)

  return
Example No. 11
def calculate_p_value(ex_bg, ex_mut, in_bg, in_mut, p_bg=0.016):
    pval = 1
    if in_bg >= 10 and ex_bg >= 10 and ex_mut >= 0:
        p = np.divide(in_mut, in_bg)
        p = max(p, p_bg)
        pval = 1 - binom.sf(k=ex_mut, n=ex_bg, p=p)
    return pval
Example No. 12
def binomial_test(n, N, P):
    """Perform binomial test on the observed n being higher than expected.
    Specifically, N residues are at risk and, of those, n mutations occurred
    at the Np residues of interest. Given the background probability of
    a mutation at a specific residue, the p-value is calculated as the probability
    of observing n or greater mutations. Since N is large and n is small,
    it is computationally more efficient to take 1 - Pr(i<=n-1).

    Parameters
    ----------
    n : int
        number of observed mutations
    N : int
        number of residues at risk
    P : float
        background probability that a mutation would occur at a single residue

    Returns
    -------
    pval : np.array
        p-value for binomial test
    """
    if n <= 0:
        return 1.0
    pval = binom.sf(n - 1, N, P)
    return pval
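
A small, hedged check of the identity the docstring relies on, P(X >= n) = 1 - P(X <= n - 1); it assumes numpy and scipy.stats.binom are imported as np and binom, and the numbers are illustrative only.

# Illustrative values; not from the original project.
n, N, P = 5, 10000, 0.0003
assert np.isclose(binomial_test(n, N, P), 1.0 - binom.cdf(n - 1, N, P))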
Example No. 13
def pbinom(x, size=1, prob=0.5, lowertail=True, log=False):
    """
    ============================================================================
                                                                        pbinom()
    ============================================================================
    The cumulative distribution function for the binomial distribution.
    You provide a value along the binomial distribution (e.g. x=3) or an array of
    values, and it returns what proportion of values lie below it (the cumulative probability).

    Alternatively, if you select lowertail=False, it returns the proportion of
    values that are above it.

    USAGE:
    dbinom(x, size, prob=0.5, log=False)
    pbinom(x, size, prob=0.5, lowertail=True, log=False)
    qbinom(q, size, prob=0.5, lowertail=True)
    rbinom(n=1, size=1, prob=0.5)

    :param x:       int. or array of ints. The values along the distribution.
    :param size:    int. Number of trials
    :param prob:    float. Probability of a success
    :param lowertail: bool. are you interested in what proportion of values lie
                     beneath x?
    :param log:     bool. take the log?
    :return:        an array of cumulative probabilities corresponding to the values in x
    ============================================================================
    """
    if lowertail and not log:
        return binom.cdf(x, n=size, p=prob)
    elif not lowertail and not log:
        return binom.sf(x, n=size, p=prob)
    elif lowertail and log:
        return binom.logcdf(x, n=size, p=prob)
    else:
        return binom.logsf(x, n=size, p=prob)
def calculate_parallelism_qvalues(gene_statistics):

    gene_names = []
    Ls = []
    ns = []
    expected_ns = []

    for gene_name in gene_statistics.keys():
        gene_names.append(gene_name)
        ns.append(gene_statistics[gene_name]['observed'])
        expected_ns.append(gene_statistics[gene_name]['expected'])

    ns = numpy.array(ns)
    expected_ns = numpy.array(expected_ns)
    ntot = ns.sum()
    ps = expected_ns / ntot
    ntots = ntot * numpy.ones_like(ps)

    pvalues = binom.sf(ns - 0.5, ntots, ps)

    qvalues = stats_utils.calculate_qvalues(pvalues)

    qvalue_map = {gene_name: q for gene_name, q in zip(gene_names, qvalues)}
    pvalue_map = {gene_name: p for gene_name, p in zip(gene_names, pvalues)}

    return qvalue_map, pvalue_map
Example No. 15
def l_pod_k_of_n_av(l, n, k, p, q, pod_less_mcs=0):
    """
    See here: https://math.stackexchange.com/questions/3825082/reliability-of-an-l-pod-k-of-n-system?noredirect=1#comment7888736_3825082
    Answer matches the legacy method: pffc_resiliency(3,7,.9,.8,1)==l_pod_k_of_n(3,7,4,0.9,0.8)
    args:
        pod_less_mcs: This is a parameter for the AIR formula.
                Machines for which the pod is guaranteed to work.
    """
    ceil = np.ceil(n / l)
    flr = ceil - 1
    ## Num of hero and joe pods.
    # A hero pod has one more machine
    # than a joe pod.
    h = int(n - l * flr)
    j = int(l * ceil - n)
    prob = 0.0
    ## All combinations of hero and joe pod availability.
    for h1 in range(h + 1):
        for j1 in range(j + 1):
            #Num of available machines with these many pods.
            nn = h1 * ceil + j1 * flr + pod_less_mcs
            if nn >= k:
                ## We get a k of nn system among the machines.
                prob += binom.pmf(h1, h, q) * binom.pmf(j1, j, q) * binom.sf(
                    k - 1, nn, p)
    return prob
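
A usage sketch based on the reference case quoted in the docstring (pffc_resiliency itself is not shown here); it assumes numpy and scipy.stats.binom are imported as np and binom.

# Availability of a 4-of-7 system spread over 3 pods,
# with machine availability p=0.9 and pod availability q=0.8.
print(l_pod_k_of_n_av(l=3, n=7, k=4, p=0.9, q=0.8))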
Example No. 16
    def reducer(self, key, values):
        friend = key[0]
        fu = key[1]
        ru = key[2]
        hist = {} # assumed to be very small relative to fs
        fs = [] # hopefully not too big, maybe in the hundreds of thousands.

        for follower, ruu in values:
            fs.append((follower,ruu))
            if ruu in hist:
                hist[ruu] = hist[ruu] + 1
            else:
                hist[ruu] = 1

        cdf = {}
        for k,v in hist.iteritems():
            cdf[k] = v
            for k2, v2 in hist.iteritems():
                if k2 < k:
                    cdf[k] = cdf[k] + v2

        for follower, ruu in fs:
            edgeprob = 0                
            if not fu == 0:
                edgeprob = max(1,(fu//cdf[ruu]) * binom.sf(ruu,ru,1//fu))
            yield None, ('%d %d %f ' % (friend, follower, edgeprob))
Example No. 17
def binomial_test(n, N, P):
    """Perform binomial test on the observed n being higher than expected.
    Specifically, N residues are at risk and, of those, n mutations occurred
    at the Np residues of interest. Given the background probability of
    a mutation at a specific residue, the p-value is calculated as the probability
    of observing n or greater mutations. Since N is large and n is small,
    it is computationally more efficient to take 1 - Pr(i<=n-1).

    Parameters
    ----------
    n : int
        number of observed mutations
    N : int
        number of residues at risk
    P : float
        background probability that a mutation would occur at a single residue

    Returns
    -------
    pval : np.array
        p-value for binomial test
    """
    if n <= 0:
        return 1.0
    pval = binom.sf(n-1, N, P)
    return pval
Example No. 18
def binomial_p(x, n, p, alternative='greater'):
    """
    Parameters
    ----------
    x : int
       number of observed successes in n trials
    n : int
       number of trials
    p : float
       hypothesized probability of success on each trial
    alternative : {'greater', 'less', 'two-sided'}
       alternative hypothesis to test (default: 'greater')
    Returns
    -------
    float
       estimated p-value 
    """

    assert alternative in ("two-sided", "less", "greater")
    if n < x:
        raise ValueError(
            "Cannot observe more successes than the population size")

    plower = binom.cdf(x, n, p)
    pupper = binom.sf(x - 1, n, p)
    if alternative == 'two-sided':
        pvalue = 2 * np.min([plower, pupper, 0.5])
    elif alternative == 'greater':
        pvalue = pupper
    elif alternative == 'less':
        pvalue = plower
    return pvalue
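
A minimal sketch of calling binomial_p (illustrative numbers, assuming numpy and scipy.stats.binom are imported as np and binom): for the 'greater' alternative the result is simply the upper tail P(X >= x).

pval = binomial_p(60, 100, 0.5, alternative='greater')
assert np.isclose(pval, binom.sf(59, 100, 0.5))   # P(X >= 60) under p = 0.5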
def testScipy(N, p, outdir):
    fig, ax = plt.subplots(1, 1)
    x = list(range(0, N + 1))
    ax.plot(x, binom.pmf(x, N, p), 'bo', ms=8, label='binom pmf')
    ax.plot(x, binom.cdf(x, N, p), 'ro', ms=8, label='binom cdf')
    ax.plot(x, binom.sf(x, N, p), 'go', ms=8, label='binom sf')
    ax.legend(loc='best', frameon=False)
    plt.savefig(outdir + "/distributions.png")
Example No. 20
def value_func(value, round_left):
    sf = binom.sf(value, n, p)
    if round_left == 1: return sf
    prob = sf
    for _ in range(round_left - 1):
        prob = prob + sf - prob * sf
    assert prob <= 1  # this must always hold
    return prob
Example No. 21
def binominal_backtest(failures, conf=0.05):
    """
    Binomial backtest. Implementation based on https://rdrr.io/cran/Dowd/src/R/BinomialBacktest.R
    """
    size = failures.shape[0]
    failures = np.sum(failures)
    if failures >= size * conf:
        return binom.sf(failures - 1, size, conf)
    return binom.cdf(failures, size, conf)
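
An illustrative call (not from the original source), assuming numpy and scipy.stats.binom are imported: a vector of 250 daily VaR exceedance indicators with 16 failures at the 5% level falls into the upper-tail branch.

failures = np.zeros(250)
failures[:16] = 1                                # 16 exceedances out of 250 days
print(binominal_backtest(failures, conf=0.05))   # upper-tail branch since 16 >= 250 * 0.05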
Example No. 22
def getMultiplePsFdr(iva, ivb, model, N, win=6):
    """
    For the intervals a and b, search their nearby windows to estimate FDR and p-values. The idea of using matched nearby windows, which would have a similar distance to a & b, requires too many windows.
    return ra, rb, rab, es, fdr, hyp, chyp, pop, nbp
    """
    ra, rb, rab = getPETsforRegions(iva, ivb, model)
    #simple hypergeometric test; the idea of using cis_a + cis_b + trans_a + trans_b as M and cis_a + cis_b as N fails, with all p-values equal to 1
    hyp = hypergeom.sf(rab - 1.0, N, ra, rb)
    ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
    hyps, rabs, nbps = [], [], []
    for na in ivas:
        nraSource = getCounts(na, model[0])
        nraTarget = getCounts(na, model[1])
        nra = nraSource.union(nraTarget)
        nralen = float(len(nra))
        if nralen < 1:
            continue
        for nb in ivbs:
            nrbSource = getCounts(nb, model[0])
            nrbTarget = getCounts(nb, model[1])
            nrb = nrbSource.union(nrbTarget)
            nrblen = len(nrb)
            if nrblen < 1:
                continue
            nrab = float(len(nra.intersection(nrb)))
            #nrab = float(len(nraSource.intersection(nrbTarget)))
            #collect the value for poisson test
            rabs.append(nrab)
            #collect the nearby hypergeometric test result
            nhyp = hypergeom.sf(nrab - 1.0, N, nralen, nrblen)
            hyps.append(nhyp)
            #collect the probability for the following binomial test
            den = nrab / (nralen * nrblen)
            nbps.append(den)
    if len(rabs) == 0:
        return ra, rb, rab, np.inf, 0.0, hyp, 0.0, 0.0, 0.0,
    hyps, rabs = np.array(hyps), np.array(rabs)
    #local fdr
    fdr = len(rabs[rabs > rab]) / float(len(rabs))
    mrabs = float(np.mean(rabs))
    #enrichment score
    if mrabs > 0:
        es = rab / mrabs
    else:
        es = np.inf
    #es = rab / max([np.mean(rabs),float(np.percentile(rabs,90))])
    #es = rab / float(np.percentile(rabs,90))
    #corrected hypergeometric fdr
    chyp = len(hyps[hyps < hyp]) / float(len(hyps))
    #simple poisson test; the idea, borrowed from MACS, benefits from using a dynamic lambda
    lam = mrabs
    pop = poisson.sf(rab - 1.0, lam)
    #simple binomial test
    bp = np.mean(nbps) * ra * rb / N
    #nbp = binom.sf(rab, N, bp)
    nbp = binom.sf(rab - 1.0, N - rab, bp)
    return ra, rb, rab, es, fdr, hyp, chyp, pop, nbp
Example No. 23
 def pval(self, k):
     """Return the p-value corresponding to k, defined as 1 - cdf(k)."""
     if np.array_equal(self.p, self.p[0] * np.ones(self.p.shape)):
         # If all probabilities are equal, return the Binomial p-value (as it should...)
         return binom.sf(k - 1, self.n, self.p[0])
     elif k > 0:
         return 1. - self.cdf(k - 1)
     else:
         return 1.
Example No. 24
 def _compute_p_values(self, betas):
     """
     Compute p-values of each predictor for Statistical Test of Variable Selection.
     """
     # d_j: non-zero of j-th beta
     d_j = (betas != 0).sum(axis=1)
     # pi: the average of the selection ratio of all predictor variables in B bootstrap samples.
     pi = d_j.sum() / betas.size
     return binom.sf(d_j - 1, n=self.B, p=pi)
Example No. 25
def get_m_n_from_bernoulli(N):
    p, P_B = 0.05, 0.05
    m_n_bernoulli = np.arange(1, N) * np.nan
    for n in np.arange(1, N):
        x = np.arange(binom.ppf(0.00, n, p), binom.ppf(1.00, n, p))
        prob = binom.sf(x, n, p)
        m = find_m(prob, P_B)
        m_n_bernoulli[n - 1] = m * 1. / n
    return (m_n_bernoulli)
Example No. 26
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):

    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])
    
    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots
   
   
    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():
       
        gNames, readsForPeak = dNX.geneNames, dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)
      
        pVals = []
        for gName in gNames:
            
            #may have to change gene name cuz of multiple spans
            try:
                totGeneReads = gName_numReads[gName]
                numSpotsForGene = gName_numSpots[gName]
            except KeyError:

                try:
                    gName = gName + '_RE_%s_%s' % (chrom, strand)
                    totGeneReads = gName_numReads[gName]
                    numSpotsForGene = gName_numSpots[gName]
                except KeyError:
                    print "FIX THIS GENE NAME", gName
                    continue

            #add pseudocount
            totGeneReads += 1
            numSpotsForGene += 1 # not sure whether to do this yet...

            #check for hidden intron gene overlap
            try:
                q = 1.0/numSpotsForGene
            except ZeroDivisionError:
                continue #intron gene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        dNX.pValBin = max(pVals) if pVals else -1.0

    dNX.save()
Example No. 27
 def getEfficiencyGrid(self, sample, hitsPdf):
     eff = TH2D(sample + '_eff', sample + '_eff', 1000, 0, 100, 5, 0, 5)
     eff.Sumw2()
     eff.SetDirectory(0)
     for ix in range(hitsPdf.GetNbinsX()):
         chargeEfficiency = float(hitsPdf.Integral(ix+1, -1)) # already normalized
         for iy in range(5):
             p = binom.sf(iy-1, 4, chargeEfficiency) # 1 - cdf
             eff.SetBinContent(ix+1, iy+1, p)
     return eff
Example No. 28
 def _compute_p_values(self, betas):
     """
     Compute p-values of each predictor for Statistical Test of Variable Selection.
     """
     not_null = ~np.isnan(betas)
     # d_j: non-zero and notnull of j-th beta
     d_j = np.logical_and(not_null, betas != 0).sum(axis=1)
     # pi: the average of the selection ratio of all predictor variables in B bootstrap samples.
     pi = d_j.sum() / not_null.sum().sum()
     return binom.sf(d_j - 1, n=self.B, p=pi)
Example No. 29
def get_diff_pvalues_poisson(
        ref_matrix: ExpMatrix, comp_matrix: ExpMatrix) -> pd.Series:

    genes = ref_matrix.genes & comp_matrix.genes

    ref_num_transcripts = ref_matrix.median_transcript_count
    comp_num_transcripts = comp_matrix.median_transcript_count
    num_transcripts = (ref_num_transcripts + comp_num_transcripts) / 2.0

    ref_matrix = ref_matrix.scale(num_transcripts)
    comp_matrix = comp_matrix.scale(num_transcripts)

    ref_matrix = ref_matrix.loc[genes]
    comp_matrix = comp_matrix.loc[genes]

    ref_num_cells = ref_matrix.num_cells
    comp_num_cells = comp_matrix.num_cells

    expressed = ((ref_matrix.sum(axis=1) + comp_matrix.sum(axis=1)) > 0)
    ref_matrix = ref_matrix.loc[expressed]
    comp_matrix = comp_matrix.loc[expressed]

    genes = ref_matrix.index.copy()
    num_genes = len(genes)
    pvals = np.ones(num_genes, dtype=np.float64)
    for i in range(num_genes):

        k1 = ref_matrix.iloc[i, :].sum()
        k2 = comp_matrix.iloc[i, :].sum()

        k = k1 + k2

        # make sure k is integer using ceil()
        k_ceil = int(ceil(k))

        # calculate factor and adjust k2
        f = k_ceil / k
        k2 *= f

        # make sure k2 is an integer using floor()
        # this results in slightly conservative p-values
        k2_floor = int(floor(k2))

        # calculate p of the binomial by taking n1 and n2 into account
        p = comp_num_cells / (ref_num_cells + comp_num_cells)

        # what is the probability of getting k2 or greater (out of k) by chance?
        # calculate the upper tail:
        pvals[i] = binom.sf(k2_floor-1, k_ceil, p)

    pvals[pvals == 0] = np.nextafter(0, 1)
    # convert to series
    pvals = pd.Series(index=genes, data=pvals, name='pval')
    pvals.index.name = 'gene'
    return pvals
Example No. 30
def calc_sf_all(v, n, p):
    sf_values = -np.log10(binom.sf(v, n, p))
    sf_values[np.isinf(sf_values)] = 0
    return sf_values


# def multimap(n, func, it, **kw):
#     if n == 0:
#         try:
#             n = cpu_count()
#         except NotImplementedError:
#             n = 1
#     # if n == 1:
#     #     for s in it:
#     #         result = func(s, best_res, **kw)
#     #         if result:
#     #             for x in result:
#     #                 peptide, m, snp_label, res = x

#     #                 for score, spec_t, c, info in res:
#     #                     if -score <= best_res.get(spec_t, 0):
#     #                         best_res_raw[spec_t] = [peptide, m, snp_label, score, spec_t, c, info]
#     #                         best_res[spec_t] = -score
#     #     return best_res_raw, best_res

#     else:

#         qout = Queue()
#         count = 0

#         while True:
#             qin = list(islice(it, 5000000))
#             if not len(qin):
#                 break
# #           print 'Loaded 500000 items. Ending cycle.'
#             procs = []
#             for proc_num in range(n):
#                 p = Process(target=worker, args=(qin, qout, proc_num, n, best_res, best_res_raw))
#                 p.start()
#                 procs.append(p)

#             count = len(qin)

#             for _ in range(n):
#                 for item in iter(qout.get, None):
#                     for k, v in item.items():
#                         if -v[3] <= best_res.get(k, 0):
#                             best_res_raw[k] = v
#                             best_res[k] = -v[3]
#                     # yield item

#             for p in procs:
#                 p.join()

#         return best_res_raw, best_res
Example No. 31
def plot_cummulative_over_rangeV(rO,N,rV):
        pH=[]
        pB=[]
        O=N*rO
        #print("N, O: ",N,", ",O,". Varying V:")
        for rVi in rV:
            p = math.floor(rVi/2) + 1
            pH.append(100*hypergeom.sf(p, N, O, rVi))
            #print(O," --> ", hypergeom.sf(p, N, O, rVi))
            pB.append(100*binom.sf(p,rVi,rO))
        return (pH,pB)        
Example No. 32
def plot_cummulative_over_rangeVmin(rO,N,rVmin):
        pH=[]
        pB=[]
        O=N*rO
    
        for rVi in rVmin:
            pmin = math.floor(rVi/2) + 1
            pH.append(hypergeom.sf(pmin, N, O, rVi))
            #print(O," --> ", hypergeom.sf(pmin, N, O, rVi))
            pB.append(binom.sf(pmin,rVi,rO))
        return (pH,pB)        
Example No. 33
def binomialTailTest(counts, nTrials, pEvent, oneSided=True):
  
  counts = array(counts)
  
  mean = nTrials * pEvent
  
  if oneSided:
    result = zeros(counts.shape)
    isAboveMean = counts > mean
    aboveIdx = isAboveMean.nonzero()
    belowIdx = (~isAboveMean).nonzero()
    result[aboveIdx] = binom.sf(counts[aboveIdx]-1, nTrials, pEvent)
    result[belowIdx] = binom.cdf(counts[belowIdx], nTrials, pEvent)
    
  else:
    diffs = abs(counts-mean)
    result = binom.cdf(mean-diffs, nTrials, pEvent)
    result += binom.sf(mean+diffs-1, nTrials, pEvent)
    
  return result
Example No. 34
def getMultiplePsFdr(iva, ivb, model, N, win=5):
    """
    for the interval a and b, searching its nearby windows to estimate FDR and p-values.  
    return ra, rb, rab, es,es_ra,es_rb, fdr, hyp, pop, nbp
    """
    ra, rb, rab = getPETsforRegions(iva, ivb, model)
    hyp = max([1e-300, hypergeom.sf(rab - 1.0, N, ra, rb)])
    ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
    #nras is a list storing point ids for permuted regions
    nras, nrbs = [], []
    for na in ivas:
        nraSource = getCounts(na, model[0])
        nraTarget = getCounts(na, model[1])
        nra = nraSource.union(nraTarget)
        nras.append(nra)
    for nb in ivbs:
        nrbSource = getCounts(nb, model[0])
        nrbTarget = getCounts(nb, model[1])
        nrb = nrbSource.union(nrbTarget)
        nrbs.append(nrb)
    #calculating the permuted background
    rabs, nbps = [], []
    for nra in nras:
        nralen = float(len(nra))
        for nrb in nrbs:
            nrblen = len(nrb)
            nrab = float(len(nra.intersection(nrb)))
            if nrab > 0:
                #collect the value for poisson test
                rabs.append(nrab)
                #collect the probability for the following binomial test
                den = nrab / (nralen * nrblen)
                nbps.append(den)
            else:
                nbps.append(0.0)
                rabs.append(0.0)
    if len(rabs) == 0:
        return ra, rb, rab, np.inf, 0.0, hyp, 0.0, 1e-300, 1e-300,
    rabs = np.array(rabs)
    #local fdr
    fdr = len(rabs[rabs > rab]) / float(len(rabs))
    mrabs = float(np.mean(rabs))
    #enrichment score
    if mrabs > 0:
        es = rab / np.mean(rabs[rabs > 0])
    else:
        es = np.inf
    #simple poisson test
    lam = mrabs
    pop = max([1e-300, poisson.sf(rab - 1.0, lam)])
    #simple binomial test
    bp = np.mean(nbps) * ra * rb / N
    nbp = max([1e-300, binom.sf(rab - 1.0, N - rab, bp)])
    return ra, rb, rab, es, fdr, hyp, pop, nbp
Example No. 35
def min_depth(depth, error, threshold=0.98):
    '''  determine the maximum parental depth permitted from both parents
    
    We look at the minimum alternate depth across both parents. We need this to
    be low. This function determines how low we can set this and still capture
    the vast majority of true candidate de novo mutations.
    
    There are four possible scenarios:
        1) both parents have depths below or equal to the depth
        2) the first parent exceeds the depth but the second parent does not
        3) the second parent exceeds the depth but the first parent does not
        4) both parents exceed the depth.
    
    We only need to consider the fourth scenario. The probability that this
    happens (at a given depth) is 1 - prob(first parent exceeds) * prob(second
    parent exceeds). We repeatedly increment the depth threshold upwards until
    this probability is sufficiently high.
    
    Args:
        depth: depth of parental sequencing. Can either be a single value (as a
            summary value for both parents), or a list of two depths, one for
            each parent.
        error: site-specific error rate (e.g. 0.002)
        threshold: probability threshold that we want to exceed. We look
            for an alt count where the probability exceeds this value. Assuming
            the value is 0.98, then 98% of cases will have a min depth less than
            the returned count.
    
    Returns:
        maximum permitted alternate depth across both parents.
    '''
    
    # convert int variables to a list, so we don't need code specific to ints
    try:
        depth = [int(depth), int(depth)]
    except TypeError:
        pass
    
    # raise an error if the length is not two.
    assert len(depth) == 2
    assert 0.0 < threshold < 1.0
    
    x = 0
    while True:
        product = 1
        for i in depth:
            product *= binom.sf(x, i, error)
        prob = 1 - product
        
        if prob > threshold:
            return(x)
        
        x += 1
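
A usage sketch with the docstring's example error rate (values are illustrative; assumes scipy.stats.binom is imported): parents sequenced at 30x and 40x with a site-specific error rate of 0.002.

cutoff = min_depth([30, 40], error=0.002, threshold=0.98)
print(cutoff)   # maximum permitted alternate depth across both parents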
Example No. 36
 def _calculate_quasi_p(self, i):
     """Calculates quasi-p values as discussed in Bryant and Lempert (2010).
     
     This is a one sided binomial test.
     
     Parameters
     ----------
     i : int
         the specific box in the peeling trajectory for which the quasi-p 
         values are to be calculated.
     
     Returns
     -------
     the quasi-p value
     """
     box_lim = self._box_lims[i]
     restricted_dims = list(determine_restricted_dims(
             box_lim,
             self.prim._box_init))
     
     # total nr. of cases in box
     Tbox = self.peeling_trajectory['mass'][i] * self.prim.n 
     
     # total nr. of cases of interest in box
     Hbox = self.peeling_trajectory['coverage'][i] * self.prim.t_coi  
     
     qp_values = {}
     
     for u in restricted_dims:
         temp_box = copy.deepcopy(box_lim)
         temp_box[u] = self._box_lims[0][u]
         
         indices = in_box(self.prim.x[self.prim.yi_remaining], 
                          temp_box)
         indices = self.prim.yi_remaining[indices]
         
         # total nr. of cases in box with one restriction removed
         Tj = indices.shape[0]  
         
         # total nr. of cases of interest in box with one restriction 
         # removed
         Hj = np.sum(self.prim.y[indices])
         
         p = Hj/Tj
         
         Hbox = int(Hbox)
         Tbox = int(Tbox)
         
         qp = binom.sf(Hbox-1, Tbox, p)
         qp_values[u] = qp
         
     return qp_values
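
The heart of the quasi-p computation is the one-sided binomial tail on the method's last lines; below is a standalone sketch with made-up counts (Hbox, Tbox, Hj, Tj are illustrative, not taken from a real peeling trajectory), assuming scipy.stats.binom is imported.

Hbox, Tbox = 45, 60            # cases of interest / total cases in the box (illustrative)
Hj, Tj = 300, 1000             # the same counts with one restriction removed (illustrative)
qp = binom.sf(Hbox - 1, Tbox, Hj / Tj)   # P(X >= Hbox) under p = Hj / Tj
print(qp)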
Example No. 37
def quartet_analysis(tree, quartets, perms):
	# generate p-value distribution using Bernoulli model
	numTopo1 = 0
	numTopo2 = 0
	numTopo3 = 0
	for perm in perms:
		nodeA = perm[0][0]
		nodeB = perm[0][1]
		nodeC = perm[1][0]
		nodeD = perm[1][1]
		distAB=tree.distance(nodeA, nodeB)
		distAC=tree.distance(nodeA, nodeC)
		distAD=tree.distance(nodeA, nodeD)
		if distAB == min(distAB, distAC, distAD):
			numTopo1+=1
		elif distAC == min(distAB, distAC, distAD):
			numTopo2+=1
		else:
			numTopo3+=1
	
	P = float(numTopo1) / len(perms)  # P is the binomial dist's p, estimated by permutations
	
	# count diff types of quartets
	if len(quartets) > 1000:
		selQ = random.sample(quartets, 1000)
	else:
		selQ = quartets
		
	numTopo1 = 0
	numTopo2 = 0
	numTopo3 = 0
	for quartet in selQ:
		nodeA = quartet[0][0]
		nodeB = quartet[0][1]
		nodeC = quartet[1][0]
		nodeD = quartet[1][1]
		distAB=tree.distance(nodeA, nodeB)
		distAC=tree.distance(nodeA, nodeC)
		distAD=tree.distance(nodeA, nodeD)
		if distAB == min(distAB, distAC, distAD):
			numTopo1+=1
		elif distAC == min(distAB, distAC, distAD):
			numTopo2+=1
		else:
			numTopo3+=1
	
	# calculate the p using binomial dist.
	p = binom.sf(numTopo1, len(selQ), P)  # one tailed p-val, binomial dist. survival function
	return p
Example No. 38
def prob_of_indel_with_error(input, soft_chr, soft_pos, prob):
	alignment = pysam.Samfile(input,'rb')
	total = alignment.count(soft_chr,soft_pos,soft_pos+1)
	try:
		reads = [read for read in alignment.fetch(soft_chr, soft_pos - 1, soft_pos + 2)]
	except ValueError:
		reads = [read for read in alignment.fetch(soft_chr, soft_pos, soft_pos + 1)]
	num_soft = 0
	for each in reads:
		if each.is_secondary or each.is_unmapped:
			continue
		soft_len, soft_qual, soft_pos_read = get_softclip_length(each)
		if soft_len != 0 and abs(soft_pos_read - soft_pos) < 2: # +/- 1bp matching
			num_soft += 1
	return binom.sf(num_soft - 1, total, prob)
Example No. 39
def prop_test(df):
    """
    Inspired by the R package caret's confusionMatrix.R
    """
    from scipy.stats import binom

    x = np.diag(df).sum()
    n = df.sum().sum()
    p = (df.sum(axis=0) / df.sum().sum()).max()
    d = {
        "statistic": x,  # number of successes
        "parameter": n,  # number of trials
        "null.value": p,  # probability of success
        "p.value": binom.sf(x - 1, n, p),  # see https://en.wikipedia.org/wiki/Binomial_test
    }
    return(d)
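
A usage sketch with a hypothetical 3x3 confusion matrix (assumes numpy and pandas are imported as np and pd): the test asks whether the observed accuracy beats the largest class share (the no-information rate).

# Illustrative confusion matrix; rows are predictions, columns are reference classes.
cm = pd.DataFrame([[50, 2, 3],
                   [4, 40, 6],
                   [1, 5, 39]])
print(prop_test(cm))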
Example No. 40
 def mask(self, count_threshold = 20, impute_threshold = 0.5):
     """ Mask locations that aren't heterozygotes and the loctions that
     don't meet a read count threshold.
     """
     try:
         # Reducing the dataframe based on annotations
         ref_tup = zip(self.annot.index, self.annot["REF"])
         alt_tup = zip(self.annot.index, self.annot["ALT"])
         ref = self.df.ix[ref_tup]
         alt = self.df.ix[alt_tup]
         # Need to collapse the multi index
         # :TODO find a more rigorous way to do this
         ref.index = self.genotypes.index
         alt.index = self.genotypes.index
         hets = np.logical_or(self.genotypes < 0.5, self.genotypes > 1.5)
         sums = (ref + alt) < count_threshold
         ref[np.logical_or(hets, sums)] = np.NaN
         alt[np.logical_or(hets, sums)] = np.NaN
         self.binom_p = pd.DataFrame(binom.sf(alt - 1, ref + alt, 0.5), columns=self.df.columns,index=ref.index)
         self.ratio = ref/((ref+alt)/float(1))
     except AttributeError:
         print("Need to run set_annotations and set_genotyeps first")
Example No. 41
### Question 3

'''
(a)
Generate 1,000 random numbers from a Binomial distribution with n=44 and p=0.64.
'''
n = 1000
binomial = bn.BinomialDistribution(0.64, 44)
binomial.generateBinomialDistribution(n)
print('Q3. a) Binomial Mean %f' % binomial.meanRdmNumber())
print('       Binomial Std  %f' % binomial.stdDeviationRdmNumber())
'''
(b) Draw the histogram. Compute the probability that a random variable X with a Binomial(44, 0.64) distribution is at least 40: P(X >= 40).
Use any statistics textbook or online resource for the exact value of the above probability, compare it with your finding, and comment.
'''


binomialdist = binomial.getBinomialDistribution()
bins = np.linspace(0, 44, 45)
plt.hist(binomialdist, bins, alpha=0.5)
plt.title("Histogram of Binomial distribution(0.64,44)")
plt.xlabel("Random Number")
plt.ylabel("Frequency")
plt.show()

cdf_binfunction = binomial.getCumulativeDistribution(40)
print('Q3. b) My Binomial P(X>= 40) %f' % (1-cdf_binfunction))

print('       Built in Binomial P(X>=40) %f ' % binom.sf(39, 44, 0.64))
 def is_significant(self, alpha=0.05):
     Ny = len(self.id_list)
     N = Ny + len(self.id_list_negated)
     p_value = binom.sf(Ny, N, self.mcs)
     # print Ny, N, self.mcs, p_value
     return p_value < alpha
 def get_p_value(self):
     Ny = len(self.id_list)
     N = Ny + len(self.id_list_negated)
     p_value = binom.sf(Ny, N, self.mcs)
     return p_value
Example No. 44
def alleles(k, N):
    prob = binom.sf(N-1, 2**k, .25, loc=0) #1-cdf, where cdf is P<=quantile
    print prob
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('Solarize_Light2')
from scipy.special import comb
from scipy.stats import binom
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier  # import truncated in the original; DecisionTreeClassifier assumed
# %%
error_range = np.arange(0.0,1.01,0.01)
n_classifier = 11
np.ceil(n_classifier/2)
ens_errors=binom.sf(n_classifier - np.ceil(n_classifier/2),n_classifier,error_range)
plt.plot(error_range,ens_errors,linewidth=2,label= 'Ensemble Errors')
plt.plot(error_range,error_range,linestyle = '--',label = 'Base error')
plt.legend(loc='upper left')
plt.show()
# %%
# start writing the majority vote classifier
# class MajorityVoteClassifier_sush():
#     return None
# %%
iris = datasets.load_iris()
X,y = iris.data[50:,[1,2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.5,random_state = 1,stratify =y)
Example No. 46
def main():
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % os.environ['HOME'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    ############################################
    # GFF filter
    ############################################
    # filter TEs and features by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter feature GFF
        feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
        subprocess.call('intersectBed -u -f 0.5 -a %s -b %s > %s' % (feature_gff, filter_merged_bed_file, feature_gff_gff_file), shell=True)
        feature_gff = feature_gff_gff_file

    ############################################
    # lengths
    ############################################
    # compute feature length
    feature_len, feature_num = feature_stats(feature_gff)    

    if feature_num == 0:
        print >> sys.stderr, 'Zero features'
        exit()

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, feature_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    te_lengths = te_target_size(options.repeats_gff, feature_len)

    ############################################
    # hash TE/feature overlaps
    ############################################
    # initialize
    te_features = {}
    for rep, fam in te_lengths:
        if options.strand_split:
            te_features[(rep+'+',fam)] = set()
            te_features[('*+',fam)] = set()
            te_features[('*+','*')] = set()
            te_features[(rep+'-',fam)] = set()
            te_features[('*-',fam)] = set()
            te_features[('*-','*')] = set()
        else:
            te_features[(rep,fam)] = set()
            te_features[('*',fam)] = set()
            te_features[('*','*')] = set()
        
    p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (options.repeats_gff,feature_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        a = line.split('\t')
        
        kv = gff.gtf_kv(a[8])
        rep = kv['repeat']
        fam = kv['family']

        fchrom = a[9]
        fstart = int(a[12])
        fend = int(a[13])

        rep_star = '*'
        if options.strand_split:
            tstrand = a[6]
            fstrand = a[15]
            if tstrand == fstrand:
                rep += '+'
                rep_star += '+'
            else:
                rep += '-'
                rep_star += '-'

        te_features[(rep,fam)].add((fchrom,fstart,fend))
        te_features[(rep_star,fam)].add((fchrom,fstart,fend))
        te_features[(rep_star,'*')].add((fchrom,fstart,fend))

    p.communicate()

    ############################################
    # compute stats and print
    ############################################
    lines = []
    p_vals = []
    for te in te_features:
        rep, fam = te

        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
            te_p = float(te_len) / (2*genome_length)
        else:
            te_len = te_lengths[(rep,fam)]
            te_p = float(te_len) / genome_length
        
        te_count = len(te_features.get(te,[]))
        exp_count = te_p * feature_num

        fold_change = te_count / exp_count

        if fold_change > 1:
            p_val = binom.sf(te_count-1, feature_num, te_p)
        else:
            p_val = binom.cdf(te_count, feature_num, te_p)
        
        p_vals.append(p_val)

        cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
        lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

    # correct for multiple hypotheses correction
    q_vals = fdr.ben_hoch(p_vals)
    for i in range(len(lines)):
        qline = lines[i] + ' %10.2e' % q_vals[i]
        print qline

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)
        os.close(feature_gff_gff_fd)
        os.remove(feature_gff_gff_file)
Example No. 47
def create_plot(
    meme_file,
    motif_number,
    flanking_sites,
    sample_phylop_file,
    control_phylop_file,
    sample_gerp_file,
    control_gerp_file,
    peak_file,
    fimo_file,
    annotate,
):
    handle = open(meme_file)
    records = motifs.parse(handle, "meme")
    record = records[motif_number - 1]
    num_occurrences = getattr(record, "num_occurrences", "Unknown")
    sample_phylo_data = None
    control_phylo_data = None
    sample_gerp_data = None
    control_gerp_data = None
    annotate_dict = None
    if annotate == "" or annotate == " ":
        annotate = None
    elif annotate:
        with open(annotate) as f:
            annotate_dict = json.load(f)

    handle = open(sample_phylop_file, "r")
    sample_phylo_data = csv.reader(handle, delimiter="\t")

    handle = open(control_phylop_file, "r")
    control_phylo_data = csv.reader(handle, delimiter="\t")

    if sample_gerp_file and control_gerp_file:

        handle = open(sample_gerp_file, "r")
        sample_gerp_data = csv.reader(handle, delimiter="\t")

        handle = open(control_gerp_file, "r")
        control_gerp_data = csv.reader(handle, delimiter="\t")

    sample_phylo_scores = []
    for line in sample_phylo_data:
        sample_phylo_scores.append(float(line[1]))
    control_phylo_scores = []
    for line in control_phylo_data:
        control_phylo_scores.append(float(line[1]))

    if sample_gerp_data:
        sample_gerp_scores = []
        for line in sample_gerp_data:
            sample_gerp_scores.append(float(line[1]))
        control_gerp_scores = []
        for line in control_gerp_data:
            control_gerp_scores.append(float(line[1]))

    assert len(sample_phylo_scores) == len(control_phylo_scores)

    handle.close()
    profile = position_wise_profile(getattr(record, score_type), record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    ## motif_scores is an array of scores of the max-occurring base at each position of the motif
    motif_scores = []
    for position in max_occur:
        motif_scores.append(position[0][1])

    motif_scores = np.asarray(motif_scores)
    sample_phylo_scores = np.asarray(sample_phylo_scores)
    control_phylo_scores = np.asarray(control_phylo_scores)
    if sample_gerp_data:
        sample_gerp_scores = np.asarray(sample_gerp_scores)
        control_gerp_scores = np.asarray(control_gerp_scores)

    motif_junk = [0 for i in range(0, flanking_sites)]
    motif_junk = np.asarray(motif_junk)
    motif_concat = np.concatenate((motif_junk, motif_scores))
    motif_concat = np.concatenate((motif_concat, motif_junk))

    ##Mean of flanking sites
    ms_p = np.mean(np.concatenate((sample_phylo_scores[0:flanking_sites], sample_phylo_scores[-flanking_sites:])))
    mc_p = np.mean(np.concatenate((control_phylo_scores[0:flanking_sites], control_phylo_scores[-flanking_sites:])))

    if sample_gerp_data:
        ms_g = np.mean(np.concatenate((sample_gerp_scores[0:flanking_sites], sample_gerp_scores[-flanking_sites:])))
        mc_g = np.mean(np.concatenate((control_gerp_scores[0:flanking_sites], control_gerp_scores[-flanking_sites:])))
        flanking_sample_gerp_scores = np.concatenate(
            (sample_gerp_scores[0:flanking_sites], sample_gerp_scores[-flanking_sites:])
        )
        flanking_control_gerp_scores = np.concatenate(
            (control_gerp_scores[0:flanking_sites], control_gerp_scores[-flanking_sites:])
        )
        motif_control_gerp_scores = control_gerp_scores[flanking_sites:-flanking_sites]
        motif_sample_gerp_scores = sample_gerp_scores[flanking_sites:-flanking_sites]

    flanking_sample_phylo_scores = np.concatenate(
        (sample_phylo_scores[0:flanking_sites], sample_phylo_scores[-flanking_sites:])
    )
    flanking_control_phylo_scores = np.concatenate(
        (control_phylo_scores[0:flanking_sites], control_phylo_scores[-flanking_sites:])
    )
    motif_control_phylo_scores = control_phylo_scores[flanking_sites:-flanking_sites]
    motif_sample_phylo_scores = sample_phylo_scores[flanking_sites:-flanking_sites]

    if flanking_sites > 0:
        shifted_sample_phylo_scores = sample_phylo_scores[flanking_sites:-flanking_sites] - ms_p
        shifted_control_phylo_scores = control_phylo_scores[flanking_sites:-flanking_sites] - mc_p
        if sample_gerp_data:
            shifted_sample_gerp_scores = sample_gerp_scores[flanking_sites:-flanking_sites] - ms_g
            shifted_control_gerp_scores = control_gerp_scores[flanking_sites:-flanking_sites] - mc_g
    else:
        shifted_sample_phylo_scores = sample_phylo_scores
        shifted_control_phylo_scores = control_phylo_scores
        if sample_gerp_data:
            shifted_sample_gerp_scores = sample_gerp_scores
            shifted_control_gerp_scores = control_gerp_scores

    pr_p = pearsonr(motif_scores, motif_sample_phylo_scores)
    if sample_gerp_data:
        pr_g = pearsonr(motif_scores, motif_sample_gerp_scores)

    ## H_0: Mean phylop scores for motif sites and flanking sites are the same
    ## H_1: Mean phylop score for motif sites > Mean phylop score of flanking sites
    ## NOTE: the perform_t_test function returns a two-tailed p-value for an independent t-test with unequal sample sizes and equal variances

    T_deltaphylop, p_deltaphylop = perform_t_test(motif_sample_phylo_scores, flanking_sample_phylo_scores)
    delta_phylop = np.mean(motif_sample_phylo_scores) - np.mean(
        flanking_sample_phylo_scores
    )  # -shifted_control_phylo_scores)
    if sample_gerp_data:
        T_deltagerp, p_deltagerp = perform_t_test(motif_sample_gerp_scores, flanking_sample_gerp_scores)
        delta_gerp = np.mean(motif_sample_gerp_scores) - np.mean(flanking_sample_gerp_scores)
        if T_deltagerp < 0:
            p_deltagerp = 1 - p_deltagerp * 0.5
        else:
            p_deltagerp = p_deltagerp * 0.5

    if T_deltaphylop < 0:
        p_deltaphylop = 1 - p_deltaphylop * 0.5
    else:
        p_deltaphylop = p_deltaphylop * 0.5

    ## Ordinary least square fit for phylop scores and motif_scores
    reg_phylop_sample = sm.OLS(motif_sample_phylo_scores, sm.add_constant(motif_scores)).fit()
    if len(reg_phylop_sample.params) < 2:
        y_reg_phylop_sample = motif_scores
    else:
        y_reg_phylop_sample = motif_scores * reg_phylop_sample.params[1] + reg_phylop_sample.params[0]
    reg_phylop_control = sm.OLS(motif_control_phylo_scores, sm.add_constant(motif_scores)).fit()
    if len(reg_phylop_control.params) < 2:
        y_reg_phylop_control = motif_scores
    else:
        y_reg_phylop_control = motif_scores * reg_phylop_control.params[1] + reg_phylop_control.params[0]

    if sample_gerp_data:
        reg_gerp_sample = sm.OLS(motif_sample_gerp_scores, sm.add_constant(motif_scores)).fit()
        if len(reg_gerp_sample.params) == 1:
            y_reg_gerp_sample = motif_scores
        else:
            y_reg_gerp_sample = motif_scores * reg_gerp_sample.params[1] + reg_gerp_sample.params[0]

        reg_gerp_control = sm.OLS(motif_control_gerp_scores, sm.add_constant(motif_scores)).fit()
        if len(reg_gerp_control.params) == 1:
            y_reg_gerp_control = motif_scores
        else:
            y_reg_gerp_control = motif_scores * reg_gerp_control.params[1] + reg_gerp_control.params[0]

    motif = record
    motif_length = motif.length
    meme_dir = os.path.dirname(meme_file)
    X = [40 + 15]  ## found by trial and error; the position for the first base logo
    logo = plt.imread(os.path.join(meme_dir, "logo{}.png".format(motif_number)))
    ## Generate all other X coordinates
    fs = flanking_sites
    for j in range(1, len(motif) + 2 * fs):
        t = X[j - 1] + a + 1.9
        X.append(t)
    motif_bits = []
    for i in range(0, motif.length):
        s = 0
        for base in bases:
            s = s + -motif.pwm[base][i] * log(motif.pwm[base][i], 2) if motif.pwm[base][i] != 0 else s
            s = 2 - s
        motif_bits.append(s)

    y_phylop_pixels = [__scale__ * x for x in sample_phylo_scores]  # [fs:-fs]]#[flanking_sites:-flanking_sites]]

    ##FIXME This is a big dirty hack to also generate plots for the reverse complement logo
    logo_name = ["logo{}.png".format(motif_number), "logo_rc{}.png".format(motif_number)]
    for ln in logo_name:
        if "rc" in ln:
            y_phylop_pixels.reverse()
        logo = plt.imread(os.path.join(meme_dir, ln))
        height_px = logo.shape[0]  # Should be 212

        if sample_gerp_data:
            if annotate:
                total_px = X[-1] + 8 * height_px + 140
                right = (8 * height_px + 10 + 140 - 0.2 * height_px) / total_px
            else:
                total_px = X[-1] + 6 * height_px + 140
                right = (6 * height_px + 10 + 140 - 0.2 * height_px) / total_px
        else:
            if annotate:
                total_px = X[-1] + 6 * height_px + 140
                right = (6 * height_px + 10 + 140 - 0.2 * height_px) / total_px
            else:
                total_px = X[-1] + 4 * height_px + 140
                right = (4 * height_px + 10 + 140 - 0.2 * height_px) / total_px

        figsize = (total_px / 100, (2 * height_px) / 100 + 0.6)

        gs = gridspec.GridSpec(2, 1)  # , width_ratios=[1, right], height_ratios=[1,1])
        gs.update(
            top=1.0, bottom=0.14, left=0.08, right=1 - right
        )  # , right=0.8)#, left=0.06)#, right=right, wspace=0.025, hspace=0.03, wd)
        f = plt.figure(figsize=figsize, dpi=dpi, facecolor="w", edgecolor="k")

        # logo_plot => motif logo
        # stem_plot => per-base PhyloP trend
        # phylop_scatter_plot / gerp_scatter_plot => conservation vs. motif-score scatter
        # enrichment_plot => motif position histogram
        logo_plot = plt.Subplot(f, gs[0])
        ##TODO Check this
        if motif_length > 45:
            XSCALE_FACTOR = motif_length / 1.9
            z = 2
        elif motif_length > 40:
            XSCALE_FACTOR = motif_length / 2.25
            z = 2.5
        elif motif_length > 36:
            XSCALE_FACTOR = motif_length / 1.95
            z = 2
        elif motif_length > 21:
            XSCALE_FACTOR = motif_length / 5
            z = 3
        else:
            XSCALE_FACTOR = 4.5
            z = 3

        logo_plot.imshow(
            logo, extent=[40 + 15 + z * (a + 1.9), logo.shape[1] + 15 + XSCALE_FACTOR * (a + 1.9), 0, logo.shape[0]]
        )
        logo_plot.set_axis_off()
        f.add_subplot(logo_plot)

        stem_plot = plt.Subplot(f, gs[1], sharex=logo_plot)
        markerline, stemlines, baseline = stem_plot.stem(
            X[:fs],
            [y for y in y_phylop_pixels[:fs]],
            markerfmt="_",
            linefmt="-",
            markerfacecolor=flankingstemcolor,
            color=greycolor,
        )
        setp(stemlines, "color", flankingstemcolor)
        setp(markerline, "markerfacecolor", flankingstemcolor)
        setp(markerline, "color", flankingstemcolor)
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(baseline, "linewidth", linewidth - 0.5)
        setp(markerline, "markeredgewidth", markeredgewidth)
        markerline, stemlines, baseline = stem_plot.stem(
            X[fs:-fs], [y for y in y_phylop_pixels[fs:-fs]], markerfmt="g_", linefmt="g-", basefmt="r-"
        )
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(markerline, "markeredgewidth", markeredgewidth)
        setp(baseline, "linewidth", linewidth - 0.5)
        markerline, stemlines, baseline = stem_plot.stem(
            X[-fs:],
            [y for y in y_phylop_pixels[-fs:]],
            markerfmt="_",
            linefmt="-",
            markerfacecolor=flankingstemcolor,
            color=greycolor,
        )
        setp(stemlines, "color", flankingstemcolor)
        setp(markerline, "markerfacecolor", flankingstemcolor)
        setp(stemlines, "linewidth", linewidth)
        setp(markerline, "markersize", markersize)
        setp(markerline, "markeredgewidth", markeredgewidth)
        setp(markerline, "color", flankingstemcolor)
        setp(baseline, "linewidth", linewidth - 0.5)

        indices_str = []
        indices1 = np.linspace(-fs, -1, 2)
        for i in indices1:
            indices_str.append("")
        indices2 = np.arange(0, len(X) - 2 * fs, 5)
        for i in indices2:
            indices_str.append("${}$".format(int(i) + 1))

        indices3 = np.linspace(motif_length, motif_length + fs - 1, 2)

        for i in indices3:
            indices_str.append("")

        indices12 = np.concatenate((indices1, indices2))
        indices = np.concatenate((indices12, indices3))
        xticks = [X[int(i) + fs] for i in indices]

        max_yticks = 3
        yloc = plt.MaxNLocator(max_yticks)
        stem_plot.yaxis.set_major_locator(yloc)

        # ticks_and_labels = np.linspace(1.02*min(min(y_phylop_pixels), -0.1), 1.02*max(y_phylop_pixels), num = 5, endpoint=True)
        # stem_plot.set_yticks(ticks_and_labels)
        # stem_plot.set_yticklabels(['$%.2f$' %x for x in ticks_and_labels])#(["%0.2f"%(min(y_phylop_pixels)/__scale__), "%0.2f"%(np.mean(y_phylop_pixels)/__scale__), "%0.2f"%(max(y_phylop_pixels)/__scale__)], fontsize=fontsize)
        stem_plot.set_xlabel("$\mathrm{Base}\ \mathrm{Position}$", fontsize=fontsize, fontweight="bold")
        stem_plot.set_xlim([1.2 * a, X[-1] + linewidth * 1.8])
        stem_plot.set_ylim([min(np.min(y_phylop_pixels), -0.01) - 0.03, max(np.max(y_phylop_pixels), 0.01)])
        stem_plot.get_xaxis().tick_bottom()
        stem_plot.get_yaxis().tick_left()
        stem_plot.set_xticks(xticks)
        stem_plot.set_xticklabels(indices_str, fontsize=fontsize)
        stem_plot.spines["top"].set_visible(False)
        stem_plot.spines["right"].set_visible(False)
        stem_plot.yaxis.set_ticks_position("left")
        stem_plot.xaxis.set_ticks_position("bottom")
        stem_plot.spines["left"].set_position("zero")
        # stem_plot.spines['bottom'].set_position(matplotlib.transforms.Bbox(array([[0.125,0.63],[0.25,0.25]])))
        stem_plot.get_yaxis().set_tick_params(direction="out")
        stem_plot.get_xaxis().set_tick_params(direction="out")
        stem_plot.tick_params(axis="y", which="major", pad=tickpad)
        stem_plot.tick_params(axis="x", which="major", pad=tickpad)
        stem_plot.tick_params("both", length=ticklength, width=2, which="major")
        stem_plot.set_ylabel("$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize)
        f.add_subplot(stem_plot)

        if sample_gerp_data:
            if annotate:
                gs1 = gridspec.GridSpec(2, 4, height_ratios=[1, 4], width_ratios=[1, 1, 1, 1])
                gerp_header_subplot_gs = gs1[0, 1]
                gerp_subplot_gs = gs1[1, 1]
                histogram_header_subplot_gs = gs1[0, 2]
                histogram_subplot_gs = gs1[1, 2]
                ann_header_subplot_gs = gs1[0, 3]
                ann_subplot_gs = gs1[1, 3]
            else:
                gs1 = gridspec.GridSpec(2, 3, height_ratios=[1, 4], width_ratios=[1, 1, 1])
                gerp_header_subplot_gs = gs1[0, 1]
                gerp_subplot_gs = gs1[1, 1]
                histogram_header_subplot_gs = gs1[0, 2]
                histogram_subplot_gs = gs1[1, 2]
        else:
            if annotate:
                gs1 = gridspec.GridSpec(2, 3, height_ratios=[1, 4], width_ratios=[1, 1, 1])
                histogram_header_subplot_gs = gs1[0, 1]
                histogram_subplot_gs = gs1[1, 1]
                ann_header_subplot_gs = gs1[0, 2]
                ann_subplot_gs = gs1[1, 2]
            else:
                gs1 = gridspec.GridSpec(2, 2, height_ratios=[1, 4], width_ratios=[1, 1])
                histogram_header_subplot_gs = gs1[0, 1]
                histogram_subplot_gs = gs1[1, 1]

        gs1.update(bottom=0.14, right=0.95, left=1 - right * 0.85, wspace=0.5)

        phlyop_plots_leg = plt.Subplot(f, gs1[0, 0], autoscale_on=True)
        pearsonr_pval = str("%.1g" % pr_p[1])
        if "e" in pearsonr_pval:
            pearsonr_pval += "}"
            pearsonr_pval = pearsonr_pval.replace("e", "*10^{").replace("-0", "-")
        score_pval = str("%.1g" % p_deltaphylop)
        if "e" in score_pval:
            score_pval += "}"
            score_pval = score_pval.replace("e", "*10^{").replace("-0", "-")
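        # The same "%.1g" -> LaTeX exponent conversion is repeated several times below; a small
        # helper along these lines (hypothetical, not part of the original script) could replace
        # the copies:
        #   def latex_sci(pval):
        #       s = "%.1g" % pval
        #       if "e" in s:
        #           s = s.replace("e", "*10^{").replace("-0", "-") + "}"
        #       return s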

        textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{Phylop}=%.2f$($p=%s$)\\~\\" % (
            pr_p[0],
            pearsonr_pval,
            delta_phylop,
            score_pval,
        )  # , reg_phylop_control.rsquared, num_occurrences*reg_phylop_control.params[1])
        txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
        phlyop_plots_leg.set_frame_on(False)
        phlyop_plots_leg.set_xticks([])
        phlyop_plots_leg.set_yticks([])
        phlyop_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
        f.add_subplot(phlyop_plots_leg)

        phylop_scatter_plot = plt.Subplot(f, gs1[1, 0], autoscale_on=True)
        fit = np.polyfit(motif_scores, motif_sample_phylo_scores, 1)
        fit_fn = np.poly1d(fit)

        phylop_scatter_plot.scatter(
            motif_scores, motif_sample_phylo_scores, color="g", s=[pointsize for i in motif_scores]
        )
        phylop_scatter_plot.plot(
            motif_scores,
            y_reg_phylop_sample,
            "g",
            motif_scores,
            fit_fn(motif_scores),
            color="g",
            linewidth=plot_linewidth,
        )
        phylop_scatter_plot.scatter(
            motif_scores, motif_control_phylo_scores, color=greycolor, s=[pointsize for i in motif_scores]
        )
        phylop_scatter_plot.plot(motif_scores, y_reg_phylop_control, color=greycolor, linewidth=plot_linewidth)

        ticks_and_labels = np.linspace(1.02 * min(motif_scores), 1.02 * max(motif_scores), num=5, endpoint=True)
        phylop_scatter_plot.set_xticks(ticks_and_labels)
        ticks_and_labels = ["$%.2f$" % (x / num_occurrences) for x in ticks_and_labels]
        phylop_scatter_plot.set_xticklabels(ticks_and_labels)

        ##max_xticks = 5
        ##xloc = plt.MaxNLocator(max_xticks)
        ##print xloc
        ##phylop_scatter_plot.xaxis.set_major_locator(xloc)
        # ticks_and_labels = np.linspace(1.02*min(min(shifted_sample_phylo_scores), min(shifted_control_phylo_scores)), 1.02*max(max(shifted_sample_phylo_scores),max(shifted_control_phylo_scores)),
        # num = 4, endpoint=True)
        # phylop_scatter_plot.set_yticks(ticks_and_labels)
        # phylop_scatter_plot.set_yticklabels(["$%0.2f$"%x for x in ticks_and_labels])
        max_yticks = 4
        yloc = plt.MaxNLocator(max_yticks)
        phylop_scatter_plot.yaxis.set_major_locator(yloc)
        phylop_scatter_plot.set_xlabel("$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
        phylop_scatter_plot.get_xaxis().tick_bottom()
        phylop_scatter_plot.get_yaxis().tick_left()
        phylop_scatter_plot.set_ylabel("$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize, fontweight="bold")
        phylop_scatter_plot.tick_params(axis="y", which="major", pad=tickpad)
        phylop_scatter_plot.tick_params(axis="x", which="major", pad=tickpad)
        phylop_scatter_plot.get_yaxis().set_tick_params(direction="out")
        phylop_scatter_plot.get_xaxis().set_tick_params(direction="out")
        phylop_scatter_plot.tick_params("both", length=ticklength, width=2, which="major")

        f.add_subplot(phylop_scatter_plot)

        if sample_gerp_data:
            gerp_plots_leg = plt.Subplot(f, gerp_header_subplot_gs, autoscale_on=True)
            gerp_plots_leg.set_frame_on(False)
            gerp_plots_leg.set_xticks([])
            gerp_plots_leg.set_yticks([])
            pearsonr_pval = str("%.1g" % pr_g[1])
            if "e" in pearsonr_pval:
                pearsonr_pval += "}"
                pearsonr_pval = pearsonr_pval.replace("e", "*10^{").replace("-0", "-")
            score_pval = str("%.1g" % p_deltagerp)
            if "e" in score_pval:
                score_pval += "}"
                score_pval = score_pval.replace("e", "*10^{").replace("-0", "-")
            textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{{Gerp}}=%.2f$($p=%s$)\\~\\" % (
                pr_g[0],
                pearsonr_pval,
                delta_gerp,
                score_pval,
            )
            txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
            gerp_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
            f.add_subplot(gerp_plots_leg)

            gerp_scatter_plot = plt.Subplot(f, gerp_subplot_gs, autoscale_on=True)
            gerp_scatter_plot.scatter(
                motif_scores, motif_sample_gerp_scores, color="g", s=[pointsize for i in motif_scores]
            )
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_sample, color="g", linewidth=plot_linewidth)
            gerp_scatter_plot.scatter(
                motif_scores, motif_control_gerp_scores, color=greycolor, s=[pointsize for i in motif_scores]
            )
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_control, color=greycolor, linewidth=plot_linewidth)
            ticks_and_labels = np.linspace(1.02 * min(motif_scores), 1.02 * max(motif_scores), num=5, endpoint=True)
            gerp_scatter_plot.set_xticks(ticks_and_labels)
            ticks_and_labels = ["$%.2f$" % (x / num_occurrences) for x in ticks_and_labels]
            gerp_scatter_plot.set_xticklabels(ticks_and_labels)

            ##max_xticks = 5
            ##xloc = plt.MaxNLocator(max_xticks)
            ##gerp_scatter_plot.xaxis.set_major_locator(xloc)
            max_yticks = 4
            yloc = plt.MaxNLocator(max_yticks)
            gerp_scatter_plot.yaxis.set_major_locator(yloc)
            gerp_scatter_plot.set_xlabel("$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
            gerp_scatter_plot.set_ylabel("$\mathrm{GERP}\ \mathrm{Score}$", fontsize=fontsize, fontweight="bold")
            gerp_scatter_plot.get_xaxis().tick_bottom()
            gerp_scatter_plot.get_yaxis().tick_left()
            gerp_scatter_plot.get_yaxis().set_tick_params(direction="out")
            gerp_scatter_plot.get_xaxis().set_tick_params(direction="out")
            gerp_scatter_plot.tick_params(axis="y", which="major", pad=tickpad)
            gerp_scatter_plot.tick_params(axis="x", which="major", pad=tickpad)
            gerp_scatter_plot.tick_params("both", length=ticklength, width=2, which="major")
            f.add_subplot(gerp_scatter_plot)

        enrichment_plot4 = plt.Subplot(f, histogram_header_subplot_gs, autoscale_on=True)
        enrichment_plot4.set_frame_on(False)
        enrichment_plot4.set_xticks([])
        enrichment_plot4.set_yticks([])
        all_distances = get_motif_distances(peak_file, fimo_file)
        fimo_dir = os.path.dirname(fimo_file)
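        # Enrichment of motif sites near the peak center: compare counts within +/-100 bp
        # against counts 100-200 bp away, using a binomial null in which a site lands in the
        # +/-100 bp window with probability ~200/(ENRICHMENT_SEQ_LENGTH - motif_length).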
        motifs_within_100 = [x for x in all_distances if -100 <= x <= 100]
        motifs_within_100_200 = [x for x in all_distances if 100 < abs(x) < 200]
        if len(motifs_within_100_200) > 0:
            enrichment = len(motifs_within_100) / float(len(motifs_within_100_200))
        else:
            enrichment = 1
        number_of_sites = len(motifs_within_100) + len(motifs_within_100_200)  # fimo_sites_intersect(parsed.fimo_file)
        probability = 200.0 / (ENRICHMENT_SEQ_LENGTH - motif_length)
        enrichment_pval = binom.sf(len(motifs_within_100), number_of_sites, probability)
        enrichment_pval = str("%.1g" % enrichment_pval)
        if "e" in enrichment_pval:
            enrichment_pval += "}"
            enrichment_pval = enrichment_pval.replace("e", "*10^{").replace("-0", "-")
        textstr = r"\noindent$Enrichment={0:.2f}$\\~\\$(p={1})$".format(enrichment, enrichment_pval)
        txtx = 0.1 * len(textstr) / 100.0
        enrichment_plot4.text(txtx, txty, textstr, fontsize=legend_fontsize)
        f.add_subplot(enrichment_plot4)
        enrichment_plot = plt.Subplot(f, histogram_subplot_gs, autoscale_on=True)
        enrichment_plot.hist(all_distances, histogram_nbins, color="white", alpha=0.8, range=[-200, 200])
        enrichment_plot.set_xticks([-200, -100, 0, 100, 200])
        max_yticks = 3
        yloc = plt.MaxNLocator(max_yticks)
        enrichment_plot.yaxis.set_major_locator(yloc)
        # enrichment_plot.set_yticks(range(1,6))
        ticks_and_labels = [-200, -100, 0, 100, 200]
        all_distances = np.asarray(all_distances)
        enrichment_plot.set_xticklabels(["${}$".format(x) for x in ticks_and_labels])
        enrichment_plot.tick_params(axis="y", which="major", pad=tickpad)
        enrichment_plot.tick_params(axis="x", which="major", pad=tickpad)
        enrichment_plot.tick_params("both", length=ticklength, width=2, which="major")
        enrichment_plot.get_xaxis().tick_bottom()
        enrichment_plot.get_yaxis().tick_left()
        enrichment_plot.get_yaxis().set_tick_params(direction="out")
        enrichment_plot.get_xaxis().set_tick_params(direction="out")
        enrichment_plot.axvline(x=-100, linewidth=3, color="red", linestyle="-.")
        enrichment_plot.axvline(x=100, linewidth=3, color="red", linestyle="-.")
        f.add_subplot(enrichment_plot)
        if "rc" not in ln:
            out_file = os.path.join(fimo_dir, "motif{}Combined_plots.png".format(motif_number))
            out_file = "motif{}Combined_plots.png".format(motif_number)
        else:
            out_file = os.path.join(fimo_dir, "motif{}Combined_plots_rc.png".format(motif_number))
            out_file = "motif{}Combined_plots_rc.png".format(motif_number)

        if annotate:
            filename = r"$" + annotate[0] + "$"
            try:
                a_motif = r"$" + annotate[1] + "$"
            except IndexError:
                a_motif = ""
            try:
                cell_line = r"$" + annotate[2] + "$"
            except IndexError:
                cell_line = ""
            try:
                assay = r"$" + annotate[3] + "$"
            except IndexError:
                assay = ""

            # data = [[r'$Filename$', filename], [r'$Motif$', a_motif], [r'$Cell\ Line$', cell_line], [r'Assay', assay]]
            keys = ["title", "gene_name", "dataset", "assembly"]
            data = [[r"$" + key.replace("_", " ").upper() + "$", r"$" + annotate_dict[key] + "$"] for key in keys]
            ann_header = plt.Subplot(f, ann_header_subplot_gs, autoscale_on=True)
            ann_header.set_frame_on(False)
            ann_header.set_xticks([])
            ann_header.set_yticks([])
            f.add_subplot(ann_header)
            textstr = r"$Metadata$"
            txtx = 1.7 * len(textstr) / 100.0
            ann_header.text(txtx, txty, textstr, fontsize=legend_fontsize)
            ann_plot = plt.Subplot(f, ann_subplot_gs, autoscale_on=True)
            ann_plot.set_xticks([])
            ann_plot.set_yticks([])
            ann_plot.set_frame_on(False)
            table = ann_plot.table(cellText=data, loc="center")
            table.scale(1, 2)
            fontproperties = FontProperties(size=legend_fontsize * 8)  # , family='serif' )
            for key, cell in table.get_celld().items():
                row, col = key
                if row > 0 and col > 0:
                    cell.set_text_props(fontproperties=fontproperties)

            table.set_fontsize(legend_fontsize * 8)
            f.add_subplot(ann_plot)

        f.savefig(out_file, dpi=dpi)
Exemplo n.º 48
0
	def codegen_range_checks(self, probs, failchance):
		# probs:  List of tuples of (probability, orders)
		#   where probability: float in range [0., 1.]
		#   and orders: list of strings representing selection orders with this probability.
		#
		# failchance:  Chance of Type I error (i.e. chance of a test failing for working code
		#   due to random chance alone).  Too small of a magnitude results in worthless tests
		#   unless the number of samples is very large to compensate.
		#
		# Output:
		#   Prints code for verifying that the number of samples counted for each selection order
		#   falls within acceptable bounds.
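		#
		# Hypothetical usage sketch (values invented for illustration): a weighted three-way
		# choice in which order 'a' is expected half the time and 'b'/'c' a quarter each
		# could be checked with
		#   self.codegen_range_checks(probs=[(0.50, ['a']), (0.25, ['b', 'c'])], failchance=1e-9)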
		self.assertIsNot(self.status, self.STATUS_PREPARE, 'codegen should only be done after self.prepare()')

		self.assertAlmostEqual(1., sum(sorted(p*len(os) for (p,os) in probs)), msg='total probability should be 1')

		allorders = []
		for p,os in probs: allorders.extend(os)
		self.assertEqual(len(allorders), len(set(allorders)), 'all orders in probs should be unique')

		# find acceptable ranges for each item
		N = self.nsamples
		ranges = []
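		# lo is the smallest count whose lower-tail mass reaches failchance, and hi the largest
		# count whose upper-tail mass still exceeds it, so each tail is missed with probability
		# on the order of failchance (approximate only, since the distribution is discrete).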
		for prob, orders in probs:
			cdfvals = binom.cdf(np.arange(N+1), N, prob)
			sfvals  = binom.sf(N-np.arange(N+1), N, prob) # in backwards (increasing) order for bisect

			ranges.append((
				bisect_left(cdfvals, failchance),     # lo
				N - bisect_right(sfvals, failchance)  # hi
			))

		# format string for range (align the numbers)
		lolen = max(len(str(lo)) for (lo,hi) in ranges)
		hilen = max(len(str(hi)) for (lo,hi) in ranges)
		range_fmt = '({:%dd}, {:%dd})' % (lolen, hilen)

		# sort descendingly by hi for easier comparison of ranges
		zipped = [(p,o,l,h) for (p,o),(l,h) in zip(probs,ranges)]
		zipped.sort(key=lambda tup: -tup[3])
		probs,ranges = zip(*[((p,o),(l,h)) for p,o,l,h in zipped])

		# go go gadget obnoxiously large comment
		print('######################################################')
		print('# BEGIN CODE AUTOGENERATED BY codegen_range_checks()')
		print('# Parameters:')
		print('#    Number of samples: {:d}'.format(self.nsamples))
		print('#    Chance of spontaneous failure: ~{:g}'.format(failchance))
		print('#')
		print('# The first two numbers of each line are the "range" of accepted counts in the')
		print('#  final distribution. Ideally you want to MINIMIZE OVERLAP between ranges for')
		print('#  different rates of occurrence (the comment in each line).')
		print('#')
		print('# The overlap can be reduced by increasing the number of samples, or by increasing')
		print('#  failchance by a few orders of magnitude.')

		PER_LINE = 6
		for (prob, orders), (lo, hi) in zip(probs, ranges):
			# put orders into compact string form in case they have been tuplefied
			orders = [''.join(x) for x in orders]

			for i in range(0, len(orders), PER_LINE):
				range_args = range_fmt.format(lo, hi)
				order_args = ', '.join(repr(x) for x in orders[i:i+PER_LINE]) # repr to quote
				print('self.validate_range({}, {}) # p = {:0.4f}'.format(range_args, order_args, prob))

		print('#            END AUTOGENERATED CODE')
		print('######################################################')
Exemplo n.º 49
0
def main():
    usage = 'usage: %prog [options] <bam_file,bam_file2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a BAM file.')
    else:
        bam_files = args[0].split(',')

    control_bam_files = []
    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # GFF filter
    ############################################
    # filter TEs and read alignments by gff file
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

        # filter TE GFF
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
        subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
        options.repeats_gff = te_gff_file

        # filter BAM
        bam_gff_fds = [None]*len(bam_files)
        bam_gff_files = [None]*len(bam_files)
        for i in range(len(bam_files)):
            bam_gff_fds[i], bam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
            bedtools.abam_f1(bam_files[i], filter_merged_bed_file, bam_gff_files[i])
            bam_files[i] = bam_gff_files[i]

        # filter control BAM
        if control_bam_files:
            cbam_gff_fds = [None]*len(control_bam_files)
            cbam_gff_files = [None]*len(control_bam_files)
            for i in range(len(control_bam_files)):
                cbam_gff_fds[i], cbam_gff_files[i] = tempfile.mkstemp(dir='%s/research/scratch/temp' % os.environ['HOME'])
                bedtools.abam_f1(control_bam_files[i], filter_merged_bed_file, cbam_gff_files[i])
                control_bam_files[i] = cbam_gff_files[i]

    ############################################
    # lengths
    ############################################
    # estimate read length (just averaging across replicates for now)
    read_lens = []
    for bam_file in bam_files:
        read_lens.append(estimate_read_length(bam_file))
    read_len = stats.mean(read_lens)

    # compute size of search space
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
    else:
        genome_length = count_hg19()

    # hash counted repeat genomic bp
    if options.filter_gff:
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        te_lengths = te_target_size(options.repeats_gff, read_len)

    ############################################
    # count TE fragments
    ############################################
    fragments = []
    te_fragments = []
    for bam_file in bam_files:
        rep_fragments, rep_te_fragments = count_te_fragments(bam_file, options.repeats_gff, options.strand_split)
        fragments.append(rep_fragments)
        te_fragments.append(rep_te_fragments)

    if control_bam_files:        
        control_fragments = []
        control_te_fragments = []
        for control_bam_file in control_bam_files:
            rep_fragments, rep_te_fragments = count_te_fragments(control_bam_file, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_fragments)
            control_te_fragments.append(rep_te_fragments)

    ############################################
    # combine replicates into fragment rates
    ############################################
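    # Per-replicate TE fragment rates are combined with a geometric mean; a TE with no
    # fragments in a replicate contributes a pseudo-count of 1.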
    te_fragment_rates = {}
    for (rep,fam) in te_lengths:
        if options.strand_split:
            # positive
            rate_list = [te_fragments[i].get((rep+'+',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'+',fam)] = stats.geo_mean(rate_list)
            # negative
            rate_list = [te_fragments[i].get((rep+'-',fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep+'-',fam)] = stats.geo_mean(rate_list)
        else:
            rate_list = [te_fragments[i].get((rep,fam),1)/float(fragments[i]) for i in range(len(bam_files))]
            te_fragment_rates[(rep,fam)] = stats.geo_mean(rate_list)

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            rate_list = [control_te_fragments[i].get(te,1)/float(control_fragments[i]) for i in range(len(control_bam_files))]
            control_te_fragment_rates[te] = stats.geo_mean(rate_list)

    ############################################
    # compute stats, print table
    ############################################
    for (rep,fam) in te_fragment_rates:
        # compute TE length
        if options.strand_split:
            te_len = te_lengths[(rep[:-1],fam)]
        else:
            te_len = te_lengths[(rep,fam)]

        # parameterize null model
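        # The null fragment rate is either the empirical rate from the control libraries or
        # the TE's mappable length divided by the search space (twice the genome length when
        # splitting by strand).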
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[(rep,fam)]
        else:
            if options.strand_split:
                null_rate = float(te_lengths[(rep[:-1],fam)]) / (2*genome_length)
            else:
                null_rate = float(te_lengths[(rep,fam)]) / genome_length

        # compute fragment counts
        count = te_fragment_rates[(rep,fam)]*sum(fragments)
        null_count = null_rate*sum(fragments)

        # compute fold change
        if null_rate > 0:
            fold = te_fragment_rates[(rep,fam)]/null_rate
        else:
            fold = 0

        # compute p-value of enrichment/depletion
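        # Each replicate contributes a one-sided binomial tail (upper tail for enrichment,
        # lower tail for depletion); the per-replicate p-values are simply multiplied rather
        # than combined with a formal method such as Fisher's.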
        p_val = 1.0
        for i in range(len(bam_files)):
            if te_fragment_rates[(rep,fam)] > null_rate:            
                p_val *= binom.sf(int(te_fragments[i].get((rep,fam),1))-1, int(fragments[i]), null_rate)
            else:
                p_val *= binom.cdf(int(te_fragments[i].get((rep,fam),1)), int(fragments[i]), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print '%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols

    ############################################
    # clean
    ############################################
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        for i in range(len(bam_files)):
            os.close(bam_gff_fds[i])
            os.remove(bam_gff_files[i])

        if options.control_bam_files:
            for i in range(len(control_bam_files)):
                os.close(cbam_gff_fds[i])
                os.remove(cbam_gff_files[i])
Exemplo n.º 50
0
w=open('political_relevants.txt','w')

for nd in G.nodes():
    if nd not in ['red', 'blue']:
        Cr1, Cr2 = False, False
        # criterion one: at least 10 political books
        uni_books = len(G.node[nd]['pol_books'])
        if uni_books >= 10:
            Cr1 = True

        # criterion two: one-sided binomial test on links to the 'red'/'blue' seed nodes
        p = float(pol_size) / (base_nodes - G.node[nd]['size'])
        n = G.node[nd]['cops']
        x = G[nd]['red']['cops'] + G[nd]['blue']['cops']
        p_value = binom.sf(x, n, p)  # P(X > x) under the null rate p
        if p_value < 0.05:
            Cr2 = True

        if Cr1==True and Cr2==True:
            #print index_category[nd], stripping(index_category[nd])
            scale = (G[nd]['red']['strength']/red_books) / (G[nd]['red']['strength']/red_books + G[nd]['blue']['strength']/blue_books) # disregard this scale measure.
            w.write(str(nd)+'\t'+str(x)+'\t'+str(G[nd]['red']['cops'])+'\t'+str(G[nd]['blue']['cops'])+'\t'+str(scale)+
            '\t'+str(n)+'\t'+str(G.node[nd]['size'])+'\t'+str(H)+'\t'+str(uni_books)+'\t'+str(format(p_value, '.10f'))+'\t'+stripping(index_category[nd])+'\n')
w.close()





Exemplo n.º 51
0
def compPairs(fname, minScoreDiff):
    print "Comparing guides from %s, minimum score difference %f" % (fname, minScoreDiff)
    byGene = defaultdict(list) # dict gene -> list of (guideName, modFreq, scores)
    for row in iterTsvRows(fname):
        if float(row.modFreq)==0.0:
            continue
        gene = row.guide.split("-")[0]
        scores = {}
        scores["doench"] = float(row.doench)
        scores["ssc"] = float(row.ssc)
        scores["svm"] = float(row.svm)
        chariRaw, chariRank = lookupchariScore(row.extSeq[4:27])
        scores["chariRaw"] = chariRaw
        scores["chariRank"] = chariRank
        byGene[gene].append( (row.guide, float(row.modFreq), scores) )

    # keep only genes with two guides
    twoGuides = dict()
    for gene, guideList in byGene.iteritems():
        if len(guideList)==2:
            twoGuides[gene]=guideList
        elif len(guideList)>2:
            guideList.sort(key=operator.itemgetter(1))
            twoGuides[gene]=(guideList[0], guideList[1])
        else:
            continue

    # for each gene, test if the order of the modFreq scores is the same as the order of the scores
    scoreNames = ["doench", "ssc", "svm", "chariRaw"]
    okCounts = defaultdict(int)
    for gene, guidePair in twoGuides.iteritems():
        guide1, guide2 = guidePair
        guide1Name, guide2Name = guide1[0], guide2[0]
        freq1, freq2 = guide1[1], guide2[1]
        if abs(freq2-freq1) < minScoreDiff:
            #print abs(freq2-freq1), "is <0.1"
            logging.debug("difference not high enough")
            continue
            
        okCounts["all"] += 1
        scores1, scores2 = guide1[2], guide2[2]
        logging.debug("guides (%s, %s), modFreq (%f, %f), doench (%f,%f), ssc (%f,%f)" % (guide1Name, guide2Name, freq1, freq2, scores1["doench"], scores2["doench"], scores1["ssc"], scores2["ssc"]))
        anyOk = False
        if freq2 > freq1:
            for scoreName in scoreNames:
                if scores2[scoreName] > scores1[scoreName]:
                    logging.debug( scoreName+ " OK")
                    okCounts[scoreName] += 1
                    anyOk = True
        else:
            for scoreName in scoreNames:
                if scores2[scoreName] < scores1[scoreName]:
                    logging.debug( scoreName+ " OK")
                    okCounts[scoreName] += 1
                    anyOk = True
        if not anyOk:
            logging.debug( "No score was OK")

    geneCount = okCounts["all"]
    print "total number of genes:", geneCount
    for scoreType, scoreCount in okCounts.iteritems():
        if scoreType=="all":
            continue
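        # one-sided sign test: probability of at least scoreCount correct orderings out of
        # geneCount genes if each score were right only half the time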
        pVal = binom.sf(scoreCount-1,geneCount,0.5)
        print "%s was correct %d times (p-Val %f)" % (scoreType, scoreCount, pVal)
Exemplo n.º 52
0
def upperBinom(k, n, p):
	"""
	Returns the p-value for the actual proportion being higher than p
	"""
	return binom.sf(k-1, n, p)
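# For example (illustrative only), upperBinom(60, 100, 0.5) returns P(X >= 60) for
# X ~ Binomial(100, 0.5), roughly 0.028.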
Exemplo n.º 53
0
#!/usr/bin/env python

#Copyright (c) Payton Ide 2014
#View LICENSE.txt for copyright information
"""
Calculates Probability of winning a game of badminton based on the probability of winning a point
Author: Payton Ide
See README.txt for information regarding scipy stats, the binomial module, and the uses thereof in this program
"""

from scipy.stats import binom

#assign values as described in the explanation and solution file
p = float(raw_input("Enter probability: "))
e = binom.sf(20, 40, p, loc=0)
i = binom.pmf(20, 40, p, loc=0)
t = 2*p*(1-p)
d = p**2

#calculate game win probability using the derived formula
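#   e: win without reaching 20-20 (at least 21 of 40 imagined rallies)
#   i: reach 20-20 exactly; d = p**2: take a deuce pair outright; t = 2*p*(1-p): split a pair and keep playing
#   after nine split pairs the score is 29-29 and a single rally (probability p, the final term) decides the game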
W = e + i*d + i*t*d + i*(t**2)*d + i*(t**3)*d + i*(t**4)*d + i*(t**5)*d + i*(t**6)*d + i*(t**7)*d + i*(t**8)*d + i*(t**9)*p

#calculate match win probability based on game win probability
V = W**2 + 2*(W**2)*(1-W)

#print user-inputted point win probability, followed by calculated game and match win probabilities
print "Probablity of winning a single point: ", p
print "Probablity of winning a game: ", W
print "Probability of winning a match: ", V

Exemplo n.º 54
0
    print 'Starting N =', N
    for p in p_range:
        num_Np_fails = 0
        num_Np_checks = 0
        for _ in xrange(repeats):
            if check(N, p):
                num_Np_fails += 1
            num_Np_checks += 1
        # work out what the failure probability is (approximately but not exactly 1-alpha
        # because it's a discrete distribution)
        low, high = binom.interval(alpha, N, p)
        if p==0:
            low = high = 0
        elif p==1:
            low = high = N
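        # exact two-tailed miss probability: the +/-0.1 offsets make the discrete cdf/sf
        # count strictly below `low` and strictly above `high`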
        q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
        low, high = binom.interval(alpha, num_Np_checks, q)
        if q==0:
            low = high = 0
        if num_Np_fails<low or num_Np_fails>high:
            print 'N=%d, p=%.3f failed %d of %d checks, outside range (%d, %d)' % (N, p, num_Np_fails,
                                                                                   num_Np_checks, low, high)
print
failrate = float(numfails)/numchecks
low, high = norm.interval(alpha, loc=mu, scale=sqrt(sigma2))
print '%d/%d=%.2f%% failed at %d%%' % (numfails, numchecks, numfails*100.0/numchecks, 100*alpha)
print 'Expected mean=%d, std dev=%d (mean fail rate=%.2f%%)' % (mu, sqrt(sigma2), 100*mu/numchecks)
if low<=numfails<=high:
    print 'Overall passed at %d%%: within range (%d, %d)' % (alpha*100, low, high)
else:
    print 'Overall failed at %d%%: outside range (%d, %d)' % (alpha*100, low, high)