def _jd_at_once(n1, k1, n2, k2, n3):
    """Return the joint pmf of two independent hypergeometric counts.

    The first count follows ``spst.hypergeom(n1, k1, n3)`` and the second
    ``spst.hypergeom(n2, k2, n3)``.  Results are memoised in the module-level
    ``joint_cache`` dict, keyed by the five arguments.

    Returns a tuple ``(jpmf, mrange1, mrange0)`` where

    * ``jpmf[i, j]`` is ``pmf1[i] * pmf2[j]``, set to 0 for every cell in
      which both ``mrange1`` and ``mrange0`` are 0;
    * ``mrange1[i, j]`` is ``min(i, j)``;
    * ``mrange0[i, j]`` is ``min(n3 - i, n3 - j)``.

    The arrays are stored in the cache and returned by reference, so callers
    must not modify them in place.
    """
    key = (n1, k1, n2, k2, n3)
    if key in joint_cache:
        return joint_cache[key]

    range1 = np.arange(0, min(n3 + 1, k1 + 1))
    range2 = np.arange(0, min(n3 + 1, k2 + 1))

    # With k == 0 the only possible outcome is 0, so use a point mass instead
    # of evaluating the hypergeometric pmf.
    if k1 == 0:
        pmf1 = np.zeros(len(range1))
        pmf1[0] = 1.0
    else:
        pmf1 = spst.hypergeom(n1, k1, n3).pmf(range1)
    if k2 == 0:
        pmf2 = np.zeros(len(range2))
        pmf2[0] = 1.0
    else:
        pmf2 = spst.hypergeom(n2, k2, n3).pmf(range2)

    jpmf = np.outer(pmf1, pmf2)
    mrange1 = np.minimum.outer(range1, range2)
    mrange0 = np.minimum.outer(n3 - range1, n3 - range2)

    # Cells where both minima are zero are given zero probability.
    no_ops = np.logical_and(mrange1 == 0, mrange0 == 0)
    jpmf[no_ops] = 0.0

    result = (jpmf, mrange1, mrange0)
    joint_cache[key] = result
    return result
def coco_stats():

        python -m mtgmonte.stats --exec-coco_stats --show

        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = coco_stats()
        >>> print(result)
        >>> ut.show_if_requested()
    # NOTE(review): the lines above read like the body of a docstring (command
    # line plus doctest example) whose triple-quote delimiters are not present
    # in this view -- confirm against the full file.
    #
    # Visible behaviour: for a 60-card deck and a 6-card sample, tabulate the
    # hypergeometric probability of at least k hits (k = 1, 2) for creature
    # counts 15..29.
    import plottool as pt
    from scipy.stats import hypergeom

    N = pop_size = 60  # cards in deck  # NOQA
    K = num_success = 21  # number of creatures in deck  # NOQA
    n = sample_size = 6  # cards seen by coco  # NOQA

    # prob of at least that many hits
    prb = hypergeom(N, K, n)

    k = number_of_success = 1  # number of hits you want  # NOQA

    # The next four expressions are evaluated and discarded; they only
    # illustrate how each probability is obtained from the frozen distribution.
    prb.pmf(k)  # P(X = k)
    prb.cdf(k)  # P(X <= k)

    1 - prb.cdf(k)  # P(X > k)

    (1 - prb.cdf(k)) + prb.pmf(k)  # P(X >= k)

    # P(X >= k) = P(X > k) + P(X = k); `prb` defaults to the 21-creature
    # distribution built above.
    def prob_ge(k, prb=prb):
        return (1 - prb.cdf(k)) + prb.pmf(k)  # P(X >= k)


    import numpy as np

    k = np.arange(1, 3)

    K_list = np.arange(15, 30)

    label_list = [str(K_) + " creatures in deck" for K_ in K_list]

    # One array of P(X >= k) values (over k) per creature count in K_list.
    ydata_list = [prob_ge(k, prb=hypergeom(N, K_, n)) for K_ in K_list]

    # NOTE(review): the keyword argument below appears to belong to a plotting
    # call whose opening line is not visible here; `pt` is imported but not
    # used anywhere in the visible code.
        title="probability of at least k hits with coco",
    def fitness_group(self, x, i, j, *args):
        """
        In a population of x i-strategists and (Z-x) j-strategists, where
        players interact in groups of ``self.N`` participants, return the
        difference between the average payoffs of strategies i and j.

        Parameters
        ----------
        x : int
            number of individuals adopting strategy i in the population
        i : int
            index of strategy i
        j : int
            index of strategy j
        args : List
            Other Parameters. This can be used to pass extra parameters to
            functions stored in the payoff matrix

        Returns
        -------
        float
            Returns the difference in fitness between strategy i and j
        """
        # k counts the *other* group members (0 .. N-1) that play strategy i.
        k_array = np.arange(0, self.N, dtype=np.int32)
        # Group composition seen by a focal i-player (x-1 other i-players in
        # the remaining Z-1 individuals) and by a focal j-player (x i-players).
        i_pmf = hypergeom(self.Z - 1, x - 1, self.N - 1).pmf(k_array)
        j_pmf = hypergeom(self.Z - 1, x, self.N - 1).pmf(k_array)

        payoff_i = self.payoffs[i, j]
        payoff_j = self.payoffs[j, i]

        fitness_i, fitness_j = 0, 0
        for k in k_array:
            # The payoff callables receive the count of players sharing the
            # focal player's strategy (including the focal player itself).
            fitness_i += payoff_i(k + 1, self.N, *args) * i_pmf[k]
            fitness_j += payoff_j(self.N - k, self.N, *args) * j_pmf[k]

        return fitness_i - fitness_j
def plot_hypergeom(M, N, n):
    # Draws the pmf of scipy's hypergeom(M=M, n=n, N=N) with matplotlib.
    # x1 spans 0..min(n, N); x2 spans 0..n.
    x1 = range(min(n, N) + 1)
    x2 = range(n + 1)
    plt.plot(x1, hypergeom(M=M, n=n, N=N).pmf(x1), alpha=0.6, color='gray')
    # NOTE(review): the two lines below are the tail of a second plotting call
    # whose opening line (taking x2 as its first argument, presumably) is not
    # visible in this view -- confirm against the full file.
    # NOTE(review): the label placeholders are named n, N, M but are filled
    # with (N, M, n) in that order -- verify the legend text is as intended.
             hypergeom(M=M, n=n, N=N).pmf(x2),
             label='$n={0},N={1},M={2}$'.format(N, M, n))
    def test_entropy(self):
        # Simple tests of entropy.
        hg = stats.hypergeom(4, 1, 1)
        h = hg.entropy()
        expected_p = np.array([0.75, 0.25])
        expected_h = -np.sum(xlogy(expected_p, expected_p))
        assert_allclose(h, expected_h)

        hg = stats.hypergeom(1, 1, 1)
        h = hg.entropy()
        assert_equal(h, 0.0)
def land_stats():

        python -m mtgmonte.stats --exec-land_stats --show

        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = land_stats()
        >>> print(result)
        >>> ut.show_if_requested()
    # NOTE(review): the lines above read like the body of a docstring whose
    # triple-quote delimiters are not present in this view -- confirm against
    # the full file.
    #
    # Visible behaviour: for land counts 24..27 in a 60-card deck, compute the
    # mean and standard deviation of the number of lands among x + 7 cards for
    # x = 0..14, plus two reference curves ("minimum ok" / "maximum ok").
    import plottool as pt
    from scipy.stats import hypergeom

    N = pop_size = 60  # cards in deck  # NOQA
    # K = num_success = 25  # lands in deck  # NOQA
    n = sample_size = 6  # cards seen by coco  # NOQA

    # prob of at least that many hits

    # Not called anywhere in the visible code.
    def prob_ge(k, prb):
        return (1 - prb.cdf(k)) + prb.pmf(k)  # P(X >= k)


    N = deck_size = 60  # NOQA
    land_range = (24, 27 + 1)

    # N = deck_size = 40  # NOQA
    # land_range = (15, 18 + 1)

    xdata = range(0, 15)  # turn
    # One curve per land count K; x + 7 is the number of cards seen
    # (presumably a 7-card opening hand plus one card per turn -- TODO confirm).
    ydata_list = [[hypergeom(N, K, x + 7).expect() for x in xdata] for K in range(*land_range)]
    spread_list = [[hypergeom(N, K, x + 7).std() for x in xdata] for K in range(*land_range)]
    # spread_list = None
    import numpy as np

    label_list = ["%d lands" % (K,) for K in range(*land_range)]
    # NOTE(review): the argument line below appears to belong to a plotting
    # call whose opening line is not visible here.
        xdata, ydata_list, spread_list=spread_list, label_list=label_list, num_xticks=15, num_yticks=13, fnum=1
    # Element-wise minimum of the turn index and the cap sequence
    # 1, 2, 3, 4, 5, 6, 6, 6, ...
    min_lands_acceptable = np.minimum(np.array(xdata), [1, 2, 3, 4, 5, 6] + [6] * (len(xdata) - 6))
    # NOTE(review): the two lines below look like arguments of a second
    # plotting call whose opening and closing lines are not visible here.
        [min_lands_acceptable, (np.array(xdata) ** 0.9) * 0.5 + 4],
        label_list=["minimum ok", "maximum ok"],
 def prob_nohave_card_always_mulled(copies=2, hand_size=3):
     """Return the probability of not holding the card after a full mulligan.

     ``deck_size`` is read from the enclosing scope.  ``copies`` is the number
     of copies of the card in the deck and ``hand_size`` the number of cards
     drawn, both initially and again after the mulligan.
     """
     # probability of NOT getting the card initially
     p_none_premul = hypergeom(deck_size, copies, hand_size).cdf(0)
     # probability of still not getting the card if everything is thrown away
     # (TODO: factor in the probability that you need to keep something)
     # for now its fine because if we keep shadowform the end calculation is fine
     p_nohave_postmul_given_nohave = hypergeom(deck_size - hand_size, copies, hand_size).cdf(0)
     # not necessary, but it shows the theory: in this "always mulligan" model
     # a hand that did contain the card is thrown away and counted as a miss.
     p_nohave_postmul_given_had = 1
     p_nohave_turn0 = (
         p_nohave_postmul_given_nohave * p_none_premul
         + (1 - p_none_premul) * p_nohave_postmul_given_had
     )
     return p_nohave_turn0
def fisher_exact(table, side="two.sided", zero_correction=True):
    """Computes fisher exact odds ratio.

    Output is almost exactly the same as scipy.stats.fisher_exact but here allows for
    using Haldane–Anscombe correction (substitutes 0.5 for 0 values in the table, whereas
    the scipy.stats version and R version fisher.test use integers only).

    The interval returned here is built from the standard error supplied by
    ``_odds_ratio`` as ``exp(log(odds_ratio) -/+ z * se)``, with z = 1.96 for
    the two-sided case and z = 1.645 for the one-sided cases.

    :param table: 2x2 numpy array of counts ``[[a, b], [c, d]]``
    :param side: one of ``"greater"``, ``"less"`` or ``"two.sided"``
    :param zero_correction: forwarded to ``_odds_ratio`` for the observed table
    :return: ``(odds_ratio, interval95, p_value)`` with ``interval95`` a
        two-element numpy array
    :raises ValueError: if ``side`` is not one of the accepted values
    """
    if side not in ("greater", "less", "two.sided"):
        raise ValueError(
            "side parameter must be one of 'greater', 'less', or 'two.sided'")

    # Compute the p value
    # For all possible contingency tables with the observed marginals, compute the hypergeom
    # pmf of that table. Sum the p of all tables with p less than or equal to the hypergeom
    # probability of the observed table.
    N = np.sum(table)
    K = np.sum(table[:, 0])
    n = np.sum(table[0])

    odds_ratio, se = _odds_ratio(table, zero_correction=zero_correction)

    # Range of the top-left cell compatible with the fixed marginals.
    a_min = np.max([0, table[0][0] - table[1][1]])
    a_max = np.min([K, n])

    # The distribution depends only on the marginals, so freeze it once.
    dist = hypergeom(N, K, n)
    p_observed = dist.pmf(table[0][0])
    p_value = 0.0
    for a in np.arange(a_min, a_max + 1):
        possible_table = np.array([[a, n - a], [K - a, N - n - K + a]])
        p = dist.pmf(a)

        if side == "greater":
            if _odds_ratio(possible_table)[0] >= odds_ratio:
                p_value += p
        elif side == "less":
            if _odds_ratio(possible_table)[0] <= odds_ratio:
                p_value += p
        elif side == "two.sided":
            if p <= p_observed:
                p_value += p

    if side == "greater":
        interval95 = [np.exp(np.log(odds_ratio) - (1.645 * se)), np.inf]
    elif side == "less":
        interval95 = [0, np.exp(np.log(odds_ratio) + (1.645 * se))]
    elif side == "two.sided":
        interval95 = [
            np.exp(np.log(odds_ratio) - (1.96 * se)),
            np.exp(np.log(odds_ratio) + (1.96 * se))
        ]

    return odds_ratio, np.array(interval95), p_value
        def prob_nohave_card_never_mulled(copies=2, hand_size=3):
            """Return P(no copy in the first draw) * P(no copy in the redraw).

            Uses a fixed 30-card deck; the redraw is taken from the
            ``30 - hand_size`` cards that remain after the first draw.
            """
            deck_size = 30
            remaining = deck_size - hand_size

            # P(initial_miss)
            miss_first_draw = hypergeom(deck_size, copies, hand_size).cdf(0)
            # P(miss_turn0 | initial_miss): the redraw also misses
            miss_redraw = hypergeom(remaining, copies, hand_size).cdf(0)

            # TODO: add constraints about 2 drops
            # P(miss_turn0) = P(miss_turn0 | initial_miss) * P(initial_miss)
            return miss_redraw * miss_first_draw
def Exp1(A, m, r, k):
    """
    Compute the expected number of buckets that have collision less or equal
    to k by applying approximation 1.
    (Usually apply this approximation when A > 500)

    :param A: number of all patients
    :param m: number of buckets
    :param r: ratio of number of patients satisfying certain criteria to
        number of all patients
    :param k: K in K-anonymity
    :return: Expectation by applying approximation 1, rounded to 5 decimals
    """

    B = int(A * r)  # number of patients satisfying certain criteria
    expectation = Decimal(0)
    alpha = 1 - 1 / (2 * m)
    # Restrict the single bucket size (|A1| in formula) to the interval that
    # the binomial distribution reports for confidence level alpha
    rv_a = binom(A, 1 / m)
    (lb_a, ub_a) = rv_a.interval(alpha)
    rv_b = hypergeom(A, int(lb_a), B)
    (lb_b, ub_b) = rv_b.interval(alpha)
    bucket_sizes = range(int(lb_a), int(ub_a) + 1)

    if lb_b == 0 or lb_a == 0:
        # Zero lies inside one of the intervals: rule out the case that there
        # is no collision.
        for a in bucket_sizes:
            rv_b = hypergeom(A, a, B)
            if a > k:
                # Find lowerbound and upperbound for B1
                (lb_b, ub_b) = rv_b.interval(alpha)
                # Rule out the case that there is no collision
                lb_b = max(1, lb_b)
                # Compute P(|e| < k | |A1|)
                p = P(lb_b, ub_b, k, rv_b, a)
                # Compute Expectation
                expectation = expectation + p * Decimal(rv_a.pmf(a))
            else:
                # Bucket no larger than k: weight by the probability that
                # at least one matching patient falls into it.
                expectation = expectation + Decimal(
                    rv_a.pmf(a)) * (1 - Decimal(rv_b.pmf(0)))
    else:
        for a in bucket_sizes:
            if a > k:
                # Restrict B1 to the interval reported for level 0.99995
                rv_b = hypergeom(A, a, B)
                (lb_b, ub_b) = rv_b.interval(0.99995)
                # Compute P(|e|<=k | |A1|)
                p = P(lb_b, ub_b, k, rv_b, a)
                # Compute Expectation
                expectation = expectation + p * Decimal(rv_a.pmf(a))
            else:
                # Bucket no larger than k: contributes its full probability.
                expectation = expectation + Decimal(rv_a.pmf(a))
    return round(expectation * m, 5)
    def add_counting_bound_constraints_1(self):
        """Adds counting bound, for a given number of cliques zonked.
        """

        # FIXME this is half-baked
        # the probability of some number of cliques being zonked

        # NOTE(review): the loop variable below is `j`, but the body only
        # refers to `num_cliques`, which is not defined in the visible code --
        # confirm against the full file.
        # loop through the number of cliques left over
        for j in range(self.max_cliques_remaining + 1):

            # for i in range(self.max_cliques_zeroed+1):

            # bounds on number of cliques containing edge e
            # (these won't actually be zeroed)
            min_cliques_zeroed = max(0,
                                     num_cliques - self.max_cliques_remaining)
            max_cliques_zeroed = min(num_cliques, self.max_cliques_zeroed)
            # the probability of some number of cliques containing edge e
            # NOTE(review): the three positional arguments and the closing
            # parenthesis of this hypergeom(...) call are not visible in this
            # view; only their describing comments remain.
            h = hypergeom(
                # number of possible cliques
                # number of those present
                # number of cliques which could intersect edge e
            # here, z is the number of cliques which _do_ intersect edge e
            # A pairs each variable key (z, num_cliques - z) with its
            # hypergeometric probability h.pmf(z).
            A = [((z, num_cliques - z), h.pmf(z))
                 for z in range(min_cliques_zeroed, max_cliques_zeroed + 1)]
            # the bound is half the number of functions
            b = (comb(self.max_cliques, num_cliques, exact=True) - 1) / 2
            self.add_constraint(A, b)
    def add_total_cliques_counting_bound_constraints(self):
        """Adds counting bound, based on total number of cliques.

        For each "level" of "total number of cliques found", this
        adds a bound, based on the counting bound.
        """
        # loop through the number of cliques
        for num_cliques in range(self.max_cliques + 1):
            # bounds on number of cliques containing edge e
            # (these won't actually be zeroed)
            min_cliques_zeroed = max(0,
                                     num_cliques - self.max_cliques_remaining)
            max_cliques_zeroed = min(num_cliques, self.max_cliques_zeroed)
            # the probability of some number of cliques containing edge e
            # NOTE(review): the three positional arguments and the closing
            # parenthesis of this hypergeom(...) call are not visible in this
            # view; only their describing comments remain.
            h = hypergeom(
                # number of possible cliques
                # number of those present
                # number of cliques which could intersect edge e
            # here, z is the number of cliques which _do_ intersect edge e
            # A pairs each variable key (z, num_cliques - z) with its
            # hypergeometric probability h.pmf(z).
            A = [((z, num_cliques - z), h.pmf(z))
                 for z in range(min_cliques_zeroed, max_cliques_zeroed + 1)]
            # the bound is half the number of functions
            b = (comb(self.max_cliques, num_cliques, exact=True) - 1) / 2
            self.add_constraint(A, b)
    def test_discrete_induced_sampling(self):
        """Run help_discrete_induced_sampling on four pairs of discrete variables."""
        # Pair 1: uniform masses on geometrically spaced locations against
        # geometrically growing masses on integer locations.
        n_first = 10
        first_locations = np.geomspace(1.0, 512.0, num=n_first)
        first_masses = np.ones(n_first, dtype=float) / n_first
        first_var = float_rv_discrete(
            name='float_rv_discrete',
            values=(first_locations, first_masses))()

        n_second = 10
        second_locations = np.arange(0, n_second)
        # if increase from 16 unmodififed becomes ill conditioned
        second_masses = np.geomspace(1.0, 16.0, num=n_second)
        second_masses /= second_masses.sum()
        second_var = float_rv_discrete(
            name='float_rv_discrete',
            values=(second_locations, second_masses))()
        self.help_discrete_induced_sampling(first_var, second_var, 30)

        # Pair 2: a hypergeometric variable paired with itself.
        n_type1, n_type2, n_draws = 10, 10, 9
        hyper_var = stats.hypergeom(n_type1 + n_type2, n_type1, n_draws)
        self.help_discrete_induced_sampling(hyper_var, hyper_var, 300)

        # Pair 3: a binomial variable paired with itself.
        binom_var = stats.binom(10, 0.5)
        self.help_discrete_induced_sampling(binom_var, binom_var, 300)

        # Pair 4: uniform masses on 0..9 paired with itself.
        n_points = 10
        support = np.arange(n_points)
        weights = np.ones(n_points) / n_points
        cheb_var = float_rv_discrete(
            name='discrete_chebyshev', values=(support, weights))()
        self.help_discrete_induced_sampling(cheb_var, cheb_var, 30)
def _calc_score(fore_hit_size, fore_size, back_hit_size, back_size,
                prob_fn=None):
    """Return a signed log10 enrichment score of foreground vs background.

    NOTE(review): the parameter list is not visible in this view; it is
    reconstructed from the names the body uses -- confirm against callers.

    With ``k = fore_hit_size``, ``n = fore_size``, ``K = back_hit_size`` and
    ``N = back_size`` the score is ``-log10(P(X >= k) / P(X <= k))`` where X
    follows ``hypergeom(N, K, n)`` (default) or ``binom(n, K / N)``.

    Returns 0 when ``back_hit_size <= 0``, -200 when ``P(X <= k)`` is not
    positive and 200 when ``P(X >= k)`` is not positive.
    """
    if prob_fn is None:
        prob_fn = 'hypergeom'

    assert prob_fn in ['hypergeom', 'binom']

    if back_hit_size <= 0:
        return 0

    k = fore_hit_size
    n = fore_size
    K = back_hit_size
    N = back_size
    p = K / N

    if prob_fn == 'hypergeom':
        dist = stats.hypergeom(N, K, n)
    else:
        dist = stats.binom(n, p)

    pr_gt_k = dist.sf(k - 1)  # P(X >= k)
    pr_lt_k = dist.cdf(k)  # P(X <= k)

    if pr_lt_k <= 0:
        return -200
    elif pr_gt_k <= 0:
        return 200
    else:
        return -np.log10(pr_gt_k / pr_lt_k)
    def get_enrichment_score(self, query_id_set_n, M, overlap_n):
        """Return the hypergeometric tail probability for an observed overlap.

        Evaluates ``hypergeom(M, self.n, query_id_set_n).sf(overlap_n)`` and
        prints the inputs together with the result.

        NOTE(review): ``sf(overlap_n)`` is P(X > overlap_n), so the observed
        overlap itself is excluded from the tail -- confirm this is intended.
        """
        pv = hypergeom(M, self.n, query_id_set_n).sf(overlap_n)
        print("m=" + str(M) + " n=" + str(self.n) + " q=" + str(query_id_set_n) + " k=" + str(overlap_n) + " pv=" + str(pv))

        return pv
def run(domain_name='X', projection_name='Y8'):
    # Loads 'prob2.mat', scatter-plots the samples coloured by cluster in two
    # projections, then compares cluster co-membership pairs against tissue
    # co-membership pairs and returns a frozen hypergeom built from the counts.
    prob2 = sio.loadmat('prob2.mat')

    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']

    domains = [prob2.get(d) for d in domain_names]
    #domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')

    # NOTE(review): `domain_clusters` is only assigned in the commented-out
    # line above, so this lookup fails unless it is defined elsewhere.
    clusters = domain_clusters[domain_names.index(domain_name)]
    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[domain_names.index(domain_name)]

    f = plt.figure(1)
    ct = array(mc.getct(218))

    #px, py = 2, 2
    sstrings = ['21{0:d}'.format(i + 1) for i in range(4)]

    # NOTE(review): `dom` is not defined in the visible code (perhaps `cdom`
    # or `pdom` was meant) -- confirm.
    inds = arange(shape(dom)[1])

    # Shift the cluster ids down by one to use them as 0-based indices.
    c_inds = array(clusters).flatten() - 1
    tc_inds = tissue_clusters.flatten() - 1

    colors = ct[c_inds, :]

    ax = f.add_subplot(sstrings[0], title = \
                         'Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=colors)
    ax = f.add_subplot(sstrings[1], title = \
                       'Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=colors)

    # Each pair is encoded as the string "<ix>x<iy>" for ix < iy sharing a
    # cluster id.
    # NOTE(review): the closing brackets of both set([...]) expressions are
    # not visible in this view.
    cpairs = set([
        '{0:d}x{1:d}'.format(ix, iy) for ix, x in enumerate(c_inds)
        for iy, y in enumerate(c_inds) if ix < iy and x == y
    tcpairs = set([
        '{0:d}x{1:d}'.format(ix, iy) for ix, x in enumerate(tc_inds)
        for iy, y in enumerate(tc_inds) if ix < iy and x == y
    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    # Number of unordered index pairs: n * (n - 1) / 2.
    max_pairs = (len(tc_inds) * len(tc_inds) - len(tc_inds)) / 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    # NOTE(review): these are Python 2 print statements, and the argument of
    # the first .format( call is not visible in this view.
    print 'using affinity propagation with affinites over domain {0}'.format(
    print 'found'
    print ' max pairs: {0}'.format(max_pairs)
    print ' total pairs: {0}'.format(total_pairs)
    print ' tissue pairs: {0}'.format(len(tcpairs))
    print ' cluster pairs: {0}'.format(len(cpairs))
    print ' shared pairs: {0}'.format(shared_pairs)

    # NOTE(review): scipy's hypergeom takes (population size, successes in
    # population, draws); here max_pairs is passed last -- verify the
    # argument order is intended.
    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
    def test_get_univariate_leja_rule_bounded_discrete(self):
        # For each discrete variable, build a Leja quadrature rule and check
        # that it reproduces the moment of degree (number of points - 1)
        # computed directly from the variable's probability masses.
        growth_rule = partial(constant_increment_growth_rule, 2)
        level = 3

        nmasses = 20
        xk = np.array(range(0, nmasses), dtype='float')
        pk = np.ones(nmasses) / nmasses
        var_cheb = float_rv_discrete(name='discrete_chebyshev',
                                     values=(xk, pk))()

        # NOTE(review): the list literal below is never closed and the `for`
        # header has no terminating colon in this view; `var_cheb` is built
        # above but does not appear among the visible list entries -- confirm
        # against the full file.
        for variable in [
                stats.binom(17, 0.5),
                stats.hypergeom(10 + 10, 10, 9)
            quad_rule = get_univariate_leja_quadrature_rule(
                variable, growth_rule)

            x, w = quad_rule(level)
            # Map the quadrature points with the variable's loc/scale.
            loc, scale = transform_scale_parameters(variable)
            x = x * scale + loc

            xk, pk = get_probability_masses(variable)
            print(x, xk, loc, scale)

            degree = (x.shape[0] - 1)
            true_moment = (xk**degree).dot(pk)
            # w[-1] is the weight vector used for the comparison.
            moment = (x**degree).dot(w[-1])

            print(moment, true_moment, variable.dist.name)
            assert np.allclose(moment, true_moment)
def hypergeom_p_values(data, selected, callback=None):
    """
    Calculates p_values using Hypergeometric distribution for two numpy arrays.
    Works on matrices containing zeros and ones. All other values are
    clipped to booleans with ``> 0``.

    :param data: all examples in rows, their features in columns
    :type data: numpy.array
    :param selected: selected examples in rows, their features in columns
    :type selected: numpy.array
    :param callback: optional progress callable, invoked every 250 features
    :return: p-values for features
    :raises ValueError: if the two arrays have a different number of columns
    """
    if data.shape[1] != selected.shape[1]:
        raise ValueError("Number of columns does not match.")

    # clip values to a binary variables
    data = data > 0
    selected = selected > 0

    num_features = selected.shape[1]
    pop_size = data.shape[0]                # population size = number of all data examples
    sam_size = selected.shape[0]            # sample size = number of selected examples
    pop_counts = np.sum(data, axis=0)       # number of observations in population = occurrences of words all data
    sam_counts = np.sum(selected, axis=0)   # number of observations in sample = occurrences of words in selected data

    step = 250
    p_vals = []

    for i, (pc, sc) in enumerate(zip(pop_counts, sam_counts)):
        hyper = stats.hypergeom(pop_size, pc, sam_size)
        # since p-value is probability of equal to or "more extreme" than what was actually observed
        # we calculate it as 1 - cdf(sc-1). sf is survival function defined as 1-cdf.
        p_vals.append(hyper.sf(sc - 1))
        if callback and i % step == 0:
            # NOTE(review): the original callback argument is not visible in
            # this view; a percentage of processed features is assumed --
            # confirm against the callers.
            callback(100 * i / num_features)
    return p_vals
def hyper(N, M, n, m):
    '''Return the hypergeometric upper-tail p-value P(X >= m).

    X follows ``scipy.stats.hypergeom(N, M, n)``; the probability mass is
    summed for every value from ``m`` up to ``min(n, M)`` inclusive.  When
    that range is empty the result is 0.
    '''
    # Imported locally so the block does not depend on a particular
    # module-level alias for scipy.
    from scipy.stats import hypergeom

    frozendist = hypergeom(N, M, n)
    ms = np.arange(m, min(n + 1, M + 1))
    rv = 0
    for single_m in ms:
        rv = rv + frozendist.pmf(single_m)
    return rv
def pvalue(N, M, n, m):
    """Return the hypergeometric upper-tail probability P(X >= m).

    X follows ``sps.hypergeom(N, M, n)``.  When any argument has a length
    greater than one (as reported by the ``length`` helper), arguments of
    length one are repeated to the common length and a list with one p-value
    per position is returned.

    :raises ValueError: if the arguments have different lengths and the
        shorter ones do not have length one
    """
    args = [deepcopy(N), deepcopy(M), deepcopy(n), deepcopy(m)]
    lengths = [length(arg) for arg in args]
    maxlen = max(lengths)

    if maxlen > 1:
        broadcast = []
        for arg, arg_len in zip(args, lengths):
            if arg_len == 1:
                arg = [arg for _ in range(maxlen)]
            elif arg_len != maxlen:
                raise ValueError('Inequally long vectors have been provided to this function')
            broadcast.append(arg)
        return [pvalue(*row) for row in zip(*broadcast)]

    N, M, n, m = args
    hg = sps.hypergeom(N, M, n)
    # An observed count beyond the support is clamped to the largest
    # attainable value.
    if m > M or m > n:
        m = min(M, n)
    return sum(hg.pmf(np.arange(m, min(M + 1, n + 1))))
def compute_clusters_ps(predicted_clusters, goa_clusters):
    # For every predicted cluster with at least 3 members, find the GOA
    # cluster (also with at least 3 members) that gives the smallest
    # hypergeometric upper-tail p-value, and return a dict mapping
    # predicted-cluster key -> (best GOA cluster key, p-value).
    # NOTE(review): the closing brace of this dict comprehension is not
    # visible in this view.
    predicted_clusters = {
        a: p
        for a, p in predicted_clusters.items() if len(p) >= 3
    goa_clusters = {a: p for a, p in goa_clusters.items() if len(p) >= 3}
    # Population size: sum of the member counts of the retained GOA clusters.
    n_total_proteins = sum(len(p) for p in goa_clusters.values())

    top_p_values = {}

    for predict_cluster, predict_proteins in tqdm(predicted_clusters.items()):
        # Start at infinity so that any computed p-value replaces it.
        p_value = float('inf')
        top_goa_cluster = None
        for goa_cluster, goa_proteins in goa_clusters.items():
            n_goa_proteins = len(goa_proteins)
            n_predicted_proteins = len(predict_proteins)
            # NOTE(review): the argument of this len( call is not visible in
            # this view; presumably it counts predicted proteins that also
            # belong to the GOA cluster -- confirm against the full file.
            n_proteins_from_goa = len(
            # sf(x - 1) is P(X >= x): at least the observed count.
            goa_c_p_value = ss.hypergeom(
                n_total_proteins, n_goa_proteins,
                n_predicted_proteins).sf(n_proteins_from_goa - 1)
            if goa_c_p_value < p_value:
                p_value = goa_c_p_value
                top_goa_cluster = goa_cluster
        top_p_values[predict_cluster] = (top_goa_cluster, p_value)

    return top_p_values
    def add_total_cliques_equality_constraints(self):
        """Adds constraints for a given total number of cliques.

        For 0 <= m <= N, these define a variable '(total_cliques, m)',
        which is E[ number of gates need to find m cliques ],
        or "the expected number of gates needed at 'level m'".
        It's constrained to equal the weighted average of FIXME describe this.
        """
        # loop through the number of cliques
        for num_cliques in range(self.max_cliques + 1):
            # bounds on number of cliques containing edge e
            # (these won't actually be zeroed)
            min_cliques_zeroed = max(0,
                                     num_cliques - self.max_cliques_remaining)
            max_cliques_zeroed = min(num_cliques, self.max_cliques_zeroed)
            # the probability of some number of cliques containing edge e
            # NOTE(review): the three positional arguments and the closing
            # parenthesis of this hypergeom(...) call are not visible in this
            # view; only their describing comments remain.
            h = hypergeom(
                # number of possible cliques
                # number of those present
                # number of cliques which could intersect edge e
            # here, z is the number of cliques which _do_ intersect edge e
            # A pairs each variable key (z, num_cliques - z) with its
            # hypergeometric probability h.pmf(z).
            A = [((z, num_cliques - z), h.pmf(z))
                 for z in range(min_cliques_zeroed, max_cliques_zeroed + 1)]
            # this is constraining the total number of gates at this "level"
            # to equal the average, weighted by the probability of some
            # number of cliques being zeroed out
            self.add_constraint(A + [(('total_cliques', num_cliques), -1.0)],
                                0, True)
def get_enriched(all_genes, selection, name, method, cutoff, print_all):
    """Get enrichment for pfam domains.

    Counts how often each item occurs in ``all_genes`` and in ``selection``,
    computes a hypergeometric upper-tail p-value per item, corrects the
    p-values with ``multipletests`` and returns the result as a DataFrame
    sorted by the corrected p-value.

    :param all_genes: iterable of pfam domain ids for the background
    :param selection: iterable of pfam domain ids for the selection
    :param name: column name used for the selection counts
    :param method: correction method; ``"bh"`` is mapped to ``"fdr_bh"``
    :param cutoff: threshold on the corrected p-value for ``significant``
    :param print_all: when false, only significant rows are returned
    """
    all_counts = Counter(all_genes)
    sel_counts = Counter(selection)
    # A domain missing from one of the two counters has a count of zero.
    # Leaving it as NaN would yield NaN p-values, which then corrupt the
    # multiple-test correction for the other rows.
    df = pd.DataFrame({
        "all": pd.Series(all_counts),
        name: pd.Series(sel_counts)
    }).fillna(0)

    # Hypergeometric test
    M = df["all"].sum()
    N = df[name].sum()
    # sf(x - 1) is P(X >= x): at least the observed count.
    df["p_value"] = df.apply(
        lambda x: hypergeom(M, x["all"], N).sf(x[name] - 1), axis=1)

    # Multiple test correction
    corr = "fdr_bh" if method == "bh" else method
    df[corr] = multipletests(df["p_value"], method=corr)[1]
    df = df.sort_values(corr)
    df["significant"] = df[corr] <= cutoff

    # Add pfam domain and description columns
    df = df.reset_index().rename(columns={"index": "pfam_domain"})

    if not print_all:
        df = df.loc[df["significant"]]

    df.insert(1, "description", df["pfam_domain"].map(get_pfam_desc))
    return df
def find_hypergeometric(genes, pred_no_training, population_size=10683):
    """Print and return the fold enrichment of the overlap of two gene lists.

    The overlap size ``x`` is compared with the mode of
    ``hypergeom(population_size, len(pred_no_training), len(genes))``.  The
    upper-tail p-value ``P(X >= x)`` is printed but not returned.

    :param genes: iterable of gene identifiers
    :param pred_no_training: iterable of predicted gene identifiers
    :param population_size: size of the background population; defaults to
        the value that was previously hard-coded
    :return: list with ``x / mode`` for every mode of the distribution (more
        than one entry only when several values tie for the maximum pmf).
        A mode of 0 yields ``inf`` (or ``nan`` when ``x`` is also 0).
    """
    overlap = list(set(genes) & set(pred_no_training))
    M = population_size
    N = len(genes)
    n = len(pred_no_training)
    x = len(overlap)
    pval = hypergeom.sf(x - 1, M, n, N)

    # Locate the most probable overlap size(s) under the null distribution.
    prob = hypergeom(M, n, N).pmf(np.arange(0, n + 1))
    modes = np.where(prob == np.max(prob))[0]

    fold = (x / modes).tolist()
    print('Fold Enrichment', fold)
    print('hypergeometric p-value', pval)
    return fold
def hypergeometric_test(X, cluster, treshold):
    # type: (np.ndarray, np.ndarray, float) -> np.ndarray
    """Score every column of X for over/under expression inside `cluster`.

    X is binarised with ``X >= treshold``.  For each column the
    hypergeometric probabilities of seeing at least, and at most, the
    observed number of expressing rows among the rows indexed by `cluster`
    are computed.  The score is ``-log`` of the smaller of the two, negated
    when the "at most" probability is strictly the smaller one.
    """
    n_cells = X.shape[0]  # Number of cells
    n_in_cluster = len(cluster)  # Number of cells belonging to cluster(s)
    expressed = (X >= treshold).astype(int)  # Binary expression matrix

    scores = np.zeros((X.shape[1],))
    for gene_idx, gene_col in enumerate(expressed.T):
        n_expressing = gene_col.sum()  # Number of cells expressing the gene
        null_dist = hypergeom(n_cells, n_expressing, n_in_cluster)
        observed = gene_col[cluster].sum()

        # observed or more
        p_over = null_dist.pmf(np.arange(observed, n_expressing + 1)).sum()
        # observed or less
        p_under = null_dist.pmf(np.arange(0, observed + 1)).sum()

        sign = -1 if p_under < p_over else 1
        scores[gene_idx] = -np.log(min(p_under, p_over)) * sign

    return scores
    def test_get_univariate_leja_rule_bounded_discrete(self):
        # For each discrete variable, build a Leja quadrature rule and check
        # that it reproduces the moment of degree (number of points - 1)
        # computed directly from the variable's probability masses.
        from scipy import stats
        growth_rule = partial(constant_increment_growth_rule, 2)
        level = 3

        nmasses = 20
        xk = np.array(range(0, nmasses), dtype='float')
        pk = np.ones(nmasses) / nmasses
        var_cheb = float_rv_discrete(name='discrete_chebyshev',
                                     values=(xk, pk))()

        # NOTE(review): the list literal below is never closed and the `for`
        # header has no terminating colon in this view; `var_cheb` is built
        # above but does not appear among the visible list entries -- confirm
        # against the full file.
        for variable in [
                stats.binom(20, 0.5),
                stats.hypergeom(10 + 10, 10, 9)
            quad_rule = get_univariate_leja_quadrature_rule(
                variable, growth_rule)

            # polys of binom, hypergeometric have no canonical domain [-1,1]
            x, w = quad_rule(level)

            from pyapprox.variables import get_probability_masses
            xk, pk = get_probability_masses(variable)
            # The compared moment has degree x.shape[0] - 1; w[-1] is the
            # weight vector used for the comparison.
            true_moment = (xk**(x.shape[0] - 1)).dot(pk)
            moment = (x**(x.shape[0] - 1)).dot(w[-1])

            assert np.allclose(moment, true_moment)
def family_hg(cluster_p_val_dict, mol_families, p_thresh=0.01):
    """Hypergeometric test for significant clusters within molecular families.

    ``cluster_p_val_dict`` maps a cluster id to a dictionary that has a
    ``'pval'`` field.  Each entry of ``mol_families`` must expose
    ``.clusters``, an iterable of objects with a ``.cluster_id`` attribute.

    For every family, the clusters present in ``cluster_p_val_dict`` are
    counted (``n_clu``) together with how many of them have
    ``pval <= p_thresh`` (``local_n_sig``).  The family p-value is the
    probability of drawing at least ``local_n_sig`` significant clusters in
    ``n_clu`` draws from all clusters in the dictionary.

    Returns a list of ``(family, p_value, n_clu, local_n_sig)`` tuples sorted
    by ascending p-value.
    """
    from scipy.stats import hypergeom
    import numpy as np

    # Per family: number of known clusters and number of significant ones.
    fam_clust_sig = []
    for mf in mol_families:
        local_n_sig = 0
        n_clu = 0
        for c in mf.clusters:
            if c.cluster_id in cluster_p_val_dict:
                n_clu += 1
                if cluster_p_val_dict[c.cluster_id]['pval'] <= p_thresh:
                    local_n_sig += 1
        fam_clust_sig.append((mf, n_clu, local_n_sig))

    # Population: all clusters with a p-value, and how many are significant.
    N = len(cluster_p_val_dict)
    pvallist = [cluster_p_val_dict[c]['pval'] for c in cluster_p_val_dict]
    n_sig = sum(1 for pval in pvallist if pval <= p_thresh)

    fam_clust_sig_hyp = []
    for fam, n_clu, local_n_sig in fam_clust_sig:
        rv = hypergeom(N, n_sig, n_clu)
        # Upper tail: local_n_sig or more significant clusters.
        poss = np.arange(local_n_sig, n_clu + 1)
        hypp = rv.pmf(poss).sum()
        fam_clust_sig_hyp.append((fam, hypp, n_clu, local_n_sig))
    fam_clust_sig_hyp.sort(key=lambda x: x[1])
    return fam_clust_sig_hyp
def generate_scores(_ids, _scores, _spectra, _kernel, _params):
    """Return a hypergeometric score for every (spectrum, kernel) pair.

    For each spectrum index ``j`` in ``_ids`` and each kernel index ``i`` in
    ``_ids[j]``, the score is ``-100 * log10(p)`` where ``p`` is the
    hypergeometric probability of exactly ``_scores[j]`` matches, with

    * population ``cells``: ``int(pm / 1000) - 200``, capped at 1500;
    * successes ``total_ions``: ``2 * (len(seq) - 1)``, capped at 20 (40 when
      the fragment mass tolerance exceeds 100) and raised to
      ``_scores[j] + 1`` when it is below the match count;
    * draws ``sc``: ``len(_spectra[j]['sms']) / 3``, raised to
      ``_scores[j] + 2`` when the match count reaches it.

    :return: dict mapping ``(j, i)`` to the score
    """
    res = _params['fragment mass tolerance']
    sfactor = 20
    sadjust = 1
    if res > 100:
        sfactor = 40
    sd = {}
    for j in _ids:
        # Spectra without candidate kernels contribute nothing.
        if not _ids[j]:
            continue
        for i in _ids[j]:
            kern = _kernel[i]
            lseq = list(kern['seq'])
            pmass = int(kern['pm'] / 1000)
            cells = int(pmass - 200)
            if cells > 1500:
                cells = 1500
            total_ions = 2 * (len(lseq) - 1)
            if total_ions > sfactor:
                total_ions = sfactor
            if total_ions < _scores[j]:
                total_ions = _scores[j] + 1
            sc = len(_spectra[j]['sms']) / 3
            if _scores[j] >= sc:
                sc = _scores[j] + 2
            rv = hypergeom(cells, total_ions, sc)
            p = rv.pmf(_scores[j])
            pscore = -100.0 * math.log10(p) * sadjust
            sd[(j, i)] = pscore
    return sd
def _calc_score(
    fore_hit_size, fore_size, back_hit_size, back_size,
    prob_fn=None
):
    """Return a signed log10 enrichment score of foreground vs background.

    NOTE(review): the end of the parameter list is not visible in this view;
    ``prob_fn=None`` is reconstructed from the body -- confirm against callers.

    With ``k = fore_hit_size``, ``n = fore_size``, ``K = back_hit_size`` and
    ``N = back_size`` the score is ``-log10(P(X >= k) / P(X <= k))`` where X
    follows ``hypergeom(N, K, n)`` (default) or ``binom(n, K / N)``.

    Returns 0 when ``back_hit_size <= 0``, -200 when ``P(X <= k)`` is not
    positive and 200 when ``P(X >= k)`` is not positive.
    """
    if prob_fn is None:
        prob_fn = "hypergeom"

    assert prob_fn in ["hypergeom", "binom"]

    if back_hit_size <= 0:
        return 0

    k = fore_hit_size
    n = fore_size
    K = back_hit_size
    N = back_size
    p = K / N

    if prob_fn == "hypergeom":
        dist = stats.hypergeom(N, K, n)
    else:
        dist = stats.binom(n, p)

    pr_gt_k = dist.sf(k - 1)  # P(X >= k)
    pr_lt_k = dist.cdf(k)  # P(X <= k)

    if pr_lt_k <= 0:
        return -200
    elif pr_gt_k <= 0:
        return 200
    else:
        return -np.log10(pr_gt_k / pr_lt_k)
 def chug_count_distribution(cls, player_count):
     """Return ``(pmf, label)`` for the exact hypergeometric distribution.

     The population has ``13 * player_count`` items of which ``player_count``
     are successes, and 13 items are drawn.  ``pmf`` is the bound ``pmf``
     method of the frozen scipy distribution and ``label`` describes it.
     """
     # Exact probability
     N, K, n = 13 * player_count, player_count, 13
     frozen = hypergeom(N, K, n)
     label = f"HyperGeometric({N}, {K}, {n})"
     return frozen.pmf, label
    def __init__(self,
        # NOTE(review): the rest of the parameter list is not visible in this
        # view.  The body reads M, n, N, interval_shift, order, type, name
        # and number -- confirm their order and defaults in the full file.

        if M < 0:
            raise VariableInputError(
                'HypergeometricVariable M must be greater or equal to 0.')

        if n < 0:
            raise VariableInputError(
                'HypergeometricVariable n must be greater or equal to 0.')

        # NOTE(review): this check is on N, but the message below names M;
        # the closing parenthesis of the raise is also not visible here.
        if (N < 1) or (N > (M + n)):
            raise VariableInputError(
                'HypergeometricVariable M must be greater than 1 and less than M+n.'

        self.interval_shift = interval_shift
        self.order = order
        self.M = M
        self.n = n
        self.N = N
        self.type = UncertaintyType.from_name(type)
        # Fall back to the generated symbol name when no name is supplied.
        self.name = f'x{number}' if name == '' else name
        self.var_str = f'x{number}'
        self.x = symbols(self.var_str)

        self.distribution = Distribution.HYPERGEOMETRIC

        # scipy's population-size argument is passed as self.M + self.n.
        # NOTE(review): the remaining arguments of this call, and the code
        # that fills self.x_values / self.probabilities, are not visible here.
        self.dist = hypergeom(M=self.M + self.n,


        self.recursive_var_basis(self.x_values, self.probabilities, self.order)
        self.create_norm_sq(self.x_values, self.probabilities)

        # The bounds are taken from the extreme values in x_values.
        self.low_approx = np.min(self.x_values)
        self.high_approx = np.max(self.x_values)
        self.std_bounds = (self.low_approx, self.high_approx)


        if self.type == UncertaintyType.EPISTEMIC:
            warn('The HypergeometricVariable is usually not epistemic. For an '
                 'epistemic variable, consider using the continuous uniform '
                 'distribution with type epistemic.')

        # NOTE(review): binds a local name only; presumably part of a
        # warning-formatting override whose other lines are not visible.
        showwarning = _warn
def calc_pvalue(gene_list, gene_set, M):
    """Hypergeometric over-representation p-value for two gene collections.

    Args:
        gene_list: iterable of query genes (duplicates are ignored).
        gene_set: iterable of genes in the reference set (duplicates are
            ignored).
        M: size of the background population both collections are drawn from.

    Returns:
        A ``(p_value, overlap)`` pair. ``p_value`` is P(X >= k), where k is
        the size of the observed overlap and X is the overlap expected when
        ``len(gene_list)`` genes are drawn at random from a population of
        ``M`` that contains ``len(gene_set)`` members of the set.
        ``overlap`` is the list of shared genes.
    """
    gene_list = set(gene_list)
    gene_set = set(gene_set)
    overlap = gene_list & gene_set
    k = len(overlap)
    # sf(k) is P(X > k) and would leave out the observed overlap itself;
    # sf(k - 1) is the "at least as extreme" tail P(X >= k).
    p_value = hypergeom(M, len(gene_set), len(gene_list)).sf(k - 1)
    return p_value, list(overlap)
 def test_hypergeometric(self):
     # Compare hypergeometric(N, K) against a table built from scipy: one row
     # per sample size 0..N, one column per success count 0..K.
     population, successes = 20, 3
     computed = hypergeometric(population, successes)
     support = range(0, successes + 1)
     reference_rows = []
     for draws in range(population + 1):
         reference_rows.append(
             hypergeom(population, successes, draws).pmf(support))
     reference = np.array(reference_rows)
     worst_error = np.abs(computed - reference).max()
     self.assertTrue(worst_error < 1e-10)
def calc_enrichment_score(n, o, M, N):
    """Return the hypergeometric enrichment p-value P(X >= o).

    Args:
        n: number of screened strains with the attribute.
        o: number of active strains with the attribute.
        M: number of strains screened.
        N: number of active strains.
    """
    # The survival function at o - 1 is P(X > o - 1), i.e. P(X >= o).
    return hypergeom.sf(o - 1, M, n, N)
 def variance_function(theta):
     """Sum pmf(x) * given_x_function(x, theta)**2 over a range of x.

     x follows a hypergeometric distribution built with M=M, n=P and
     N=round(theta * M). M, P, N and given_x_function are not parameters;
     they come from an enclosing scope that is not visible here.
     """
     rounded_m_theta = round(theta * M)
     TP_rv = hypergeom(M=M, n=P, N=round(theta * M))
     # x runs from max(0, rounded_m_theta - N) up to, but not including,
     # min(P + 1, rounded_m_theta + 1).
     # NOTE(review): the closing "])" of sum([ ... ]) is missing in this view.
     return sum([
         TP_rv.pmf(x) * (given_x_function(x, theta)**2)
         for x in range(int(max(0, rounded_m_theta - N)),
                        int(min((P + 1, rounded_m_theta + 1))))
 def calc_pvalue(query_id_set, reference_id_set, M):
     """Hypergeometric over-representation p-value for two id collections.

     Args:
         query_id_set: iterable of query ids (duplicates are ignored).
         reference_id_set: iterable of reference ids (duplicates are
             ignored).
         M: size of the background population.

     Returns:
         A ``(p_value, overlap)`` pair. ``p_value`` is P(X >= k) for the
         observed overlap size k when ``len(query_id_set)`` ids are drawn
         from ``M`` ids containing ``len(reference_id_set)`` reference ids.
         ``overlap`` is the list of shared ids.
     """
     query_id_set = set(query_id_set)
     reference_id_set = set(reference_id_set)
     N = len(query_id_set)
     n = len(reference_id_set)
     overlap = query_id_set & reference_id_set
     k = len(overlap)
     # sf(k) would be P(X > k), which drops the observed overlap from the
     # tail; the p-value needs P(X >= k) = sf(k - 1).
     return hypergeom(M, n, N).sf(k - 1), list(overlap)
 def hypergeometric_cdf(self, N, K, n, k):
     """Return the upper-tail probability P(X > k) of a hypergeometric draw.

     Despite the name, the returned value is the complement of the CDF,
     exactly as in the earlier ``1 - cdf(k)`` form.

     Args:
         N: total number of genes in population.
         K: number of GOA.
         n: size of the selected sample (top 50, bottom half, etc.).
         k: number of successes in the sample.

     Returns:
         P(X > k) for X drawn from scipy's ``hypergeom(N, K, n)``.
     """
     # sf() evaluates the tail directly. 1 - cdf(k) loses all precision once
     # the tail drops below about 1e-16.
     return hypergeom(N, K, n).sf(k)
    def get_probability_density_func(self):
        """Calculate the probabilities for the HypergeometricVariable.

        Evaluates the pmf of ``hypergeom(M=self.M + self.n, n=self.n,
        N=self.N)`` at every entry of ``self.x_values`` and stores the
        result in ``self.probabilities``. Nothing is returned.
        """
        dist = hypergeom(M=self.M + self.n, n=self.n, N=self.N)

        self.probabilities = dist.pmf(self.x_values)
 def hypergeometric_cdf(self, N, K, n, k):
     """Return the upper-tail probability P(X > k) of a hypergeometric draw.

     The name says "cdf", but the value has always been the complement of
     the CDF; that contract is kept.

     Args:
         N: total number of genes in population.
         K: number of GOA.
         n: size of the selected sample (top 50, bottom half, etc.).
         k: number of successes in the sample.

     Returns:
         P(X > k) for X drawn from scipy's ``hypergeom(N, K, n)``.
     """
     # Use the survival function rather than 1 - cdf(k): the subtraction
     # rounds very small tails to zero or to noise.
     tail = hypergeom(N, K, n).sf(k)
     return tail
def test_prob(num_hits, pop_size, num_draws, num_matching=1):
    """Perform a hypergeometric test to see the probability of drawing the
    same entity num_hits many times. Need to check math

    dist = hypergeom(pop_size, num_matching, num_draws)
    pval = dist.sf(num_hits - 1)
    return pval
def calc_pvalue(gene_list, gene_set, M):
    """Return the enrichment p-value and the genes shared by two collections.

    Args:
        gene_list: iterable of query genes; converted to a set.
        gene_set: iterable of reference genes; converted to a set.
        M: number of genes in the background population.

    Returns:
        ``(p_value, overlap)``. ``p_value`` is the hypergeometric tail
        P(X >= k) for the observed overlap size k, with ``len(gene_set)``
        successes in a population of ``M`` and ``len(gene_list)`` draws.
        ``overlap`` is the list of genes present in both collections.
    """
    query = set(gene_list)
    reference = set(gene_set)
    shared = query & reference
    # The tail must include the observed count: P(X >= k) is sf(k - 1),
    # whereas sf(k) is only P(X > k).
    tail = hypergeom(M, len(reference), len(query)).sf(len(shared) - 1)
    return tail, list(shared)
def run( domain_name = 'X', projection_name = 'Y8'  ):
  """Plot clusters in two projections and compare them with tissue classes.

  Loads 'prob2.mat', scatters the first two coordinates of the chosen domain
  and of the projection domain (coloured by cluster), saves the figure to
  'figs/cluster_projectsions.tiff', prints how many sample pairs share a
  cluster and/or a tissue category, and returns a frozen hypergeometric
  distribution built from those pair counts.

  domain_name and projection_name must each be one of
  'X', 'XV', 'Y', 'Y8', 'Y12'.
  """
  prob2 = sio.loadmat('prob2.mat')
  domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']
  domains = [prob2.get(d) for d in domain_names]
  # NOTE(review): domain_clusters is only assigned in the commented-out line
  # below but is read a few lines further down; unless it is defined
  # elsewhere in the module this raises NameError.
  #domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
  tissue_clusters = prob2.get('tissue_category')

  clusters = domain_clusters[domain_names.index(domain_name)]
  pdom = domains[domain_names.index(projection_name)]
  cdom = domains[domain_names.index(domain_name)]

  f = plt.figure(1)
  ct = array(mc.getct(218))
  #px, py = 2, 2
  # Subplot position codes '211' .. '214'.
  sstrings = ['21{0:d}'.format(i+1) for i in range(4)]
  # NOTE(review): 'dom' is not defined in this function -- presumably cdom or
  # pdom was meant; confirm.
  inds = arange(shape(dom)[1])
  # Shift the stored ids down by one to get zero-based indices.
  c_inds = array(clusters).flatten() -1
  tc_inds = tissue_clusters.flatten() -1

  colors = ct[c_inds,:]

  ax = f.add_subplot(sstrings[0], title = \
                       'Clusters from genespace affinity. Projection to first two elements')  
  ax.scatter(*cdom[inds,0:2].T,s= 100, c = colors)
  ax = f.add_subplot(sstrings[1], title = \
                     'Clusters from genespace affinity. Projection to MVE')  
  ax.scatter(*pdom[inds,0:2].T,s= 100, c = colors)

  # Every unordered index pair (ix < iy) whose two members share a cluster,
  # encoded as the string 'ixxiy'.
  cpairs = set(['{0:d}x{1:d}'.format(ix,iy) 
                for ix, x in enumerate(c_inds) for iy, y in enumerate(c_inds)
                if ix < iy and x == y ])
  # The same construction for tissue categories.
  tcpairs = set(['{0:d}x{1:d}'.format(ix,iy) 
                for ix, x in enumerate(tc_inds) for iy, y in enumerate(tc_inds)
                if ix < iy and x == y ])
  f.savefig('figs/cluster_projectsions.tiff',format = 'tiff')
  # Number of unordered pairs among len(tc_inds) items: (n*n - n) / 2.
  max_pairs =( len(tc_inds) * len(tc_inds)  - len(tc_inds)) / 2
  total_pairs = len(cpairs.union(tcpairs))
  shared_pairs =len(cpairs.intersection(tcpairs))

  print 'using affinity propagation with affinites over domain {0}'.format(domain_name)
  print 'found'
  print ' max pairs: {0}'.format(max_pairs)
  print ' total pairs: {0}'.format(total_pairs)
  print ' tissue pairs: {0}'.format(len(tcpairs))
  print ' cluster pairs: {0}'.format(len(cpairs))
  print ' shared pairs: {0}'.format(shared_pairs)

  # NOTE(review): scipy's hypergeom takes (population, successes, draws).
  # Here the tissue-pair count is passed as the population and max_pairs as
  # the number of draws; max_pairs looks like the intended population --
  # verify the argument order.
  hg =  hypergeom( len(tcpairs), len(cpairs), max_pairs )
  return hg
def p_value(num_genes,
    # NOTE(review): the rest of the signature is missing in this view. The
    # body reads total_genes, num_top_genes and num_genes_int_top_list, which
    # are presumably the remaining parameters.
    rv = hypergeom(total_genes, num_top_genes, num_genes)
    # sf(k - 1) is P(X > k - 1), i.e. the probability of at least k hits.
    p = rv.sf(num_genes_int_top_list - 1)
    if isnan(p):  # old version of hypergeom.sf() gives NaN, yuck
        # Fall back to summing the pmf over the same tail, k .. num_genes.
        p = rv.pmf(range(num_genes_int_top_list, num_genes + 1)).sum()
    return p
def hyper_test2(X, K, n, N):
    """Hypergeometric test for underexpression.

    Gives the probability that there are X or fewer events A over n
    occurrences, given the total number of events K over the total number
    of occurrences N. Note that this is not one minus the overexpression
    test, because the probability that x equals X is included here.

    Args:
        X: number of events.
        K: total number of events.
        n: number of occurrences.
        N: total number of occurrences.

    Returns:
        P(x <= X); 0 when X is negative.
    """
    # One cdf call replaces the term-by-term pmf sum, which rebuilt the
    # frozen distribution for every x.
    return hypergeom(N, n, K).cdf(X)
 def test_rvs(self):
     vals = stats.hypergeom.rvs(20, 10, 3, size=(2, 50))
     assert numpy.all(vals >= 0) & numpy.all(vals <= 3)
     assert numpy.shape(vals) == (2, 50)
     assert vals.dtype.char in typecodes["AllInteger"]
     val = stats.hypergeom.rvs(20, 3, 10)
     assert isinstance(val, int)
     val = stats.hypergeom(20, 3, 10).rvs(3)
     assert isinstance(val, numpy.ndarray)
     assert val.dtype.char in typecodes["AllInteger"]
def hyper_test(X, K, n, N):
    """Hypergeometric test for overexpression.

    Gives the probability that there are X or more events A over n
    occurrences, given the total number of events K over the total number
    of occurrences N.

    Args:
        X: number of events.
        K: total number of events.
        n: number of occurrences.
        N: total number of occurrences.

    Returns:
        P(x >= X); 1 when X is zero or negative.
    """
    # sf(X - 1) is P(x > X - 1) = P(x >= X). It replaces 1 - sum(pmf), which
    # rebuilt the distribution for every term and lost precision for small
    # tails.
    return hypergeom(N, n, K).sf(X - 1)
    def baseline(self):
        """Return the baseline performance vector.

        The baseline is obtaining OOT posts by chance. Thus, the baseline
        performance vector is the probability mass function of a
        hypergeometric random variable denoting the number of OOT posts in
        the top N list. The k-th element represents the probability of
        getting k OOT posts in the top N list.

        The pmf of ``hypergeom(self.M, self.n, self.N)`` is evaluated at the
        integers ``self.min_sup`` .. ``self.max_sup`` inclusive.
        """
        rv = hypergeom(self.M, self.n, self.N)
        k = np.arange(self.min_sup, self.max_sup+1)
        return rv.pmf(k)
    def combo_in_top(n):
        """Estimate the chance of holding both combo cards in the top n cards.

        Builds one hypergeometric distribution each for card A, card B and
        lands, using n as the number of draws. N, nA, nB and nLands come
        from the enclosing scope. As visible here the function prints p_and
        and has no return statement.
        """
        prbA = hypergeom(N, nA, n)
        prbB = hypergeom(N, nB, n)
        prbL = hypergeom(N, nLands, n)

        # cdf is probabiliyt of k or fewer successes
        # prb.cdf(0)

        p_L_eq0 = prbL.cdf(0)
        p_L_le1 = prbL.cdf(1)
        p_L_le4 = prbL.cdf(4)
        # having between 2 to 4 lands
        p_L_ge2_le4 = p_L_le4 - p_L_le1
        p_keepable = p_L_ge2_le4

        # probability of having none
        p_A_eq0 = prbA.cdf(0)
        p_B_eq0 = prbB.cdf(0)
        # probability of having at least 1
        p_A_ge1 = 1 - p_A_eq0
        p_B_ge1 = 1 - p_B_eq0
        # http://math.stackexchange.com/questions/72589/calculating-probability-of-at-least-one-event-occurring

        # Algebraically this equals (1 - p_A_fail) * (1 - p_B_fail).
        def p_not_any_fail(p_A_fail, p_B_fail):
            p_and = (1 - p_A_fail) + (1 - p_B_fail) - (1 - p_A_fail * p_B_fail)
            return p_and

        # The next two assignments and the one three lines further down are
        # the same quantity written three ways; only the last one is printed.
        p_and = (1 - p_A_eq0) + (1 - p_B_eq0) - (1 - p_A_eq0 * p_B_eq0)
        p_and = p_A_eq0 * p_B_eq0 - p_A_eq0 - p_B_eq0 + 1

        # Multiplying the two "none" probabilities treats A and B as
        # independent.
        p_nor = p_A_eq0 * p_B_eq0  # chance_of_neither_combo_card
        p_or = 1 - p_nor  # chance of either card
        p_and = p_A_ge1 + p_B_ge1 - p_or  # chance of both cards
        p_xor = p_or - p_and  # change of either A or B but not both

        print("p_and = %r" % (p_and,))

        # NOTE(review): the result of this call is discarded, and p_L_eq0 and
        # p_xor are never read -- the function may be truncated here; confirm
        # whether a return value was intended.
        p_not_any_fail(1 - p_and, 1 - p_keepable)
def hg_p_value(n_parent,k_parent,n_child,k_child):
    """One-tailed hypergeometric hypothesis test.

    H0: the partition is random, i.e. the child is a random draw of
    ``n_child`` items from a parent of ``n_parent`` items that contains
    ``k_parent`` successes.

    Returns:
        The probability, under H0, of a result at least as extreme as
        ``k_child``: the lower tail P(X <= k_child) when ``k_child`` is at
        or below the expected count, otherwise the upper tail
        P(X >= k_child). The value is clamped at 0.0 from below.
    """
    hg = spst.hypergeom(n_parent,k_parent,n_child)
    parent_mean = n_child*(k_parent*1.0/n_parent)
    if k_child <= parent_mean:
        # At or below the expectation: lower tail, including k_child.
        return max(0.0,hg.cdf(k_child))
    else:
        # Above the expectation: upper tail, including k_child.
        # sf(k_child - 1) is P(X >= k_child) without the cancellation of
        # 1 - cdf(k_child - 1).
        return max(hg.sf(k_child-1),0.0)
def get_sender_pvals(addrCounts):
    """Compute lower- and upper-tail hypergeometric p-values per key.

    Args:
        addrCounts: mapping from a key to a sequence whose first two items
            are read as ``(t[0], t[1])``.

    Returns:
        ``(low, high)``: two lists with one ``(p, t[0], t[1], key)`` tuple
        per entry, in iteration order. For each entry the distribution is
        ``hypergeom(sum of all t[1], sum of all t[0], t[1])``; ``low`` holds
        P(X <= t[0]) and ``high`` holds P(X >= t[0]).
    """
    first_total = sum(entry[0] for entry in addrCounts.values())
    second_total = sum(entry[1] for entry in addrCounts.values())
    lower_tail, upper_tail = [], []
    for key, entry in addrCounts.items():
        observed, drawn = entry[0], entry[1]
        dist = stats.hypergeom(second_total, first_total, drawn)
        lower_tail.append((dist.cdf(observed), observed, drawn, key))
        upper = dist.sf(observed - 1)
        if isnan(upper):
            # Old versions of hypergeom.sf() could give NaN; sum the pmf over
            # the same tail instead.
            upper = dist.pmf(range(observed, drawn + 1)).sum()
        upper_tail.append((upper, observed, drawn, key))
    return lower_tail, upper_tail
def calculate_enrichment(pathway_matrix, gene_set):
    """Calculate hypergeometric enrichment of the set for each pathway.

    The pathway matrix should have pathways in rows and genes in columns.

    Returns a Series of mid-p-values (survival function + 0.5 * pmf at the
    observed hit count), indexed like the rows of ``pathway_matrix``.
    """
    # only consider genes which are known to be in pathways
    pathway_gene_list = gene_set.intersection(pathway_matrix.columns)
    # Generate hypergeometric distributions for each pathway. Each
    # pathway needs its own because they have different lengths
    # NOTE(review): the hypergeom(...) call below is cut off in this view --
    # its third argument (number of draws) and closing parenthesis are
    # missing; presumably len(pathway_gene_list), confirm against the
    # original.
    distributions = [hypergeom(len(pathway_matrix.columns), l,
                     for l in pathway_matrix.sum(axis=1)]
    # Number of genes from the set found in each pathway (row-wise sum over
    # the selected gene columns).
    pathway_hits = pathway_matrix[pathway_gene_list].sum(axis=1)
    # Each p-value for the hypergeometric enrichment is
    # survival function + 0.5 * pmf
    significance = [dist.sf(x) + 0.5 * dist.pmf(x)
                    for x, dist in zip(pathway_hits, distributions)]
    return Series(significance, index=pathway_matrix.index)
def Hypergeometric(N, n, K, tag=None):
    """
    A Hypergeometric random variate

    Parameters
    ----------
    N : int
        The total population size
    n : int
        The number of individuals of interest in the population
    K : int
        The number of individuals that will be chosen from the population
    tag : optional
        Passed through unchanged to ``uv``.

    Returns
    -------
    The object produced by ``uv`` wrapping ``ss.hypergeom(N, n, K)``.

    Raises
    ------
    AssertionError
        If any of ``N``, ``n``, ``K`` is not integer-valued, is not
        positive, or if ``n`` or ``K`` exceeds ``N``.

    Example
    -------
    (Taken from the wikipedia page) Assume we have an urn with two types of
    marbles, 45 black ones and 5 white ones. Standing next to the urn, you
    close your eyes and draw 10 marbles without replacement. What is the
    probability that exactly 4 of the 10 are white?

    Here ``H`` stands for this function::

        >>> black = 45
        >>> white = 5
        >>> draw = 10

        # Now we create the distribution
        >>> h = H(black + white, white, draw)

        # To check the probability, in this case, we can use the underlying
        #  scipy.stats object
        >>> h.rv.pmf(4)  # What is the probability that white count = 4?

    The probability works out to roughly 0.00396.
    """
    assert (
        int(N) == N and N > 0
    ), 'Hypergeometric total population size "N" must be an integer greater than zero.'
    assert (
        int(n) == n and 0 < n <= N
    ), 'Hypergeometric interest population size "n" must be an integer greater than zero and no more than the total population size.'
    assert (
        int(K) == K and 0 < K <= N
    ), 'Hypergeometric chosen population size "K" must be an integer greater than zero and no more than the total population size.'
    return uv(ss.hypergeom(N, n, K), tag=tag)
def enrichment(dbfx,inp):
	"""Compute a hypergeometric probability for every database entry hit by the input.

	dbfx is handed to ``database`` and inp to ``getinput``; both helpers are
	defined elsewhere. "Enrichment_results.csv" is opened for writing (and
	therefore truncated) for the duration of the call.

	NOTE(review): as visible here nothing is written to the output file and
	the computed probability is not used or returned -- the function looks
	unfinished or truncated; confirm the intended output.
	"""
	print("Database :"+str(dbfx))
	print("Input file:"+str(inp))
	# The context manager guarantees the handle is closed, including when
	# getinput() or database() raises; previously it was never closed.
	with codecs.open("Enrichment_results.csv",'w',encoding = "utf8") as fout:
		# Local name chosen so the builtin input() is not shadowed.
		query = set(getinput(inp))
		db,glst = database(dbfx)
		M = len(glst)
		for d in db.keys():
			members = set(db[d])
			overlap = len(query.intersection(members))
			if overlap > 0:
				ora = hypergeom(M,len(members),len(query))
				# NOTE(review): pmf is the probability of exactly this overlap; an
				# over-representation p-value would normally be the tail
				# sf(overlap - 1) -- confirm which is wanted.
				p = ora.pmf(overlap)
def partition_htest_value(n_parent,k_parent,n_child,k_child,alpha,cache=False):
    """Test a partition of k_parent +1s in n_parent (+1/-1)s.

    H0 is that the partition is random: the child is a random draw of
    ``n_child`` items from the parent, and ``k_child`` of them are +1.

    Args:
        n_parent: number of items in the parent.
        k_parent: number of +1 items in the parent.
        n_child: number of items in the child.
        k_child: number of +1 items observed in the child.
        alpha: two-sided significance level.
        cache: when False (default) the hypergeometric CDF is evaluated
            directly. Otherwise the acceptance interval is obtained from
            ``partition_htest`` and memoised in ``_h_test_dict``.

    Returns:
        True when ``k_child`` falls outside the (1 - alpha) acceptance
        region for H0, False otherwise.
    """
    if n_child == 1:
        # Single-item child: the outcome is 0 or 1, and P(+1) is the parent
        # fraction. The observed outcome is k_child; testing k_parent here
        # (as before) made the second branch impossible to reach and ignored
        # the observation entirely.
        frac_plus = 1.0*k_parent/n_parent
        if frac_plus < alpha/2.0 and k_child == 1:
            return True
        elif frac_plus > 1-alpha/2.0 and k_child == 0:
            return True
        else:
            return False
    # Deliberately "== False" rather than "not cache", so that values such
    # as None still select the cached path exactly as before.
    if cache == False:
        hg = spst.hypergeom(n_parent,k_parent,n_child)
        c = hg.cdf([k_child-1,k_child])
        #okay, so c is the cdf INCLUDING k.
        #if that's greater than alpha/2, then we throw away the coeff.
        #if that's less than 1-alpha/2 we don't know. But if the cdf for one
        #less is less than 1-alpha/2, we throw away the coeff.
        return not ((c[1] > alpha/2.0) and (c[0] < (1.0-alpha/2.0)))
    else:
        hgt = (n_parent,k_parent,n_child,alpha)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(hgt)
        if hgt in _h_test_dict:
            left,right = _h_test_dict[hgt]
            print("saved one")
        else:
            left,right = partition_htest(*hgt)
            _h_test_dict[hgt] = (left,right)
        # Both endpoints equal to -1 mean False; otherwise the chain below
        # decides on which side of the interval k_child lies.
        if left == -1 and right == -1:
            return False
        elif left == -1 and k_child > right:
            return True
        elif k_child < left and right == -1:
            return True
        elif k_child < left or k_child > right:
            return True
        else:
            return False
    def calculateByDraw(self):
        """Fill the probability table and bar chart for a single pool of outs.

        Reads the deck size, number of outs and number of draws from the
        spin boxes, evaluates the hypergeometric pmf and survival function
        for each success count, writes them into self.probabilityTable and
        draws two bar series on self.figure.
        """
        # Method to calculate and display the probabilites for the number of successful draws out of a single pool of OUTS

        # hypergeometric formula: assumes draws are just total # of draws
        # NOTE(review): this call is cut off in this view -- the third argument
        # (presumably the number of draws) and the closing parenthesis are
        # missing.
        rv = hypergeom(self.remainingDeckSizeSpinBox.value(), self.numberOfOutsInDeckSpinBox.value(),
        outs = np.arange(0, self.numberOfOutsInDeckSpinBox.value() + 1)
        draws = np.arange(0, self.numberOfDrawsSpinBox.value() + 1)
        PMF = rv.pmf(outs)

        # np.empty leaves these uninitialised; only the entries visited by the
        # loop further down are assigned.
        odds = np.empty(self.numberOfOutsInDeckSpinBox.value() + 1, dtype=float)
        surviveodds = np.empty(self.numberOfOutsInDeckSpinBox.value()+1, dtype=float)
        # One extra row for the header line.
        self.probabilityTable.setRowCount(len(outs) + 1)

        # initialize the table. This code looks terrible

        # Put an empty item into every cell, column by column.
        i = 0
        j = 0
        while i < self.probabilityTable.columnCount():
            while j < self.probabilityTable.rowCount():
                matrixElement = QtGui.QTableWidgetItem()
                self.probabilityTable.setItem(j, i, matrixElement)
                j += 1
            i += 1
            j = 0

        # Header row: "Exactly" / "Probability" / "At Least" / "Probability".
        item = self.probabilityTable.item(0, 0)
        item.setText(_translate("MainWindow", "Exactly", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(0, 1)
        item.setText(_translate("MainWindow", "Probability", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(0, 2)
        item.setText(_translate("MainWindow", "At Least", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(0, 3)
        item.setText(_translate("MainWindow", "Probability", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)

        #Loop to build the table and calculate the PMF and SF based on inputs
        # NOTE(review): both assignments sit inside the if branch, so
        # maxSuccess ends up as outs when the condition holds and is unbound
        # otherwise -- an "else:" before the second assignment appears to be
        # missing.
        if len(outs)> len(draws):
            maxSuccess = draws
            maxSuccess = outs
        for out in maxSuccess:
            # creating spaces in table
            matrixElement = QtGui.QTableWidgetItem()
            self.probabilityTable.setItem(out + 1, 0, matrixElement)
            matrixElement = QtGui.QTableWidgetItem()
            self.probabilityTable.setItem(out + 1, 1, matrixElement)

            # Calculate PMF for each out
            odds[out] = rv.pmf(out)
            # sf(out) is P(X > out); the "At Least" column below therefore
            # reads the entry for out - 1.
            surviveodds[out] = rv.sf(out)

            # populate the table
            # exact outs
            item = self.probabilityTable.item(out + 1, 0)
            item.setText(_translate("MainWindow", "{0:d}".format(out), None))
            item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
            item = self.probabilityTable.item(out + 1, 1)
            item.setText(_translate("MainWindow", "{0:3f}".format(odds[out]), None))
            item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)

            # Atleast outs
            if out != 0:  # atleast zero outs is meaningless
                item = self.probabilityTable.item(out + 1, 2)
                item.setText(_translate("MainWindow", "{0:d}".format(out), None))
                item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
                item = self.probabilityTable.item(out + 1, 3)
                item.setText(_translate("MainWindow", "{0:3f}".format(surviveodds[out-1]), None))
                item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
        # create the pmf graph)

        # NOTE(review): the message says "at least" but odds[out] is the exact
        # pmf value for the last loop value of out -- confirm which was meant.
        print("The odds of getting at least {0:d} outs is {1:3f}".format(out, odds[out]))

        #### Bar graphs if I want to add them later

        # Red bars: exact probabilities; blue bars: survival values, shifted
        # right by 1.2 * width + 1.
        ax = self.figure.add_subplot(111)
        width = 0.3
        ax.bar(outs, PMF, width, color = 'r' )
        ax.bar(outs+(1.2*width+1), surviveodds, width, color = 'b')
        ax.set_xticks(outs + width)
import matplotlib.pyplot as plt
from scipy.stats import hypergeom, rv_discrete
import numpy as np
# Demo script: plot a frozen hypergeometric pmf, compare cdf against ppf, draw
# random variates and build a custom discrete distribution.
numargs = hypergeom.numargs
# NOTE(review): M, n and N are only assigned in the commented-out line below,
# yet they are used by the cdf/ppf/rvs calls further down; as written those
# calls raise NameError.
#[ M, n, N ] = [100, 10, -1]

#Display frozen pmf:

# NOTE(review): scipy's hypergeom takes (population, successes, draws); 20
# successes in a population of 10 looks like swapped arguments -- confirm.
rv = hypergeom( 10, 20, 3 )
print rv.dist.b
# NOTE(review): the second positional argument of np.min is the axis, not a
# second value to compare with -- presumably min( rv.dist.b, 3 ) was meant.
x = np.arange( 0, np.min( rv.dist.b, 3 ) + 1 )
h = plt.plot( x, rv.pmf( x ) )
#Check accuracy of cdf and ppf:

# ppf applied to cdf(x) should give x back; the plot shows the absolute
# difference (plus 1e-20 so the log axis has no zeros).
prb = hypergeom.cdf( x, M, n, N )
h = plt.semilogy( np.abs( x - hypergeom.ppf( prb, M, n, N ) ) + 1e-20 )

#Random number generation:

R = hypergeom.rvs( M, n, N, size=100 )

#Custom made discrete distribution:

# vals[0] holds the support 0..6 and vals[1] the matching probabilities.
vals = [np.arange( 7 ), ( 0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1 )]
custm = rv_discrete( name='custm', values=vals )
h = plt.plot( vals[0], custm.pmf( vals[0] ) )

 def sample(self, x=None):
     """Draw random variates from ``hypergeom(self.M, self.X, self.m)``.

     ``x`` is passed to scipy as the ``size`` of the draw (``None`` gives a
     single value) and ``self.random`` as the ``random_state``.
     """
     frozen = hypergeom(self.M, self.X, self.m)
     draws = frozen.rvs(size=x, random_state=self.random)
     return draws
def calculate_enrichment(N=100):
    """Score GO-term enrichment among top- and bottom-ranked genes per experiment.

    Reads GO ids from "go_info.txt" (first whitespace-separated field of
    each line), counts GO ids among ranked genes for each experiment, turns
    the counts into hypergeometric survival values and returns the two
    dictionaries ``(goid_prob_top, goid_prob_bot)`` keyed by GO id.

    N is used as the number of draws of the hypergeometric distribution.

    NOTE(review): the original author marked this body as a placeholder;
    several statements below look incorrect and are flagged individually.
    gene_to_go and go_to_gene are not defined in this function.
    """
    # You need to replace this with something useful
    experiment_dict = eas.experiment();
    counttop100 = {}
    countbot100 = {}
    goid_prob_top = {}
    goid_prob_bot = {}
#initialize dictionary of goids
    with open("go_info.txt", 'r') as target:
        for line in target:
            lines = line.split();
            counttop100[lines[0]] = np.zeros(32) #goid counts in top 100
            countbot100[lines[0]] = np.zeros(32) #goid counts in top 100
            goid_prob_top[lines[0]] = np.zeros(32) #goid prob
            goid_prob_bot[lines[0]] = np.zeros(32) #goid prob
    #add values for goid
    # NOTE(review): the arrays above have 32 slots but i runs over 0..32
    # (33 values), so i == 32 would index past the end.
    for i in range(0, 33):
        # NOTE(review): if experiment_dict[i] is a list, .sort() sorts in place
        # and returns None, so sorted_genes would be None -- verify.
        sorted_genes = experiment_dict[i].sort(key=lambda tup: tup[1])
        top100 = sorted_genes[0:100]
        # NOTE(review): a slice ending at 0 is empty; [-100:] was probably
        # meant.
        bot100 = sorted_genes[-100:0]
        #go hrough top 100, want list of goIds, count for top/bot100
        for j in range(0, 100):
            counttop100[ gene_to_go[ top100[j][0] ] ][i] += 1
            # NOTE(review): the bottom counter is also fed from top100; bot100
            # is never read.
            countbot100[ gene_to_go[ top100[j][0] ] ][i] += 1
        #hypergeom, what is the probability i got that many counts from top 100
        for j in counttop100:
            gene_per_go = len(go_to_gene[j])
            # 4767 is used as the population size.
            [ M, n, N] = [4767, gene_per_go, N];
            rv = scistat.hypergeom(M, n, N)
            x = np.arange(0, counttop100[j][i] + 1)
            # NOTE(review): rv.sf(x) is an array with one value per entry of x,
            # but it is assigned to a single array element below.
            survival_exp = rv.sf(x)
            goid_prob_top[j][i] = survival_exp

        # Same computation for the bottom counts.
        for j in countbot100:
            gene_per_go = len(go_to_gene[j])
            [ M, n, N] = [4767, gene_per_go, N];
            rv = scistat.hypergeom(M, n, N)
            x = np.arange(0, countbot100[j][i] + 1)
            survival_exp = rv.sf(x)
            goid_prob_bot[j][i] = survival_exp

#for j in experiment_dict[i]:
        #    mainstuff[ gene_to_go[j[0]] ][i] += j[1]
#sort by exp values
#        mainstuff.sort(key=lambda tup: tup[1])
#take top 100

    # NOTE(review): pcolor is handed the dictionaries themselves rather than
    # a 2-D array -- confirm this is what was intended.
    heatmaptop = plt.pcolor(goid_prob_top);
    heatmapbot = plt.pcolor(goid_prob_bot);

    positive_enrichment_scores = goid_prob_top;
    negative_enrichment_scores = goid_prob_bot;

    return positive_enrichment_scores,negative_enrichment_scores
    clu_index = 0
    for clu in clusters:
        clu_index += 1
        if clu_index > 100: break
        clu_set = clu

        clu_sig_score = 0
        clu_sig_smallest = -1;
        for bic in biclusters:
            bic_size = len(bic)
            clu_size = len(clu_set)
            overlap = len(bic & clu_set)

            hyper = hypergeom(OPR_COUNT, bic_size, clu_size)
            hypersf = hyper.sf(overlap)
            #print("%d\t%d\t%d\t%.3f" % (bic_size, clu_size, overlap, hypersf))
            if clu_sig_smallest < 0:
                clu_sig_smallest = hypersf
            elif clu_sig_smallest > hypersf:
                clu_sig_smallest = hypersf

            if hypersf < P_CUTOFF:
                if hypersf <= 0: hypersf = P_CUTOFF
                clu_sig_score += -math.log10(hypersf)

        clu_sig_score_p = 0
        clu_sig_smallest_p = -1;
        for bic in biclusters_p:
    for index,clu in enumerate(clusters):
        if len(clu) > 1:
            for reg_name in regulon.keys():
                reg = regulon[reg_name]
                clu_size = len(clu)
                reg_size = len(reg)
                overlap  = len(clu & reg)
                union    = len(clu | reg)

                coe1 = overlap_coe1(overlap, clu_size, reg_size)
                coe2 = overlap/float(union)

                if coe1 < 0.1 or coe2 < 0.1: continue

                rv = hypergeom(OPERON_COUNT, reg_size, clu_size)

                print("%s\t%s\t%i\t%i\t%i\t%.3f\t%.3f\t%g" %
                        (index + 1,