def _jd_at_once(n1, k1, n2, k2, n3):
    """Joint pmf of two hypergeometric counts that share the sample size n3.

    The first count follows hypergeom(n1, k1, n3), the second
    hypergeom(n2, k2, n3).  The joint table is the outer product of the two
    marginal pmfs, with the cells where both element-wise minima below are
    zero removed, then renormalised to sum to one.

    Results are memoised in the module-level ``joint_cache`` keyed by the
    argument tuple; cached arrays are returned as-is (not copied).

    Returns
    -------
    (jpmf, mrange1, mrange0)
        jpmf    -- renormalised joint probability table
        mrange1 -- element-wise min of the two success counts
        mrange0 -- element-wise min of the two failure counts (n3 - successes)
    """
    cache_key = (n1, k1, n2, k2, n3)
    if cache_key in joint_cache:
        return joint_cache[cache_key]

    dist_a = spst.hypergeom(n1, k1, n3)
    dist_b = spst.hypergeom(n2, k2, n3)
    support_a = np.arange(0, min(n3 + 1, k1 + 1))
    support_b = np.arange(0, min(n3 + 1, k2 + 1))

    # With zero successes in the population the count is 0 with certainty;
    # that point mass is written out explicitly instead of asking scipy.
    if k1 == 0:
        pmf_a = np.zeros(len(support_a))
        pmf_a[0] = 1.0
    else:
        pmf_a = dist_a.pmf(support_a)

    if k2 == 0:
        pmf_b = np.zeros(len(support_b))
        pmf_b[0] = 1.0
    else:
        pmf_b = dist_b.pmf(support_b)

    joint = np.outer(pmf_a, pmf_b)
    min_hits = np.minimum.outer(support_a, support_b)
    min_misses = np.minimum.outer(n3 - support_a, n3 - support_b)

    # Cells where both minima vanish are excluded before renormalising.
    excluded = np.logical_and(min_hits == 0, min_misses == 0)
    joint[excluded] = 0.0
    joint /= np.sum(joint)

    joint_cache[cache_key] = (joint, min_hits, min_misses)
    return joint, min_hits, min_misses
def coco_stats():
    """
    Plot P(at least k creature hits) for a 6-card look into a 60-card deck.

    One curve is drawn per creature count from 15 to 29, for k = 1 and 2.
    Nothing is returned.

    Reference calculator:
        http://stattrek.com/online-calculator/hypergeometric.aspx

    CommandLine:
        python -m mtgmonte.stats --exec-coco_stats --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = coco_stats()
        >>> print(result)
        >>> ut.show_if_requested()
    """
    import plottool as pt
    from scipy.stats import hypergeom

    deck_size = 60  # cards in deck
    creature_count = 21  # number of creatures in deck
    cards_seen = 6  # cards seen by coco

    reference_dist = hypergeom(deck_size, creature_count, cards_seen)
    wanted_hits = 1  # number of hits you want

    # Worked example of the identities used below (values are not used).
    p_exactly = reference_dist.pmf(wanted_hits)  # P(X = k)
    p_more_than = 1 - reference_dist.cdf(wanted_hits)  # P(X > k)
    p_at_least = p_more_than + p_exactly  # P(X >= k)  # NOQA

    def at_least(hits, dist=reference_dist):
        # P(X >= hits) = P(X > hits) + P(X = hits)
        return (1 - dist.cdf(hits)) + dist.pmf(hits)

    pt.ensure_pylab_qt4()

    import numpy as np

    hit_counts = np.arange(1, 3)
    creature_options = np.arange(15, 30)
    label_list = [str(count) + " creatures in deck" for count in creature_options]
    ydata_list = [
        at_least(hit_counts, dist=hypergeom(deck_size, count, cards_seen))
        for count in creature_options
    ]

    pt.multi_plot(
        hit_counts,
        ydata_list,
        label_list=label_list,
        title="probability of at least k hits with coco",
        xlabel="k",
        ylabel="prob",
        num_xticks=len(hit_counts),
        use_darkbackground=True,
    )
def fitness_group(self, x, i, j, *args):
    """
    Fitness difference between strategies i and j under group interactions.

    The population holds x i-strategists and (Z - x) j-strategists; groups of
    ``self.N`` players are sampled without replacement, so the number of
    i-strategists among the other ``self.N - 1`` group members is
    hypergeometric.

    Parameters
    ----------
    x : int
        number of individuals adopting strategy i in the population
    i : int
        index of strategy i
    j : int
        index of strategy j
    args : List
        Extra positional arguments forwarded to the payoff functions stored
        in ``self.payoffs``.

    Returns
    -------
    float
        Expected payoff of strategy i minus expected payoff of strategy j.
    """
    group_size = self.N
    others = np.arange(0, group_size, dtype=np.int32)
    # A focal i-player leaves x - 1 other i-players in a pool of Z - 1;
    # a focal j-player leaves all x of them.
    weights_i = hypergeom(self.Z - 1, x - 1, group_size - 1).pmf(others)
    weights_j = hypergeom(self.Z - 1, x, group_size - 1).pmf(others)

    total_i = 0
    total_j = 0
    for count, w_i, w_j in zip(others, weights_i, weights_j):
        total_i += self.payoffs[i, j](count + 1, group_size, *args) * w_i
        total_j += self.payoffs[j, i](group_size - count, group_size, *args) * w_j
    return total_i - total_j
def plot_hypergeom(M, N, n):
    """Draw the pmf of hypergeom(M=M, n=n, N=N) on the current pyplot axes.

    A translucent grey line is drawn over 0..min(n, N) and 'o' markers over
    0..n.  Arguments use scipy's parameter names.  The legend text is filled
    with (N, M, n) in that order, i.e. it shows scipy's N as "n", M as "N"
    and n as "M" -- presumably a different naming convention; confirm before
    changing.
    """
    dist = hypergeom(M=M, n=n, N=N)
    line_points = range(min(n, N) + 1)
    marker_points = range(n + 1)
    legend_text = '$n={0},N={1},M={2}$'.format(N, M, n)

    plt.plot(line_points, dist.pmf(line_points), alpha=0.6, color='gray')
    plt.plot(marker_points, dist.pmf(marker_points), 'o', label=legend_text)
def test_entropy(self): # Simple tests of entropy. hg = stats.hypergeom(4, 1, 1) h = hg.entropy() expected_p = np.array([0.75, 0.25]) expected_h = -np.sum(xlogy(expected_p, expected_p)) assert_allclose(h, expected_h) hg = stats.hypergeom(1, 1, 1) h = hg.entropy() assert_equal(h, 0.0)
def land_stats():
    """
    Plot expected lands drawn per turn for several land counts.

    For a 60-card deck with 24 to 27 lands, plots the mean (with standard
    deviation as spread) of the number of lands among the first ``turn + 7``
    cards for turns 0..14, then overlays two reference curves labelled
    "minimum ok" and "maximum ok" on the same figure.  Nothing is returned.

    Reference calculator:
        http://stattrek.com/online-calculator/hypergeometric.aspx

    CommandLine:
        python -m mtgmonte.stats --exec-land_stats --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = land_stats()
        >>> print(result)
        >>> ut.show_if_requested()
    """
    import plottool as pt
    from scipy.stats import hypergeom

    pt.ensure_pylab_qt4()

    deck_size = 60
    land_counts = range(24, 27 + 1)
    # Alternative limited-format setup: deck_size = 40, lands 15..18.
    turns = range(0, 15)

    # One frozen distribution per (land count, turn); 7 is added to the turn
    # index to get the number of cards seen.
    per_land = [
        [hypergeom(deck_size, lands, turn + 7) for turn in turns]
        for lands in land_counts
    ]
    ydata_list = [[dist.expect() for dist in row] for row in per_land]
    spread_list = [[dist.std() for dist in row] for row in per_land]

    import numpy as np

    label_list = ["%d lands" % (lands,) for lands in land_counts]
    pt.multi_plot(
        turns,
        ydata_list,
        spread_list=spread_list,
        label_list=label_list,
        num_xticks=15,
        num_yticks=13,
        fnum=1,
    )

    turn_array = np.array(turns)
    lower_cap = [1, 2, 3, 4, 5, 6] + [6] * (len(turns) - 6)
    min_lands_acceptable = np.minimum(turn_array, lower_cap)
    max_lands_acceptable = (np.array(turns) ** 0.9) * 0.5 + 4
    pt.multi_plot(
        turns,
        [min_lands_acceptable, max_lands_acceptable],
        label_list=["minimum ok", "maximum ok"],
        num_xticks=15,
        num_yticks=13,
        fnum=1,
        marker="o",
    )
def prob_nohave_card_always_mulled(copies=2, hand_size=3):
    """Probability of not holding the card at turn 0 under this mulligan model.

    Combines two cases by total probability:
      * the opening hand misses the card and so does a fresh hand drawn from
        the remaining ``deck_size - hand_size`` cards;
      * the opening hand contains the card, in which case the post-mulligan
        miss probability is taken to be 1.

    ``deck_size`` is read from the enclosing scope.

    TODO: factor in the probability that something must be kept.
    """
    miss_opening = hypergeom(deck_size, copies, hand_size).cdf(0)
    miss_redraw_after_miss = hypergeom(
        deck_size - hand_size, copies, hand_size).cdf(0)
    # Kept as an explicit factor to show the structure of the formula.
    miss_redraw_after_hit = 1

    return (
        miss_redraw_after_miss * miss_opening
        + (1 - miss_opening) * miss_redraw_after_hit
    )
def fisher_exact(table, side="two.sided", zero_correction=True):
    """Fisher exact test on a 2x2 table: odds ratio, 95% interval, p-value.

    Similar to scipy.stats.fisher_exact, but the odds ratio (and its standard
    error) come from ``_odds_ratio``, which can apply the Haldane-Anscombe
    correction (0.5 substituted for zero cells) when ``zero_correction`` is
    true.

    Parameters
    ----------
    table : numpy.ndarray
        2x2 contingency table (indexed as ``table[:, 0]`` and ``table[0]``).
    side : {"two.sided", "greater", "less"}
        Alternative hypothesis.
    zero_correction : bool
        Forwarded to ``_odds_ratio`` for the observed table only.

    Returns
    -------
    (odds_ratio, interval95, p_value)
        ``interval95`` is a length-2 array from the normal approximation on
        the log odds ratio (1.96 standard errors two-sided, 1.645 one-sided).

    Raises
    ------
    ValueError
        If ``side`` is not one of the accepted values.
    """
    if side not in ("greater", "less", "two.sided"):
        raise ValueError(
            "side parameter must be one of 'greater', 'less', or 'two.sided'")

    # Margins of the observed table define the hypergeometric null.
    N = np.sum(table)
    K = np.sum(table[:, 0])
    n = np.sum(table[0])
    odds_ratio, se = _odds_ratio(table, zero_correction=zero_correction)

    # Range of the top-left cell over all tables with the same margins.
    a_min = np.max([0, table[0][0] - table[1][1]])
    a_max = np.min([K, n])

    # The null distribution does not depend on the candidate table, so it is
    # built once instead of once per loop iteration.
    null_dist = hypergeom(N, K, n)
    p_observed = null_dist.pmf(table[0][0])
    # Tables that are exactly as likely as the observed one can differ from
    # it in the last floating-point bits; a small relative tolerance keeps
    # such ties in the two-sided sum (same idea as R's fisher.test).
    p_cutoff = p_observed * (1 + 1e-7)

    p_value = 0.0
    for a in np.arange(a_min, a_max + 1):
        possible_table = np.array([[a, n - a], [K - a, N - n - K + a]])
        p = null_dist.pmf(a)
        if side == "greater":
            if _odds_ratio(possible_table)[0] >= odds_ratio:
                p_value += p
        elif side == "less":
            if _odds_ratio(possible_table)[0] <= odds_ratio:
                p_value += p
        elif side == "two.sided":
            if p <= p_cutoff:
                p_value += p

    log_or = np.log(odds_ratio)
    if side == "greater":
        interval95 = [np.exp(log_or - (1.645 * se)), np.inf]
    elif side == "less":
        interval95 = [0, np.exp(log_or + (1.645 * se))]
    elif side == "two.sided":
        interval95 = [
            np.exp(log_or - (1.96 * se)),
            np.exp(log_or + (1.96 * se)),
        ]

    return odds_ratio, np.array(interval95), p_value
def prob_nohave_card_never_mulled(copies=2, hand_size=3):
    """Probability of missing the card in two consecutive hands.

    Uses a fixed 30-card deck.  Multiplies
      P(opening hand of ``hand_size`` cards contains none of ``copies``)
    by
      P(a second hand drawn from the remaining 30 - hand_size cards also
        contains none),
    i.e. P(miss_turn0) = P(miss_turn0 | initial_miss) * P(initial_miss).

    TODO: add constraints about 2 drops.
    """
    deck_size = 30

    # P(initial_miss)
    initial_miss = hypergeom(deck_size, copies, hand_size).cdf(0)

    # P(miss_turn0 | initial_miss): redraw from the cards left in the deck.
    redraw_miss = hypergeom(deck_size - hand_size, copies, hand_size).cdf(0)

    return redraw_miss * initial_miss
def Exp1(A, m, r, k):
    """
    Expected bucket count under "approximation 1" (usually used when A > 500).

    :param A: number of all patients
    :param m: number of buckets
    :param r: ratio of patients satisfying certain criteria to all patients
    :param k: K in K-anonymity
    :return: the accumulated expectation times m, rounded to 5 decimals

    The size of one bucket (|A1|) is modelled as binom(A, 1/m) and restricted
    to its central interval of mass ``alpha = 1 - 1/(2m)``.  For every bucket
    size ``a`` in that interval the conditional term is weighted by
    P(|A1| = a) and summed.  The conditional term comes from the external
    helper ``P`` when a > k.
    """
    B = int(A * r)  # number of patients satisfying certain criteria
    alpha = 1 - 1 / (2 * m)

    rv_a = binom(A, 1 / m)
    lb_a, ub_a = rv_a.interval(alpha)
    lb_b, ub_b = hypergeom(A, int(lb_a), B).interval(alpha)

    # Decided once from the interval lower bounds; selects which variant of
    # the per-bucket term is used for the whole loop.
    zero_lower_bound = lb_b == 0 or lb_a == 0

    expectation = Decimal(0)
    for a in range(int(lb_a), int(ub_a) + 1):
        weight = Decimal(rv_a.pmf(a))
        if a > k:
            rv_b = hypergeom(A, a, B)
            if zero_lower_bound:
                lb_b, ub_b = rv_b.interval(alpha)
                # Rule out the case that there is no collision
                lb_b = max(1, lb_b)
            else:
                # Interval for B1 with probability greater than 0.99995
                lb_b, ub_b = rv_b.interval(0.99995)
            # P(|e| <= k | |A1|), weighted by P(|A1| = a)
            p = P(lb_b, ub_b, k, rv_b, a)
            expectation = expectation + p * weight
        elif zero_lower_bound:
            rv_b = hypergeom(A, a, B)
            expectation = expectation + weight * (1 - Decimal(rv_b.pmf(0)))
        else:
            expectation = expectation + weight

    return round(expectation * m, 5)
def add_counting_bound_constraints_1(self):
    """Adds counting bound, for a given number of cliques zonked.

    FIXME: this is half-baked.

    NOTE(review): the loop variable is ``j`` but the body reads
    ``num_cliques``, which is not bound anywhere in this method.  Unless a
    module-level ``num_cliques`` exists this raises NameError on the first
    iteration -- confirm the intended variable before relying on it.
    """
    # loop through the number of cliques left over
    for j in range(self.max_cliques_remaining + 1):
        # bounds on number of cliques containing edge e
        # (these won't actually be zeroed)
        fewest_zeroed = max(0, num_cliques - self.max_cliques_remaining)
        most_zeroed = min(num_cliques, self.max_cliques_zeroed)

        # probability of some number of cliques containing edge e:
        # population = possible cliques, successes = cliques present,
        # draws = cliques which could intersect edge e
        hit_dist = hypergeom(self.max_cliques, num_cliques, most_zeroed)

        # z is the number of cliques which _do_ intersect edge e
        terms = [
            ((z, num_cliques - z), hit_dist.pmf(z))
            for z in range(fewest_zeroed, most_zeroed + 1)
        ]

        # the bound is half the number of functions
        bound = (comb(self.max_cliques, num_cliques, exact=True) - 1) / 2
        self.add_constraint(terms, bound)
def add_total_cliques_counting_bound_constraints(self):
    """Adds counting bound, based on total number of cliques.

    For every "level" (total number of cliques found, from 0 to
    ``self.max_cliques``) one constraint is registered through
    ``self.add_constraint(A, b)``: ``A`` pairs each variable key
    ``(z, num_cliques - z)`` with the hypergeometric probability of ``z``
    and ``b`` is the counting bound for that level.
    """
    for num_cliques in range(self.max_cliques + 1):
        # bounds on number of cliques containing edge e
        # (these won't actually be zeroed)
        fewest_zeroed = max(0, num_cliques - self.max_cliques_remaining)
        most_zeroed = min(num_cliques, self.max_cliques_zeroed)

        # probability of some number of cliques containing edge e:
        # population = possible cliques, successes = cliques present,
        # draws = cliques which could intersect edge e
        hit_dist = hypergeom(self.max_cliques, num_cliques, most_zeroed)

        # z is the number of cliques which _do_ intersect edge e
        terms = [
            ((z, num_cliques - z), hit_dist.pmf(z))
            for z in range(fewest_zeroed, most_zeroed + 1)
        ]

        # the bound is half the number of functions
        bound = (comb(self.max_cliques, num_cliques, exact=True) - 1) / 2
        self.add_constraint(terms, bound)
def test_discrete_induced_sampling(self):
    """Run help_discrete_induced_sampling on four pairs of discrete variables.

    Cases, in order: two custom float_rv_discrete variables (30), a
    hypergeometric pair (300), a binomial pair (300) and a uniform discrete
    pair (30); the number is the third argument given to the helper.
    """
    # Case 1: uniform masses on geometrically spaced locations, paired with
    # geometrically growing masses on 0..9.
    count_a = 10
    locations_a = np.geomspace(1.0, 512.0, num=count_a)
    weights_a = np.ones(count_a, dtype=float) / count_a
    var1 = float_rv_discrete(
        name='float_rv_discrete', values=(locations_a, weights_a))()

    count_b = 10
    locations_b = np.arange(0, count_b)
    # if increase from 16 unmodififed becomes ill conditioned
    weights_b = np.geomspace(1.0, 16.0, num=count_b)
    weights_b /= weights_b.sum()
    var2 = float_rv_discrete(
        name='float_rv_discrete', values=(locations_b, weights_b))()
    self.help_discrete_induced_sampling(var1, var2, 30)

    # Case 2: the same hypergeometric variable in both slots.
    num_type1, num_type2, num_trials = [10, 10, 9]
    hyper_var = stats.hypergeom(num_type1 + num_type2, num_type1, num_trials)
    self.help_discrete_induced_sampling(hyper_var, hyper_var, 300)

    # Case 3: the same binomial variable in both slots.
    binom_var = stats.binom(10, 0.5)
    self.help_discrete_induced_sampling(binom_var, binom_var, 300)

    # Case 4: uniform masses on 0..N-1.
    N = 10
    xk = np.arange(N)
    pk = np.ones(N) / N
    cheb_var = float_rv_discrete(name='discrete_chebyshev', values=(xk, pk))()
    self.help_discrete_induced_sampling(cheb_var, cheb_var, 30)
def _calc_score( fore_hit_size, fore_size, back_hit_size, back_size, prob_fn=None, ): if prob_fn is None: prob_fn = 'hypergeom' assert prob_fn in ['hypergeom', 'binom'] if back_hit_size <= 0: return 0 k = fore_hit_size n = fore_size K = back_hit_size N = back_size p = K / N if prob_fn == 'hypergeom': binomial = stats.hypergeom(N, K, n) else: binomial = stats.binom(n, p) pr_gt_k = binomial.sf(k - 1) pr_lt_k = binomial.cdf(k) if pr_lt_k <= 0: return -200 elif pr_gt_k <= 0: return 200 else: return -np.log10(pr_gt_k / pr_lt_k)
def get_enrichment_score(self, query_id_set_n, M, overlap_n):
    """Hypergeometric enrichment p-value of an overlap with this set.

    Parameters
    ----------
    query_id_set_n : int
        Size of the query set (number of draws).
    M : int
        Population size.
    overlap_n : int
        Observed overlap between the query set and this set (``self.n``
        members).

    Returns
    -------
    float
        P(X >= overlap_n) for X ~ hypergeom(M, self.n, query_id_set_n).

    The parameters and the p-value are also printed on one line.
    """
    # sf(k) is P(X > k); the p-value must include the observed overlap
    # itself, hence k - 1.
    pv = hypergeom(M, self.n, query_id_set_n).sf(overlap_n - 1)
    # Single-argument call form prints identically on Python 2 and 3.
    print("m=" + str(M) + " n=" + str(self.n) + " q=" + str(query_id_set_n)
          + " k=" + str(overlap_n) + " pv=" + str(pv))
    return pv
def run(domain_name='X', projection_name='Y8'):
    """Plot cluster projections and compare cluster pairs with tissue pairs.

    Loads 'prob2.mat', scatter-plots the samples of ``domain_name`` (first
    two columns) and of ``projection_name`` coloured by cluster, saves the
    figure to 'figs/cluster_projectsions.tiff', prints pair-count
    statistics, and returns a frozen hypergeometric distribution built from
    those counts.

    NOTE(review): ``domain_clusters`` and ``dom`` are not defined inside this
    function (the line that would build ``domain_clusters`` is commented
    out); they must come from module scope or this raises NameError.
    NOTE(review): ``hypergeom`` is called as (tissue pairs, cluster pairs,
    max pairs); scipy's order is (population, successes, draws) -- confirm
    this is the intended parameterisation.
    """
    prob2 = sio.loadmat('prob2.mat')
    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']
    domains = [prob2.get(d) for d in domain_names]
    #domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')
    clusters = domain_clusters[domain_names.index(domain_name)]

    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[domain_names.index(domain_name)]

    f = plt.figure(1)
    f.clear()

    random.seed(1)
    ct = array(mc.getct(218))
    sstrings = ['21{0:d}'.format(i + 1) for i in range(4)]

    inds = arange(shape(dom)[1])
    # Cluster ids are shifted by one to become zero-based indices.
    c_inds = array(clusters).flatten() - 1
    tc_inds = tissue_clusters.flatten() - 1
    colors = ct[c_inds, :]

    ax = f.add_subplot(
        sstrings[0],
        title='Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=colors)

    ax = f.add_subplot(
        sstrings[1],
        title='Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=colors)

    # Unordered sample pairs (ix < iy) that share a cluster / tissue label.
    cpairs = set([
        '{0:d}x{1:d}'.format(ix, iy)
        for ix, x in enumerate(c_inds)
        for iy, y in enumerate(c_inds)
        if ix < iy and x == y
    ])
    tcpairs = set([
        '{0:d}x{1:d}'.format(ix, iy)
        for ix, x in enumerate(tc_inds)
        for iy, y in enumerate(tc_inds)
        if ix < iy and x == y
    ])

    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    max_pairs = (len(tc_inds) * len(tc_inds) - len(tc_inds)) / 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    # Single-argument print calls behave the same on Python 2 and 3.
    print('using affinity propagation with affinites over domain {0}'.format(
        domain_name))
    print('found')
    print('  max pairs:     {0}'.format(max_pairs))
    print('  total pairs:   {0}'.format(total_pairs))
    print('  tissue pairs:  {0}'.format(len(tcpairs)))
    print('  cluster pairs: {0}'.format(len(cpairs)))
    print('  shared pairs:  {0}'.format(shared_pairs))

    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
def test_get_univariate_leja_rule_bounded_discrete(self):
    """Leja quadrature must reproduce a moment of bounded discrete variables.

    For a uniform discrete, a binomial and a hypergeometric variable, the
    level-3 rule is mapped through the (loc, scale) returned by
    transform_scale_parameters, and the moment of order ``len(x) - 1``
    computed with the last weight vector is compared with the moment taken
    directly from the probability masses.
    """
    growth_rule = partial(constant_increment_growth_rule, 2)
    level = 3

    nmasses = 20
    xk = np.array(range(0, nmasses), dtype='float')
    pk = np.ones(nmasses) / nmasses
    var_cheb = float_rv_discrete(
        name='discrete_chebyshev', values=(xk, pk))()

    variables = [
        var_cheb,
        stats.binom(17, 0.5),
        stats.hypergeom(10 + 10, 10, 9),
    ]
    for variable in variables:
        quad_rule = get_univariate_leja_quadrature_rule(variable, growth_rule)
        x, w = quad_rule(level)

        # Map the rule's nodes with the variable's location and scale.
        loc, scale = transform_scale_parameters(variable)
        x = x * scale + loc

        xk, pk = get_probability_masses(variable)
        print(x, xk, loc, scale)

        degree = x.shape[0] - 1
        true_moment = (xk ** degree).dot(pk)
        moment = (x ** degree).dot(w[-1])
        print(moment, true_moment, variable.dist.name)
        assert np.allclose(moment, true_moment)
def hypergeom_p_values(data, selected, callback=None):
    """
    Per-feature hypergeometric p-values for a selection against all data.

    Both inputs are binarised (value > 0); all other magnitudes are ignored.

    :param data: all examples in rows, their features in columns
    :type data: numpy.array
    :param selected: selected examples in rows, their features in columns
    :type selected: numpy.array
    :param callback: optional callable, invoked with ``100 * i / num_features``
        on every 250th feature index (starting with 0)
    :return: list with one p-value per feature
    :raises ValueError: if the column counts differ
    """
    if data.shape[1] != selected.shape[1]:
        raise ValueError("Number of columns does not match.")

    # clip values to binary variables
    data = data > 0
    selected = selected > 0

    num_features = selected.shape[1]
    population_size = data.shape[0]  # number of all data examples
    sample_size = selected.shape[0]  # number of selected examples
    population_hits = np.sum(data, axis=0)  # occurrences in all data
    sample_hits = np.sum(selected, axis=0)  # occurrences in selected data

    report_every = 250
    p_vals = []
    for index, (in_population, in_sample) in enumerate(
            zip(population_hits, sample_hits)):
        dist = stats.hypergeom(population_size, in_population, sample_size)
        # p-value = P(X >= observed) = 1 - cdf(observed - 1) = sf(observed - 1)
        p_vals.append(dist.sf(in_sample - 1))
        if callback and index % report_every == 0:
            callback(100 * index / num_features)
    return p_vals
def hyper(N, M, n, m):
    '''
    Upper-tail hypergeometric p-value.

    Builds hypergeom(N, M, n) and adds up the pmf over the integers from m
    up to min(n, M), i.e. the chance of identifying >= m successes.  Returns
    0 when that range is empty.
    '''
    dist = hypergeom(N, M, n)
    upper_range = np.arange(m, min(n + 1, M + 1))
    return sum(dist.pmf(value) for value in upper_range)
def pvalue(N, M, n, m):
    """Upper-tail hypergeometric p-value, scalar or element-wise.

    When every argument has length 1 (as reported by ``length``) the result
    is the sum of the hypergeom(N, M, n) pmf from m to min(M, n); an m larger
    than M or n is first clamped to min(M, n).

    When any argument is longer, length-1 arguments are repeated to the
    longest length and a list of per-position results is returned.

    Raises
    ------
    ValueError
        If an argument's length is neither 1 nor the longest length.
    """
    N, M, n, m = deepcopy(N), deepcopy(M), deepcopy(n), deepcopy(m)
    maxlen = max([length(N), length(M), length(n), length(m)])

    if not maxlen > 1:
        hg = sps.hypergeom(N, M, n)
        if m > M or m > n:
            m = min(M, n)
        return sum(hg.pmf(np.arange(m, min(M + 1, n + 1))))

    def _stretch(value):
        # Repeat a length-1 argument; reject any other mismatching length.
        if length(value) == 1:
            return [value for _ in range(maxlen)]
        if length(value) != maxlen:
            raise ValueError('Inequally long vectors have been provided to this function')
        return value

    N = _stretch(N)
    M = _stretch(M)
    n = _stretch(n)
    m = _stretch(m)
    return [pvalue(N[pos], M[pos], n[pos], m[pos]) for pos in range(maxlen)]
def compute_clusters_ps(predicted_clusters, goa_clusters):
    """Find, for every predicted cluster, its most enriched GOA cluster.

    Clusters with fewer than 3 members are dropped from both mappings.  The
    population size is the sum of the remaining GOA cluster sizes.  For each
    (predicted, GOA) pair the p-value is P(X >= shared members) under
    hypergeom(population, |GOA cluster|, |predicted cluster|).

    Returns
    -------
    dict
        predicted cluster key -> (GOA cluster key with the smallest p-value,
        that p-value); ``(None, inf)`` when there is no GOA cluster.
    """
    predicted_clusters = {
        key: members for key, members in predicted_clusters.items()
        if len(members) >= 3
    }
    goa_clusters = {
        key: members for key, members in goa_clusters.items()
        if len(members) >= 3
    }
    n_total_proteins = sum(len(members) for members in goa_clusters.values())

    top_p_values = {}
    for predict_cluster, predict_proteins in tqdm(predicted_clusters.items()):
        n_predicted = len(predict_proteins)
        best_cluster, best_p = None, float('inf')
        for goa_cluster, goa_proteins in goa_clusters.items():
            n_shared = len(goa_proteins.intersection(predict_proteins))
            dist = ss.hypergeom(
                n_total_proteins, len(goa_proteins), n_predicted)
            candidate_p = dist.sf(n_shared - 1)
            # Strict comparison: the first cluster reaching the minimum wins.
            if candidate_p < best_p:
                best_cluster, best_p = goa_cluster, candidate_p
        top_p_values[predict_cluster] = (best_cluster, best_p)
    return top_p_values
def add_total_cliques_equality_constraints(self):
    """Adds constraints for a given total number of cliques.

    For 0 <= m <= N, these define a variable '(total_cliques, m)', which is
    E[ number of gates need to find m cliques ], or "the expected number of
    gates needed at 'level m'".  Each constraint equates that variable with
    the hypergeometric-weighted average of the ``(z, m - z)`` variables.
    FIXME describe this.
    """
    for num_cliques in range(self.max_cliques + 1):
        # bounds on number of cliques containing edge e
        # (these won't actually be zeroed)
        fewest_zeroed = max(0, num_cliques - self.max_cliques_remaining)
        most_zeroed = min(num_cliques, self.max_cliques_zeroed)

        # probability of some number of cliques containing edge e:
        # population = possible cliques, successes = cliques present,
        # draws = cliques which could intersect edge e
        hit_dist = hypergeom(self.max_cliques, num_cliques, most_zeroed)

        # z is the number of cliques which _do_ intersect edge e
        weighted_terms = [
            ((z, num_cliques - z), hit_dist.pmf(z))
            for z in range(fewest_zeroed, most_zeroed + 1)
        ]

        # Weighted average minus the level variable must equal zero; the
        # trailing True is passed through to add_constraint unchanged.
        level_term = (('total_cliques', num_cliques), -1.0)
        self.add_constraint(weighted_terms + [level_term], 0, True)
def get_enriched(all_genes, selection, name, method, cutoff, print_all):
    """Get enrichment for pfam domains.

    Counts every item in ``all_genes`` and in ``selection``, runs an
    upper-tail hypergeometric test per item, corrects the p-values with
    ``multipletests`` and returns a DataFrame sorted by corrected p-value.

    ``method`` names the correction ("bh" is translated to "fdr_bh"); the
    corrected values are stored in a column of that name.  Rows are flagged
    ``significant`` when the corrected value is <= ``cutoff``; unless
    ``print_all`` is true only those rows are kept.  A ``description`` column
    is filled via ``get_pfam_desc``.
    """
    background_counts = pd.Series(Counter(all_genes))
    selection_counts = pd.Series(Counter(selection))
    df = pd.DataFrame({
        "all": background_counts,
        name: selection_counts,
    }).fillna(0)

    # Hypergeometric test: population = all counts, draws = selected counts.
    M = df["all"].sum()
    N = df[name].sum()

    def _upper_tail(row):
        return hypergeom(M, row["all"], N).sf(row[name] - 1)

    df["p_value"] = df.apply(_upper_tail, axis=1)

    # Multiple test correction
    corr = method
    if method == "bh":
        corr = "fdr_bh"
    df[corr] = multipletests(df["p_value"], method=corr)[1]
    df = df.sort_values(corr)
    df["significant"] = df[corr] <= cutoff

    # Add pfam domain and description columns
    df = df.reset_index().rename(columns={"index": "pfam_domain"})
    if not print_all:
        df = df.loc[df["significant"]]
    df.insert(1, "description", df["pfam_domain"].map(get_pfam_desc))
    return df
def find_hypergeometric(genes, pred_no_training, M=10683):
    """Fold enrichment and p-value of the overlap between two gene lists.

    Parameters
    ----------
    genes : iterable
        Reference gene list (its length is the number of draws).
    pred_no_training : iterable
        Predicted gene list (its length is the number of successes in the
        population).
    M : int, optional
        Population (background) size.  Defaults to 10683, the value that was
        previously hard-coded.

    Returns
    -------
    list of float
        Observed overlap divided by each most-probable overlap count (the
        mode(s) of the null pmf).  When a mode is 0 the division yields
        inf/nan as produced by numpy.

    The fold enrichment and the upper-tail p-value P(X >= overlap) are also
    printed.
    """
    overlap = list(set(genes) & set(pred_no_training))
    N = len(genes)
    n = len(pred_no_training)
    x = len(overlap)

    pval = hypergeom.sf(x - 1, M, n, N)

    # Locate the mode(s) of the null distribution over 0..n.
    rv = hypergeom(M, n, N)
    support = np.arange(0, n + 1)
    prob = rv.pmf(support)
    modes = np.where(prob == np.max(prob))[0]

    fold = (x / modes).tolist()
    print('Fold Enrichment', fold)
    print('hypergeometric p-value', pval)
    return fold
def hypergeometric_test(X, cluster, treshold):
    # type: (np.ndarray, np.ndarray, float) -> np.ndarray
    """Signed hypergeometric enrichment score for every column of X.

    X is binarised with ``X >= treshold``.  For each column g, with
    M = number of rows, n = number of rows where g is on and
    N = ``len(cluster)``, both tails of hypergeom(M, n, N) are evaluated at
    x = number of rows indexed by ``cluster`` where g is on:
    P(X >= x) and P(X <= x).  The score is ``-log(min tail)``, negated when
    the lower tail is the smaller one.
    """
    binary = (X >= treshold).astype(int)
    n_cells = X.shape[0]  # number of cells
    n_in_cluster = len(cluster)  # number of cells belonging to cluster(s)

    scores = np.zeros((X.shape[1],))
    for column, expressed in enumerate(binary.T):
        n_expressing = expressed.sum()  # cells expressing this gene
        dist = hypergeom(n_cells, n_expressing, n_in_cluster)
        observed = expressed[cluster].sum()

        # Over-expression tail: observed or more.
        p_over = dist.pmf(np.arange(observed, n_expressing + 1)).sum()
        # Under-expression tail: observed or less.
        p_under = dist.pmf(np.arange(0, observed + 1)).sum()

        if p_under < p_over:
            sign = -1
        else:
            sign = 1
        scores[column] = -np.log(min(p_under, p_over)) * sign
    return scores
def test_get_univariate_leja_rule_bounded_discrete(self):
    """Leja quadrature must reproduce a moment of bounded discrete variables.

    For a uniform discrete, a binomial and a hypergeometric variable, the
    moment of order ``len(x) - 1`` computed from the level-3 rule (last
    weight vector) is compared with the moment taken directly from the
    variable's probability masses.
    """
    from scipy import stats
    from pyapprox.variables import get_probability_masses

    growth_rule = partial(constant_increment_growth_rule, 2)
    level = 3

    nmasses = 20
    xk = np.array(range(0, nmasses), dtype='float')
    pk = np.ones(nmasses) / nmasses
    var_cheb = float_rv_discrete(
        name='discrete_chebyshev', values=(xk, pk))()

    variables = [
        var_cheb,
        stats.binom(20, 0.5),
        stats.hypergeom(10 + 10, 10, 9),
    ]
    for variable in variables:
        quad_rule = get_univariate_leja_quadrature_rule(variable, growth_rule)
        # polys of binom, hypergeometric have no canonical domain [-1,1]
        x, w = quad_rule(level)

        xk, pk = get_probability_masses(variable)
        degree = x.shape[0] - 1
        true_moment = (xk ** degree).dot(pk)
        moment = (x ** degree).dot(w[-1])
        assert np.allclose(moment, true_moment)
def family_hg(cluster_p_val_dict, mol_families, p_thresh=0.01):
    """Rank molecular families by enrichment in significant clusters.

    ``cluster_p_val_dict`` maps a cluster id to a dict with a 'pval' entry.
    For every family, only its clusters present in that mapping are counted
    (``n_clu``), along with how many of them have pval <= ``p_thresh``
    (``local_n_sig``).  The family score is P(X >= local_n_sig) under
    hypergeom(number of clusters in the mapping, number of significant
    clusters in the mapping, n_clu).

    Returns a list of ``(family, score, n_clu, local_n_sig)`` tuples sorted
    by ascending score.
    """
    from scipy.stats import hypergeom
    import numpy as np

    def _is_significant(cluster_id):
        return cluster_p_val_dict[cluster_id]['pval'] <= p_thresh

    # Per-family counts of known clusters and of significant ones.
    family_counts = []
    for family in mol_families:
        known_ids = [
            cluster.cluster_id for cluster in family.clusters
            if cluster.cluster_id in cluster_p_val_dict
        ]
        significant_here = sum(1 for cid in known_ids if _is_significant(cid))
        family_counts.append((family, len(known_ids), significant_here))

    population = len(cluster_p_val_dict)
    significant_total = sum(
        1 for cid in cluster_p_val_dict if _is_significant(cid))

    ranked = []
    for family, n_clu, local_n_sig in family_counts:
        dist = hypergeom(population, significant_total, n_clu)
        tail = dist.pmf(np.arange(local_n_sig, n_clu + 1)).sum()
        ranked.append((family, tail, n_clu, local_n_sig))

    ranked.sort(key=lambda entry: entry[1])
    return ranked
def generate_scores(_ids, _scores, _spectra, _kernel, _params):
    """Hypergeometric match score for every (spectrum, kernel) pairing.

    Parameters
    ----------
    _ids : mapping
        spectrum key -> iterable of kernel indices matched to it (empty
        entries are skipped).
    _scores : mapping
        spectrum key -> number of matched ions.
    _spectra : mapping
        spectrum key -> dict whose 'sms' entry's length sets the draw count.
    _kernel : mapping
        kernel index -> dict with 'seq' (sequence) and 'pm' (mass; divided by
        1000 before use).
    _params : mapping
        must contain 'fragment mass tolerance'.

    Returns
    -------
    dict
        (spectrum key, kernel index) -> ``-100 * log10(pmf(score))`` where
        the pmf is hypergeom(cells, total_ions, draws).
    """
    res = _params['fragment mass tolerance']
    # Cap on the number of ions considered; doubled for tolerances above 100.
    sfactor = 20
    sadjust = 1
    if res > 100:
        sfactor = 40

    sd = {}
    for j in _ids:
        if not _ids[j]:
            continue
        matched = _scores[j]
        for i in _ids[j]:
            kern = _kernel[i]

            # Population size: mass-derived cell count, capped at 1500.
            pmass = int(kern['pm'] / 1000)
            cells = min(int(pmass - 200), 1500)

            # Successes in the population: two ions per bond, capped at
            # sfactor but kept above the observed score.
            total_ions = min(2 * (len(kern['seq']) - 1), sfactor)
            if total_ions < matched:
                total_ions = matched + 1

            # Draw count.  Floor division keeps it an integer on Python 3;
            # a fractional count is not a valid hypergeom parameter and
            # turns the pmf into NaN on SciPy versions that check this.
            sc = len(_spectra[j]['sms']) // 3
            if matched >= sc:
                sc = matched + 2

            p = hypergeom(cells, total_ions, sc).pmf(matched)
            sd[(j, i)] = -100.0 * math.log10(p) * sadjust
    return sd
def _calc_score( fore_hit_size, fore_size, back_hit_size, back_size, prob_fn=None, ): if prob_fn is None: prob_fn = "hypergeom" assert prob_fn in ["hypergeom", "binom"] if back_hit_size <= 0: return 0 k = fore_hit_size n = fore_size K = back_hit_size N = back_size p = K / N if prob_fn == "hypergeom": binomial = stats.hypergeom(N, K, n) else: binomial = stats.binom(n, p) pr_gt_k = binomial.sf(k - 1) pr_lt_k = binomial.cdf(k) if pr_lt_k <= 0: return -200 elif pr_gt_k <= 0: return 200 else: return -np.log10(pr_gt_k / pr_lt_k)
def chug_count_distribution(cls, player_count):
    """Exact distribution of the count among 13 draws.

    The population has ``13 * player_count`` items of which ``player_count``
    are successes; 13 are drawn without replacement.

    Returns
    -------
    (callable, str)
        The frozen distribution's ``pmf`` method and a label of the form
        "HyperGeometric(N, K, n)".
    """
    draws = 13
    successes = player_count
    population = 13 * player_count
    frozen = hypergeom(population, successes, draws)
    label = f"HyperGeometric({population}, {successes}, {draws})"
    return frozen.pmf, label
def __init__(self, M, n, N, interval_shift=0, order=2, type='aleatory',
             name='', number=0):
    """Set up a hypergeometric variable and its derived quantities.

    Parameters
    ----------
    M, n, N : int
        Distribution parameters.  The scipy distribution is built with
        population ``M + n``, ``n`` successes and ``N`` draws.
    interval_shift : number
        Passed to scipy as ``loc``.
    order : int
        Order handed to ``recursive_var_basis``.
    type : str
        Uncertainty type name, resolved with ``UncertaintyType.from_name``.
    name : str
        Display name; defaults to ``x<number>`` when empty.
    number : int
        Index used to build the symbol name ``x<number>``.

    Raises
    ------
    VariableInputError
        If ``M < 0``, ``n < 0``, ``N < 1`` or ``N > M + n``.
    """
    if M < 0:
        raise VariableInputError(
            'HypergeometricVariable M must be greater or equal to 0.')

    if n < 0:
        raise VariableInputError(
            'HypergeometricVariable n must be greater or equal to 0.')

    if (N < 1) or (N > (M + n)):
        # The message previously named M although the check is on N.
        raise VariableInputError(
            'HypergeometricVariable N must be at least 1 and at most M+n.'
        )

    self.interval_shift = interval_shift
    self.order = order
    self.M = M
    self.n = n
    self.N = N

    self.type = UncertaintyType.from_name(type)
    self.name = f'x{number}' if name == '' else name
    self.var_str = f'x{number}'
    self.x = symbols(self.var_str)
    self.distribution = Distribution.HYPERGEOMETRIC

    self.dist = hypergeom(M=self.M + self.n, n=self.n, N=self.N,
                          loc=self.interval_shift)

    # The calls below populate and consume self.x_values and
    # self.probabilities; their order is kept as is.
    self.find_high_lim()
    self.get_probability_density_func()
    self.check_num_string()
    self.recursive_var_basis(self.x_values, self.probabilities, self.order)
    self.create_norm_sq(self.x_values, self.probabilities)

    self.low_approx = np.min(self.x_values)
    self.high_approx = np.max(self.x_values)
    self.std_bounds = (self.low_approx, self.high_approx)
    self.check_bounds()

    if self.type == UncertaintyType.EPISTEMIC:
        warn('The HypergeometricVariable is usually not epistemic. For an '
             'epistemic variable, consider using the continuous uniform '
             'distribution with type epistemic.')

    # NOTE(review): this only binds a local name; if the intent was to
    # install a custom warning formatter it has no effect here -- confirm.
    showwarning = _warn
def calc_pvalue(gene_list, gene_set, M):
    """Hypergeometric enrichment p-value for the overlap of two gene sets.

    Parameters
    ----------
    gene_list : iterable
        Query genes (duplicates are ignored); its size is the number of
        draws.
    gene_set : iterable
        Reference gene set (duplicates are ignored); its size is the number
        of successes in the population.
    M : int
        Population size.

    Returns
    -------
    (float, list)
        P(X >= k) for X ~ hypergeom(M, len(gene_set), len(gene_list)) with
        k the overlap size, and the overlapping genes as a list.
    """
    gene_list = set(gene_list)
    gene_set = set(gene_set)
    overlap = gene_list & gene_set
    k = len(overlap)
    # sf(k) is P(X > k), which would exclude the observed overlap and give
    # p = 0 for a complete overlap; the p-value is P(X >= k) = sf(k - 1).
    p_value = hypergeom(M, len(gene_set), len(gene_list)).sf(k - 1)
    return p_value, list(overlap)
def test_hypergeometric(self):
    """Compare hypergeometric(N, K) with scipy's pmf table.

    The reference has one row per draw count 0..N and one column per hit
    count 0..K; the largest absolute difference must be below 1e-10.
    """
    population, successes = 20, 3
    computed = hypergeometric(population, successes)

    hit_counts = range(0, successes + 1)
    reference = np.array([
        hypergeom(population, successes, draws).pmf(hit_counts)
        for draws in range(population + 1)
    ])

    worst_error = np.abs(computed - reference).max()
    self.assertTrue(worst_error < 1e-10)
def calc_enrichment_score(n, o, M, N):
    """Upper-tail hypergeometric p-value for an attribute among active strains.

    n -- number of screened strains with the attribute
    o -- number of active strains with the attribute
    M -- number of strains screened
    N -- number of active strains

    Returns P(X >= o) for X ~ hypergeom(M, n, N).
    """
    # sf(o - 1) = P(X > o - 1) = P(X >= o)
    return hypergeom(M, n, N).sf(o - 1)
def variance_function(theta):
    """Second moment of given_x_function under a hypergeometric count.

    With ``draws = round(theta * M)``, sums
    ``pmf(x) * given_x_function(x, theta) ** 2`` for x from
    ``max(0, draws - N)`` up to ``min(P, draws)`` under
    hypergeom(M=M, n=P, N=draws).  ``M``, ``P``, ``N`` and
    ``given_x_function`` come from the enclosing scope.
    """
    draws = round(theta * M)
    TP_rv = hypergeom(M=M, n=P, N=round(theta * M))

    first = int(max(0, draws - N))
    stop = int(min((P + 1, draws + 1)))
    return sum(
        TP_rv.pmf(x) * (given_x_function(x, theta) ** 2)
        for x in range(first, stop)
    )
def calc_pvalue(query_id_set, reference_id_set, M):
    """Hypergeometric enrichment p-value for the overlap of two id sets.

    Parameters
    ----------
    query_id_set : iterable
        Query ids (duplicates are ignored); its size is the number of draws.
    reference_id_set : iterable
        Reference ids (duplicates are ignored); its size is the number of
        successes in the population.
    M : int
        Population size.

    Returns
    -------
    (float, list)
        P(X >= k) for X ~ hypergeom(M, len(reference), len(query)) with k the
        overlap size, and the overlapping ids as a list.
    """
    query_id_set = set(query_id_set)
    reference_id_set = set(reference_id_set)
    overlap = query_id_set & reference_id_set
    k = len(overlap)
    # sf(k) is P(X > k), which would exclude the observed overlap and give
    # p = 0 for a complete overlap; the p-value is P(X >= k) = sf(k - 1).
    p_value = hypergeom(M, len(reference_id_set), len(query_id_set)).sf(k - 1)
    return p_value, list(overlap)
def hypergeometric_cdf(self, N, K, n, k):
    """
    Upper-tail probability P(X > k) of a hypergeometric count.

    Despite the name this is the complement of the cdf, as before.

    N= total number of genes in population
    K= number of GOA
    n= select a sample (top 50, bottom half, etc.)
    k= number of successes in the sample
    """
    # sf(k) equals 1 - cdf(k) but does not cancel to 0 (or rounding noise)
    # when the tail is far below machine epsilon.
    return hypergeom(N, K, n).sf(k)
def get_probability_density_func(self):
    """
    Store the pmf of the variable's ``x_values`` in ``self.probabilities``.

    The distribution is hypergeom with population ``self.M + self.n``,
    ``self.n`` successes and ``self.N`` draws (no location shift is applied
    here).
    """
    population = self.M + self.n
    frozen = hypergeom(M=population, n=self.n, N=self.N)
    self.probabilities = frozen.pmf(self.x_values)
def test_prob(num_hits, pop_size, num_draws, num_matching=1): """Perform a hypergeometric test to see the probability of drawing the same entity num_hits many times. Need to check math """ dist = hypergeom(pop_size, num_matching, num_draws) pval = dist.sf(num_hits - 1) return pval
def run(domain_name='X', projection_name='Y8'):
    """Plot cluster projections and compare cluster pairs with tissue pairs.

    Loads 'prob2.mat', scatter-plots the samples of ``domain_name`` (first
    two columns) and of ``projection_name`` coloured by cluster, saves the
    figure to 'figs/cluster_projectsions.tiff', prints pair-count
    statistics, and returns a frozen hypergeometric distribution built from
    those counts.

    NOTE(review): ``domain_clusters`` and ``dom`` are not defined inside this
    function (the line that would build ``domain_clusters`` is commented
    out); they must come from module scope or this raises NameError.
    NOTE(review): ``hypergeom`` is called as (tissue pairs, cluster pairs,
    max pairs); scipy's order is (population, successes, draws) -- confirm
    this is the intended parameterisation.
    """
    prob2 = sio.loadmat('prob2.mat')
    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']
    domains = [prob2.get(d) for d in domain_names]
    #domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')
    clusters = domain_clusters[domain_names.index(domain_name)]

    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[domain_names.index(domain_name)]

    f = plt.figure(1)
    f.clear()

    random.seed(1)
    ct = array(mc.getct(218))
    sstrings = ['21{0:d}'.format(i + 1) for i in range(4)]

    inds = arange(shape(dom)[1])
    # Cluster ids are shifted by one to become zero-based indices.
    c_inds = array(clusters).flatten() - 1
    tc_inds = tissue_clusters.flatten() - 1
    colors = ct[c_inds, :]

    ax = f.add_subplot(
        sstrings[0],
        title='Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=colors)

    ax = f.add_subplot(
        sstrings[1],
        title='Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=colors)

    # Unordered sample pairs (ix < iy) that share a cluster / tissue label.
    cpairs = set([
        '{0:d}x{1:d}'.format(ix, iy)
        for ix, x in enumerate(c_inds)
        for iy, y in enumerate(c_inds)
        if ix < iy and x == y
    ])
    tcpairs = set([
        '{0:d}x{1:d}'.format(ix, iy)
        for ix, x in enumerate(tc_inds)
        for iy, y in enumerate(tc_inds)
        if ix < iy and x == y
    ])

    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    max_pairs = (len(tc_inds) * len(tc_inds) - len(tc_inds)) / 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    # Single-argument print calls behave the same on Python 2 and 3.
    print('using affinity propagation with affinites over domain {0}'.format(domain_name))
    print('found')
    print('  max pairs:     {0}'.format(max_pairs))
    print('  total pairs:   {0}'.format(total_pairs))
    print('  tissue pairs:  {0}'.format(len(tcpairs)))
    print('  cluster pairs: {0}'.format(len(cpairs)))
    print('  shared pairs:  {0}'.format(shared_pairs))

    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
def p_value(num_genes, num_genes_int_top_list, num_top_genes, total_genes=4000):
    """Upper-tail hypergeometric p-value of a gene set's hits in a top list.

    Returns P(X >= num_genes_int_top_list) for
    X ~ hypergeom(total_genes, num_top_genes, num_genes).  If ``sf`` yields
    NaN (seen with old scipy versions) the tail is summed from the pmf
    instead.
    """
    dist = hypergeom(total_genes, num_top_genes, num_genes)
    tail = dist.sf(num_genes_int_top_list - 1)
    if not isnan(tail):
        return tail
    # Fallback: explicit sum over num_genes_int_top_list..num_genes.
    return dist.pmf(range(num_genes_int_top_list, num_genes + 1)).sum()
def hyper_test2(X, K, n, N):
    """
    Hypergeometric test for underexpression.

    Returns P(x <= X) for the distribution ``hypergeom(N, n, K)``, i.e. the
    probability of observing X or fewer events.

    Note that this is not ``1 - hyper_test(X, K, n, N)``: both tests include
    the probability of x == X itself.

    X: observed count
    K: passed to scipy as the number of draws
    n: passed to scipy as the number of marked items in the population
    N: passed to scipy as the population size
    """
    # One cdf evaluation replaces building the frozen distribution X + 1
    # times and summing the individual pmf values.
    return hypergeom.cdf(X, N, n, K)
def test_rvs(self):
    """Check value range, shape and integer typing of hypergeom random variates."""
    # Array-shaped draw: values must lie in [0, 3] because only 3 items are drawn.
    batch = stats.hypergeom.rvs(20, 10, 3, size=(2, 50))
    within_bounds = numpy.all(batch >= 0) & numpy.all(batch <= 3)
    assert within_bounds
    assert numpy.shape(batch) == (2, 50)
    assert batch.dtype.char in typecodes["AllInteger"]

    # A draw without `size` yields a plain Python int.
    single = stats.hypergeom.rvs(20, 3, 10)
    assert isinstance(single, int)

    # A frozen distribution asked for 3 draws yields an integer ndarray.
    frozen_draws = stats.hypergeom(20, 3, 10).rvs(3)
    assert isinstance(frozen_draws, numpy.ndarray)
    assert frozen_draws.dtype.char in typecodes["AllInteger"]
def hyper_test(X, K, n, N):
    """
    Hypergeometric test for overexpression.

    Returns P(x >= X) for the distribution ``hypergeom(N, n, K)``, i.e. the
    probability of observing X or more events.

    X: Number of events
    K: Total number of events (passed to scipy as the number of draws)
    n: number of occurences (passed to scipy as the number of marked items)
    N: total number of occurences (passed to scipy as the population size)
    """
    # sf(X - 1) is P(x > X - 1) = P(x >= X). Computing it as
    # 1 - sum(pmf(0..X-1)) cancels catastrophically: tails below ~1e-16 come
    # out as 0 or as rounding noise, which is exactly where p-values matter.
    return hypergeom.sf(X - 1, N, n, K)
def baseline(self):
    """Return the chance-level performance vector.

    The baseline models obtaining OOT posts purely by chance, so the vector
    is the probability mass function of ``hypergeom(self.M, self.n, self.N)``
    evaluated at every integer from ``self.min_sup`` through ``self.max_sup``
    inclusive. Element k is the probability attached to the k-th of those
    support points.
    """
    chance_model = hypergeom(self.M, self.n, self.N)
    support = np.arange(self.min_sup, 1 + self.max_sup)
    return chance_model.pmf(support)
def combo_in_top(n):
    """Print the chance that a sample of ``n`` contains both combo cards.

    Three hypergeometric distributions over a sample of size ``n`` are built
    from the outer-scope values ``N``, ``nA``, ``nB`` and ``nLands``. The
    probability of holding at least one A and at least one B (with the two
    events treated as independent) is printed as ``p_and``. Nothing is
    returned; the closing "keepable" combination is computed and discarded.
    """
    dist_a = hypergeom(N, nA, n)
    dist_b = hypergeom(N, nB, n)
    dist_lands = hypergeom(N, nLands, n)

    # cdf(k) is the probability of k or fewer successes.
    lands_none = dist_lands.cdf(0)
    lands_at_most_1 = dist_lands.cdf(1)
    lands_at_most_4 = dist_lands.cdf(4)
    # Between 2 and 4 lands inclusive.
    p_keepable = lands_at_most_4 - lands_at_most_1

    # Probability of seeing none / at least one of each combo piece.
    miss_a = dist_a.cdf(0)
    miss_b = dist_b.cdf(0)
    hit_a = 1 - miss_a
    hit_b = 1 - miss_b

    # http://math.stackexchange.com/questions/72589/calculating-probability-of-at-least-one-event-occurring
    def p_not_any_fail(p_A_fail, p_B_fail):
        ok_a = 1 - p_A_fail
        ok_b = 1 - p_B_fail
        return ok_a + ok_b - (1 - p_A_fail * p_B_fail)

    p_neither = miss_a * miss_b       # chance of neither combo card
    p_either = 1 - p_neither          # chance of either card
    p_and = hit_a + hit_b - p_either  # chance of both cards
    p_xor = p_either - p_and          # either A or B but not both

    print("p_and = %r" % (p_and,))
    p_not_any_fail(1 - p_and, 1 - p_keepable)
def hg_p_value(n_parent,k_parent,n_child,k_child):
    """
    One-tailed hypothesis test. H0: The partition is random.

    The distribution is ``hypergeom(n_parent, k_parent, n_child)``. When
    ``k_child`` is at or below the expected count the lower tail
    P(X <= k_child) is returned, otherwise the upper tail P(X >= k_child).

    Returns
    -------
    float
        The tail probability, never negative.
    """
    hg = spst.hypergeom(n_parent,k_parent,n_child)
    parent_mean = n_child*(k_parent*1.0/n_parent)
    if k_child <= parent_mean:
        # Probability of a result at least as extreme on the low side.
        return max(0.0,hg.cdf(k_child))
    # sf(k - 1) is P(X >= k). The former 1 - cdf(k - 1) rounds to 0 (or to
    # noise) once the tail drops below ~1e-16, i.e. for the most significant
    # results.
    return max(hg.sf(k_child-1),0.0)
def get_sender_pvals(addrCounts):
    """Compute lower- and upper-tail hypergeometric p-values per key.

    Parameters
    ----------
    addrCounts : dict
        Maps a key to a sequence whose first two items are counts.

    Returns
    -------
    (low, high) : two sorted lists of ``(p, first, second, key)`` tuples.
        ``low`` holds P(X <= first) and ``high`` holds P(X >= first), both
        under ``hypergeom(sum of seconds, sum of firsts, second)``.
    """
    first_total = sum([pair[0] for pair in addrCounts.values()])
    second_total = sum([pair[1] for pair in addrCounts.values()])

    lower_rows = []
    upper_rows = []
    for key, pair in addrCounts.items():
        first, second = pair[0], pair[1]
        dist = stats.hypergeom(second_total, first_total, second)
        lower_rows.append((dist.cdf(first), first, second, key))

        upper = dist.sf(first - 1)
        if isnan(upper):
            # Old hypergeom.sf() versions give NaN; sum the tail mass instead.
            upper = dist.pmf(range(first, second + 1)).sum()
        upper_rows.append((upper, first, second, key))

    return sorted(lower_rows), sorted(upper_rows)
def calculate_enrichment(pathway_matrix, gene_set):
    """Calculate hypergeometric enrichment of the set for each pathway.

    Parameters
    ----------
    pathway_matrix : DataFrame
        Pathways in rows and genes in columns.
    gene_set : set
        Genes of interest; only those present as columns are considered.

    Returns
    -------
    Series
        One value per pathway (indexed like ``pathway_matrix``), equal to
        survival function + 0.5 * pmf at the number of hits.
    """
    # Only consider genes which are known to be in pathways. The result is
    # turned into a list because pandas no longer accepts a set as a column
    # indexer; the row sums below do not depend on column order.
    pathway_gene_list = list(gene_set.intersection(pathway_matrix.columns))

    population_size = len(pathway_matrix.columns)
    sample_size = len(pathway_gene_list)

    # Each pathway needs its own distribution because pathways differ in
    # length.
    distributions = [hypergeom(population_size, pathway_size, sample_size)
                     for pathway_size in pathway_matrix.sum(axis=1)]

    pathway_hits = pathway_matrix[pathway_gene_list].sum(axis=1)

    significance = [dist.sf(hits) + 0.5 * dist.pmf(hits)
                    for hits, dist in zip(pathway_hits, distributions)]

    return Series(significance, index=pathway_matrix.index)
def Hypergeometric(N, n, K, tag=None):
    """
    Construct a Hypergeometric random variate.

    The three sizes are validated with assertions and then handed to
    ``ss.hypergeom(N, n, K)``; the frozen distribution is wrapped by ``uv``
    together with ``tag``.

    Parameters
    ----------
    N : int
        The total population size
    n : int
        The number of individuals of interest in the population
    K : int
        The number of individuals that will be chosen from the population
    tag : optional
        Forwarded unchanged to ``uv``.

    Example
    -------
    (Taken from the wikipedia page) Assume we have an urn with two types of
    marbles, 45 black ones and 5 white ones. Standing next to the urn, you
    close your eyes and draw 10 marbles without replacement. What is the
    probability that exactly 4 of the 10 are white?
    ::

        >>> black = 45
        >>> white = 5
        >>> draw = 10

        # Now we create the distribution
        >>> h = H(black + white, white, draw)

        # To check the probability, in this case, we can use the underlying
        # scipy.stats object
        >>> h.rv.pmf(4)  # What is the probability that white count = 4?
        0.0039645830580151975
    """
    # Each size must be a whole number; n and K must also fit within N.
    assert N == int(N) and 0 < N, \
        'Hypergeometric total population size "N" must be an integer greater than zero.'
    assert n == int(n) and 0 < n <= N, \
        'Hypergeometric interest population size "n" must be an integer greater than zero and no more than the total population size.'
    assert K == int(K) and 0 < K <= N, \
        'Hypergeometric chosen population size "K" must be an integer greater than zero and no more than the total population size.'

    frozen = ss.hypergeom(N, n, K)
    return uv(frozen, tag=tag)
def enrichment(dbfx,inp):
    """Run an over-representation analysis and write it to Enrichment_results.csv.

    Parameters
    ----------
    dbfx : passed to ``database``, which yields a mapping of pathway name to
        genes plus the list of background genes.
    inp : passed to ``getinput``, which yields the input genes.

    For every pathway sharing at least one gene with the input, one
    tab-separated row is written: pathway name, pathway size, input size,
    overlap size and the hypergeometric p-value.
    """
    print("*************************************************************************************************")
    print("Database :"+str(dbfx))
    print("Input file:"+str(inp))
    # The context manager guarantees the file is flushed and closed, also
    # when a later step raises.
    with codecs.open("Enrichment_results.csv",'w',encoding = "utf8") as fout:
        fout.write("PATHWAY_NAME\tLENGTH OF PATHWAY\tINPUT_GENESET\tOVERLAPPED_GENESET\tPValue\n")
        query_genes = set(getinput(inp))
        db,glst = database(dbfx)
        M = len(glst)
        for d in db.keys():
            pathway_genes = set(db[d])
            overlap = len(query_genes.intersection(pathway_genes))
            if overlap > 0:
                ora = hypergeom(M,len(pathway_genes),len(query_genes))
                # The column is a p-value, i.e. the probability of an overlap
                # at least this large: sf(overlap - 1) = P(X >= overlap).
                # pmf(overlap) is only the probability of exactly this
                # overlap and understates the tail.
                p = ora.sf(overlap - 1)
                fout.write(str(d)+"\t"+str(len(pathway_genes))+"\t"+str(len(query_genes))+"\t"+str(overlap)+"\t"+str(p)+"\n")
def partition_htest_value(n_parent,k_parent,n_child,k_child,alpha,cache=False):
    """
    Test a partition of k_parent +1s in n_parent (+1/-1)s against
    H0 = partition is random.

    Returns True when k_child falls outside the (1-alpha) acceptance region
    and False otherwise. With ``cache`` left at False the decision comes
    from the hypergeometric cdf directly; otherwise the region endpoints are
    taken from ``partition_htest`` and memoised in ``_h_test_dict``.
    """
    if n_child == 1:
        parent_rate = 1.0*k_parent/n_parent
        # NOTE(review): both checks look at k_parent, and the second can
        # only hold when alpha >= 2 — possibly k_child was intended; verify.
        rare_and_one = parent_rate < alpha/2.0 and k_parent == 1
        common_and_zero = parent_rate > 1-alpha/2.0 and k_parent == 0
        return bool(rare_and_one or common_and_zero)

    if cache==False:
        below, upto = spst.hypergeom(n_parent,k_parent,n_child).cdf([k_child-1,k_child])
        # `upto` is the cdf INCLUDING k_child, `below` the cdf for one less.
        # The value is unremarkable when the inclusive cdf exceeds alpha/2
        # and the cdf below it is still under 1-alpha/2.
        unremarkable = (upto > alpha/2.0) and (below < (1.0-alpha/2.0))
        return not unremarkable

    hgt = (n_parent,k_parent,n_child,alpha)
    print(hgt)
    if hgt in _h_test_dict:
        left,right = _h_test_dict[hgt]
        print("saved one")
    else:
        left,right = partition_htest(*hgt)
        _h_test_dict[hgt] = (left,right)

    if left == -1 and right == -1:
        return False
    # NOTE(review): when only `right` is -1 this is True for any
    # k_child > -1 — confirm what -1 means for partition_htest.
    return bool(k_child < left or k_child > right)
def calculateByDraw(self):
    """Fill the probability table and bar chart for draws from a single pool of outs.

    Reads the deck size, number of outs and number of draws from the three
    spin boxes, builds ``hypergeom(deck size, outs, draws)``, writes an
    "Exactly"/"At Least" table into ``self.probabilityTable`` and draws the
    pmf and survival values as bars on ``self.figure``.
    """
    # hypergeometric formula: assumes draws are just the total number of draws
    rv = hypergeom(self.remainingDeckSizeSpinBox.value(), self.numberOfOutsInDeckSpinBox.value(), self.numberOfDrawsSpinBox.value())
    outs = np.arange(0, self.numberOfOutsInDeckSpinBox.value() + 1)
    draws = np.arange(0, self.numberOfDrawsSpinBox.value() + 1)
    PMF = rv.pmf(outs)
    self.probabilityTable.clear()
    # np.empty leaves the buffers uninitialised: entries the loop below never
    # writes keep arbitrary values.
    odds = np.empty(self.numberOfOutsInDeckSpinBox.value() + 1, dtype=float)
    surviveodds = np.empty(self.numberOfOutsInDeckSpinBox.value()+1, dtype=float)
    # One extra row for the header written into row 0.
    self.probabilityTable.setRowCount(len(outs) + 1)
    self.probabilityTable.setColumnCount(4)

    # initialize the table: put an empty item into every cell,
    # column by column.
    i = 0
    j = 0
    while i < self.probabilityTable.columnCount():
        while j < self.probabilityTable.rowCount():
            matrixElement = QtGui.QTableWidgetItem()
            self.probabilityTable.setItem(j, i, matrixElement)
            j += 1
        i += 1
        j = 0

    # Header row.
    item = self.probabilityTable.item(0, 0)
    item.setText(_translate("MainWindow", "Exactly", None))
    item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
    item = self.probabilityTable.item(0, 1)
    item.setText(_translate("MainWindow", "Probability", None))
    item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
    item = self.probabilityTable.item(0, 2)
    item.setText(_translate("MainWindow", "At Least", None))
    item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
    item = self.probabilityTable.item(0, 3)
    item.setText(_translate("MainWindow", "Probability", None))
    item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)

    #Loop to build the table and calculate the PMF and SF based on inputs
    # Iterate over whichever range is shorter (outs or draws).
    if len(outs)> len(draws):
        maxSuccess = draws
    else:
        maxSuccess = outs
    for out in maxSuccess:
        # creating spaces in table (replaces the items made during
        # initialisation for columns 0 and 1)
        matrixElement = QtGui.QTableWidgetItem()
        self.probabilityTable.setItem(out + 1, 0, matrixElement)
        matrixElement = QtGui.QTableWidgetItem()
        self.probabilityTable.setItem(out + 1, 1, matrixElement)
        # Calculate PMF for each out; sf(out) is P(X > out)
        odds[out] = rv.pmf(out)
        surviveodds[out] = rv.sf(out)
        # populate the table
        # exact outs
        item = self.probabilityTable.item(out + 1, 0)
        item.setText(_translate("MainWindow", "{0:d}".format(out), None))
        item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(out + 1, 1)
        # NOTE(review): "{0:3f}" is a minimum width of 3 with default
        # precision — possibly "{0:.3f}" was intended.
        item.setText(_translate("MainWindow", "{0:3f}".format(odds[out]), None))
        item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
        # Atleast outs
        if out != 0: # atleast zero outs is meaningless
            item = self.probabilityTable.item(out + 1, 2)
            item.setText(_translate("MainWindow", "{0:d}".format(out), None))
            item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
            item = self.probabilityTable.item(out + 1, 3)
            # P(X >= out) equals P(X > out - 1), hence the previous sf entry.
            item.setText(_translate("MainWindow", "{0:3f}".format(surviveodds[out-1]), None))
            item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)

    # create the pmf graph
    print(surviveodds)
    print(outs)
    # NOTE(review): the message says "at least" but prints odds[out], the
    # exact-count probability for the last loop value — verify.
    print("The odds of getting at least {0:d} outs is {1:3f}".format(out, odds[out]))
    #### Bar graphs if I want to add them later
    ax = self.figure.add_subplot(111)
    width = 0.3
    ax.bar(outs, PMF, width, color = 'r' )
    # NOTE(review): Axes.hold no longer exists in current matplotlib
    # releases — confirm the version this targets.
    ax.hold(True)
    # NOTE(review): surviveodds is plotted in full, including any entries the
    # loop above did not fill (when there are fewer draws than outs).
    ax.bar(outs+(1.2*width+1), surviveodds, width, color = 'b')
    ax.set_xticks(outs + width)
    ax.set_xticklabels(outs)
    ax.hold(False)
    self.canvas.draw()
# Scratch script exercising scipy.stats.hypergeom (Python 2 syntax: it uses
# the `print` statement). Everything after exit() never runs.
import matplotlib.pyplot as plt
from scipy.stats import hypergeom, rv_discrete
import numpy as np
numargs = hypergeom.numargs
#[ M, n, N ] = [100, 10, -1]
#Display frozen pmf:
# NOTE(review): the arguments are (population=10, successes=20, draws=3);
# more successes than population looks unintended — verify.
rv = hypergeom( 10, 20, 3 )
print rv.dist.b
# NOTE(review): np.min's second positional argument is `axis`, so this is not
# min(b, 3); presumably the builtin min was meant — verify.
x = np.arange( 0, np.min( rv.dist.b, 3 ) + 1 )
h = plt.plot( x, rv.pmf( x ) )
exit()
# --- unreachable below this point; M, n and N are only "assigned" in the
# commented-out line near the top, so these lines would raise NameError. ---
#Check accuracy of cdf and ppf:
prb = hypergeom.cdf( x, M, n, N )
h = plt.semilogy( np.abs( x - hypergeom.ppf( prb, M, n, N ) ) + 1e-20 )
#Random number generation:
R = hypergeom.rvs( M, n, N, size=100 )
#Custom made discrete distribution:
vals = [np.arange( 7 ), ( 0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1 )]
custm = rv_discrete( name='custm', values=vals )
h = plt.plot( vals[0], custm.pmf( vals[0] ) )
def sample(self, x=None):
    """Draw random variates from ``hypergeom(self.M, self.X, self.m)``.

    ``x`` is forwarded as the first argument of ``rvs`` (the sample size)
    and ``self.random`` is used as the random state.
    """
    frozen = hypergeom(self.M, self.X, self.m)
    return frozen.rvs(x, random_state=self.random)
def calculate_enrichment(N=100):
    """Placeholder GO-term enrichment over the top/bottom genes of each experiment.

    Reads GO ids from "go_info.txt", counts GO ids among the first 100 genes
    of each experiment, evaluates a hypergeometric survival function per GO
    id, plots both score tables and returns them.

    Parameters
    ----------
    N : int
        Number of draws passed to the hypergeometric distribution.

    Returns
    -------
    (positive_enrichment_scores, negative_enrichment_scores)
        The two dicts mapping GO id to a length-32 array.

    NOTE(review): as written this function cannot run to completion; see the
    notes inline.
    """
    # You need to replace this with something useful
    experiment_dict = eas.experiment();
    counttop100 = {}
    countbot100 = {}
    goid_prob_top = {}
    goid_prob_bot = {}
    #initialize dictionary of goids
    with open("go_info.txt", 'r') as target:
        # Skip the first line (presumably a header — TODO confirm).
        target.readline();
        for line in target:
            lines = line.split();
            counttop100[lines[0]] = np.zeros(32) #goid counts in top 100
            countbot100[lines[0]] = np.zeros(32) #goid counts in bottom 100
            goid_prob_top[lines[0]] = np.zeros(32) #goid prob
            goid_prob_bot[lines[0]] = np.zeros(32) #goid prob
    #add values for goid
    # NOTE(review): range(0, 33) reaches i == 32, but every array above has
    # only 32 slots (valid indices 0..31).
    for i in range(0, 33):
        # NOTE(review): list.sort() sorts in place and returns None, so
        # sorted_genes is None and the slices below raise TypeError.
        sorted_genes = experiment_dict[i].sort(key=lambda tup: tup[1])
        top100 = sorted_genes[0:100]
        # NOTE(review): the slice [-100:0] is always empty; [-100:] selects
        # the last 100 items.
        bot100 = sorted_genes[-100:0]
        #go through top 100, want list of goIds, count for top/bot100
        for j in range(0, 100):
            counttop100[ gene_to_go[ top100[j][0] ] ][i] += 1
            # NOTE(review): the bottom counter is also fed from top100;
            # presumably bot100 was intended — verify.
            countbot100[ gene_to_go[ top100[j][0] ] ][i] += 1
        #hypergeom, what is the probability i got that many counts from top 100
        #top
        for j in counttop100:
            gene_per_go = len(go_to_gene[j])
            [ M, n, N] = [4767, gene_per_go, N];
            rv = scistat.hypergeom(M, n, N)
            # NOTE(review): x is an array of count + 1 values, so rv.sf(x) is
            # an array being stored into a single array element — presumably
            # a scalar such as rv.sf(count - 1) was intended; verify.
            x = np.arange(0, counttop100[j][i] + 1)
            survival_exp = rv.sf(x)
            goid_prob_top[j][i] = survival_exp
        # Same computation for the bottom counts.
        for j in countbot100:
            gene_per_go = len(go_to_gene[j])
            [ M, n, N] = [4767, gene_per_go, N];
            rv = scistat.hypergeom(M, n, N)
            x = np.arange(0, countbot100[j][i] + 1)
            survival_exp = rv.sf(x)
            goid_prob_bot[j][i] = survival_exp
    #for j in experiment_dict[i]:
    #    mainstuff[ gene_to_go[j[0]] ][i] += j[1]
    #sort by exp values
    #    mainstuff.sort(key=lambda tup: tup[1])
    #take top 100
    # NOTE(review): pcolor is handed a dict here; presumably it needs a 2-D
    # array built from the dict values — verify.
    heatmaptop = plt.pcolor(goid_prob_top);
    heatmapbot = plt.pcolor(goid_prob_bot);
    plt.show()
    positive_enrichment_scores = goid_prob_top;
    negative_enrichment_scores = goid_prob_bot;
    return positive_enrichment_scores,negative_enrichment_scores
clu_index = 0 for clu in clusters: clu_index += 1 if clu_index > 100: break clu_set = clu clu_sig_score = 0 clu_sig_smallest = -1; for bic in biclusters: bic_size = len(bic) clu_size = len(clu_set) overlap = len(bic & clu_set) hyper = hypergeom(OPR_COUNT, bic_size, clu_size) hypersf = hyper.sf(overlap) #print("%d\t%d\t%d\t%.3f" % (bic_size, clu_size, overlap, hypersf)) if clu_sig_smallest < 0: clu_sig_smallest = hypersf elif clu_sig_smallest > hypersf: clu_sig_smallest = hypersf if hypersf < P_CUTOFF: if hypersf <= 0: hypersf = P_CUTOFF clu_sig_score += -math.log10(hypersf) clu_sig_score_p = 0 clu_sig_smallest_p = -1; for bic in biclusters_p:
# Report every (cluster, regulon) pair whose overlap coefficients both reach
# 0.1, together with the hypergeometric p-value of the overlap.
print("cluster\tregulon\tclu_size\treg_size\toverlap\toverlap_coe(wiki)\tcoe2\tp-value(hypergeom)")
for index, clu in enumerate(clusters):
    # Clusters with at most one member are not reported.
    if len(clu) <= 1:
        continue
    clu_size = len(clu)
    for reg_name, reg in regulon.items():
        reg_size = len(reg)
        overlap = len(clu & reg)
        union = len(clu | reg)
        coe1 = overlap_coe1(overlap, clu_size, reg_size)
        # Second coefficient: intersection over union.
        coe2 = overlap/float(union)
        if coe1 < 0.1 or coe2 < 0.1:
            continue
        rv = hypergeom(OPERON_COUNT, reg_size, clu_size)
        # The p-value of an overlap is the probability of one at least this
        # large: sf(overlap - 1) = P(X >= overlap). sf(overlap) is
        # P(X > overlap) and leaves out the observed value itself.
        p_value = rv.sf(overlap - 1)
        print("%s\t%s\t%i\t%i\t%i\t%.3f\t%.3f\t%g" % (index + 1, reg_name, clu_size, reg_size, overlap, coe1, coe2, p_value))