def __init__(self, sigLocal, sig0, N0):
    """Turn a local significance into a global one and keep every intermediate.

    Parameters
    ----------
    sigLocal : local significance, in Gaussian standard deviations.
    sig0 : reference significance at which ``N0`` was obtained.
    N0 : reference count, rescaled from the ``sig0`` level to the
        ``sigLocal`` level.

    The inputs and all derived quantities (p-values, test-statistic levels,
    rescaled count, global p-value/significance and trial factor) are stored
    as attributes of the instance.
    """
    # One-sided Gaussian tail probabilities of the two significances.
    local_p, ref_p = norm.sf(sigLocal), norm.sf(sig0)

    # Chi-square (1 dof) levels whose tail probability is twice the
    # corresponding one-sided p-value.
    level = chi2.isf(local_p * 2, 1)
    ref_level = chi2.isf(ref_p * 2, 1)

    # Rescale the reference count to the local level, then add half the
    # chi-square tail at that level to obtain the global p-value.
    scaled_count = N0 * exp(-(level - ref_level) / 2.)
    global_p = scaled_count + chi2.sf(level, 1) / 2.

    # Inputs.
    self.sigLocal = sigLocal
    self.sig0 = sig0
    self.N0 = N0
    # Derived quantities.
    self.pLocal = local_p
    self.p0 = ref_p
    self.u = level
    self.u0 = ref_level
    self.N = scaled_count
    self.pGlobal = global_p
    self.sigGlobal = norm.isf(global_p)
    self.trialFactor = global_p / local_p
def calculate_bayes_error(model):
    """
    Compute the Bayes error of a two-component mixture model.

    The error is the weighted probability mass each component places on the
    "wrong" side of the decision boundary, where the boundary is the
    intersection point returned by ``findIntersection``.

    :requires: the model is composed of two gaussian components!
    :param model: the mixture model of a given gene
    :type model: sklearn.mixture.GMM
    :returns: the bayes error of the classifier as a float
    """
    weights = model.weights_
    means = [entry[0] for entry in model.means_]
    stds = [entry[0] ** 0.5 for entry in model.covars_]

    # Prefer the first intersection if it lies strictly between the two
    # means; otherwise use the second one.
    first, second = findIntersection(means[0], stds[0], means[1], stds[1])
    low, high = min(means[0], means[1]), max(means[0], means[1])
    boundary = first if low < first < high else second

    # The component whose mean is left of the boundary leaks its upper tail
    # (sf) across it; the other component leaks its lower tail (cdf).
    if boundary < means[0]:
        sf_idx, cdf_idx = 1, 0
    else:
        sf_idx, cdf_idx = 0, 1

    err = 0
    err += norm.sf(boundary, loc=means[sf_idx], scale=stds[sf_idx]) * weights[sf_idx]
    err += norm.cdf(boundary, loc=means[cdf_idx], scale=stds[cdf_idx]) * weights[cdf_idx]
    return err
def normal(dist, point, twosided=True, tail='right'):
    '''hypothesis testing assuming normal distribution

    Parameters
    ----------
    dist : array-like
        empirical (null) parameter distribution; must provide
        ``std(axis=0)``
    point : array-like
        point estimate of parameter
    twosided : boolean
        if True, calculates two-sided p-values
    tail : str or array-like of str
        'left' or 'right', or a sequence of such strings (one per element
        of ``point``) for one-sided tests. 'right' implies a one-sided test
        that the point estimate is greater than the null. Entries that are
        neither 'left' nor 'right' produce a p-value of 0, as before.

    Returns
    -------
    pvalue : array-like
        pvalues, of shape similar to point estimate
    '''
    # Standardise by the spread of the null distribution. |a| / b equals
    # |a / b| because the standard deviation is never negative.
    z = np.asarray(point / dist.std(axis=0))
    if twosided:
        return 2 * norm.sf(abs(z))

    # np.asarray makes the comparison elementwise even when ``tail`` is a
    # plain list; a list compared with a string is just False, which used
    # to yield an all-zero result without any error.
    tail = np.asarray(tail)
    return np.where(tail == 'left', norm.cdf(z),
                    np.where(tail == 'right', norm.sf(z), 0.0))
def calculate_rms(psfFlux, psfFluxErr):
    """Return an RMS-like spread for a flux measurement, in flux units.

    The flux is expressed in units of its error (``xObs``), a mean ``xMean``
    and a second-moment term are evaluated from the Gaussian density and
    tail probability at ``xObs``, and the square root of their ratio is
    scaled back by ``psfFluxErr``.
    """
    x_obs = psfFlux / psfFluxErr
    upper_mass = norm.sf(-x_obs)                # P(Z > -x_obs)
    root_two_pi = np.sqrt(2 * np.pi)
    gauss = np.exp(-(x_obs ** 2.0) / 2.0)       # un-normalised density at x_obs

    x_mean = (1 / (upper_mass * root_two_pi)) * gauss + x_obs
    shift = x_obs - x_mean

    second_moment = (
        0.5 * erf(x_obs / np.sqrt(2))
        + (1.0 / root_two_pi) * gauss * (2 * shift - x_obs)
        + 0.5
        + shift * shift * upper_mass
    )
    x_rms = np.sqrt(second_moment / upper_mass)
    return x_rms * psfFluxErr
def set_scale_surgauss(max_r, max_w, min_w):
    """Set the scale factor of the surgauss kernel.

    Finds the scale ``s`` for which ``A * norm.sf(max_r, scale=s)`` equals
    ``min_w``, where ``A = max_w / norm.sf(0)``, i.e. the kernel weight has
    dropped from ``max_w`` at radius 0 to ``min_w`` at radius ``max_r``.

    The previous implementation passed ``bounds=(0, None)`` to the BFGS
    minimiser. BFGS does not support bounds, and that value is not a valid
    sequence of ``(min, max)`` pairs either, so the call fails on current
    SciPy. When an exact positive solution exists it is now computed in
    closed form; otherwise a bounded least-squares search is used.

    Returns the scale as a float.
    """
    A = max_w / norm.sf(0)
    # Survival probability required at max_r. NaN when A is 0 so that the
    # closed-form branch is skipped.
    target = min_w / A if A else float('nan')

    if max_r > 0 and 0 < target < 0.5:
        # sf(max_r / s) == target  <=>  s == max_r / isf(target)
        return max_r / norm.isf(target)

    # No exact positive root: minimise the squared mismatch with a method
    # that honours the positivity bound.
    result = minimize(lambda x: (A * norm.sf(max_r, scale=x[0]) - min_w) ** 2,
                      x0=np.array([max_r]), method='L-BFGS-B', tol=1e-8,
                      bounds=[(np.finfo(float).tiny, None)])
    return result['x'][0]
def convertN(sig, sig0, N0):
    """Rescale the count ``N0`` observed at significance ``sig0`` to ``sig``.

    Both significances are converted to one-sided Gaussian p-values, then to
    the chi-square (1 dof) levels whose tail probability is twice that
    p-value. The count is scaled by ``exp(-(u - u0) / 2)``.
    """
    def _level(significance):
        # Chi-square level matching the doubled one-sided tail probability.
        return chi2.isf(norm.sf(significance) * 2, 1)

    level = _level(sig)
    ref_level = _level(sig0)
    return N0 * exp(-(level - ref_level) / 2.)
def stouffer_liptak(pvals, sigma):
    """Combine correlated p-values into a single p-value.

    The p-values are turned into z-scores, multiplied by the inverse of the
    Cholesky factor of ``sigma`` (``chol``), summed and divided by sqrt(L).
    If the factorisation or inversion fails (``sigma`` not positive
    definite), a z-score correction is used instead: the mean z-score
    divided by ``sqrt(L + 2 * sum(strict lower triangle of sigma)) / L``.

    Parameters
    ----------
    pvals : sequence of p-values (length L)
    sigma : (L, L) correlation matrix of the underlying tests

    Returns
    -------
    The combined one-sided p-value.
    """
    qvals = norm.isf(pvals).reshape(len(pvals), 1)
    try:
        # Plain ndarray inverse; equivalent to np.asmatrix(...).I for a
        # square factor, without relying on np.matrix.
        C = np.linalg.inv(chol(sigma))
    except np.linalg.LinAlgError:
        # np.linalg.LinAlgError is the public name; the old
        # np.linalg.linalg path is a deprecated private module.
        # for non positive definite matrix default to z-score correction.
        z, L = np.mean(norm.isf(pvals)), len(pvals)
        sz = 1.0 / L * np.sqrt(L + 2 * np.tril(sigma, k=-1).sum())
        return norm.sf(z / sz)

    qvals = np.dot(C, qvals)
    Cp = qvals.sum() / np.sqrt(len(qvals))
    return norm.sf(Cp)
def get_score_df(self, correction_method=None):
    '''
    Build a per-term table of Cohen's d and Hedges' r statistics.

    Rows of the matrix from ``self._get_X()`` are normalised to sum to one
    and split by ``self._get_cat_and_ncat``. Effect sizes, standard errors,
    z-scores and one-sided upper-tail p-values are computed per column.
    Missing values in the table are replaced by 0.

    :param correction_method: str or None, correction method from
        statsmodels.stats.multitest.multipletests. 'fdr_bh' is recommended.
        When given, ``*_p_corr`` columns are added; terms whose two means are
        both zero keep the placeholder value 0.5.
    :return: pd.DataFrame indexed by ``self.corpus_.get_terms()``
    '''
    # From https://people.kth.se/~lang/Effect_size.pdf
    # Shinichi Nakagawa1 and Innes C. Cuthill. 2007. In Biological Reviews 82.
    X = self._get_X().astype(np.float64)
    X = X / X.sum(axis=1)
    cat_X, ncat_X = self._get_cat_and_ncat(X)

    # NOTE(review): the group sizes come from shape[1] while the means and
    # variances below are taken over axis 0. Confirm this is intended.
    n1 = float(cat_X.shape[1])
    n2 = float(ncat_X.shape[1])
    n = n1 + n2

    m1, m2 = cat_X.mean(axis=0).A1, ncat_X.mean(axis=0).A1
    v1, v2 = cat_X.var(axis=0).A1, ncat_X.var(axis=0).A1

    s_pooled = np.sqrt(((n2 - 1) * v2 + (n1 - 1) * v1) / (n - 2.))
    cohens_d = (m1 - m2) / s_pooled
    cohens_d_se = np.sqrt(((n - 1.) / (n - 3)) * (4. / n) * (1 + np.square(cohens_d)))
    cohens_d_z = cohens_d / cohens_d_se

    hedges_r = cohens_d * (1 - 3. / ((4. * (n - 2)) - 1))
    hedges_r_se = np.sqrt(n / (n1 * n2) + np.square(hedges_r) / (n - 2.))
    hedges_r_z = hedges_r / hedges_r_se

    columns = {
        'cohens_d': cohens_d,
        'cohens_d_se': cohens_d_se,
        'cohens_d_z': cohens_d_z,
        'cohens_d_p': norm.sf(cohens_d_z),
        'hedges_r': hedges_r,
        'hedges_r_se': hedges_r_se,
        'hedges_r_z': hedges_r_z,
        'hedges_r_p': norm.sf(hedges_r_z),
        'm1': m1,
        'm2': m2,
    }
    score_df = pd.DataFrame(columns, index=self.corpus_.get_terms()).fillna(0)

    if correction_method is None:
        return score_df

    from statsmodels.stats.multitest import multipletests
    # Created first so this column keeps its position in the table.
    score_df['hedges_r_p_corr'] = 0.5
    for method in ['cohens_d', 'hedges_r']:
        corrected_col = method + '_p_corr'
        score_df[corrected_col] = 0.5
        # Only terms seen in at least one group are corrected.
        seen = (score_df['m1'] != 0) | (score_df['m2'] != 0)
        score_df.loc[seen, corrected_col] = (
            multipletests(score_df.loc[seen, method + '_p'],
                          method=correction_method)[1]
        )
    return score_df
def known_stdev(self, alpha, stdev1, stdev2):
    """Two-sample z-test for a difference in means with known deviations.

    Reads the sample sizes ``self.n1``/``self.n2`` and the sample means
    ``self.y1``/``self.y2``.

    Returns
    -------
    (H1a, H1b, H1c, p1a, p1b, p1c, (c1, c2)) where
      * H1a/H1b/H1c: rejection decisions at level ``alpha`` for the
        two-sided, upper-tailed and lower-tailed alternatives,
      * p1a/p1b/p1c: the matching p-values,
      * (c1, c2): the two-sided ``1 - alpha`` confidence interval for
        ``y1 - y2``.
    """
    n1, n2, y1, y2 = self.n1, self.n2, self.y1, self.y2
    std_err = np.sqrt(stdev1 ** 2. / n1 + stdev2 ** 2. / n2)
    z0 = (y1 - y2) / std_err
    two_sided_crit = norm.ppf(1 - alpha / 2.)

    # Rejection decisions.
    reject_two_sided = two_sided_crit < np.abs(z0)
    reject_upper = norm.ppf(1 - alpha) < z0
    reject_lower = norm.ppf(alpha) > z0

    # p-values.
    p_two_sided = norm.sf(np.abs(z0)) * 2
    p_upper = norm.sf(z0)
    p_lower = norm.cdf(z0)

    # Confidence interval for the difference in means.
    ci_low = y1 - y2 - two_sided_crit * std_err
    ci_high = y1 - y2 + two_sided_crit * std_err

    return (reject_two_sided, reject_upper, reject_lower,
            p_two_sided, p_upper, p_lower, (ci_low, ci_high))
def fdr_threshold(z_vals, alpha):
    """ return the Benjamini-Hochberg FDR threshold for the input z_vals

    Parameters
    ----------
    z_vals: array-like,
            a set of z-variates from which the FDR is computed
    alpha: float,
           desired FDR control

    Returns
    -------
    threshold: float,
               FDR-controling threshold from the Benjamini-Hochberg procedure;
               ``inf`` when nothing passes (including for empty input)

    Raises
    ------
    ValueError: if alpha is outside [0, 1]
    """
    if alpha < 0 or alpha > 1:
        raise ValueError('alpha should be between 0 and 1')
    # Sort in decreasing z order, i.e. increasing p-value order.
    z_vals_ = -np.sort(-np.asarray(z_vals))
    p_vals = norm.sf(z_vals_)
    n_samples = len(p_vals)
    if n_samples == 0:
        # Nothing to test; also avoids dividing by zero below.
        return np.inf
    # NOTE(review): the critical line uses (i - 0.5) / n rather than the
    # textbook i / n, which is slightly more conservative. Kept as is.
    pos = p_vals < alpha * np.linspace(
        .5 / n_samples, 1 - .5 / n_samples, n_samples)
    if pos.any():
        # Nudge just below the last passing z so that it is itself selected.
        return (z_vals_[pos][-1] - 1.e-12)
    # np.infty was removed in NumPy 2.0; np.inf is the portable spelling.
    return np.inf
def test_full_pvals(n=100, p=40, rho=0.3, snr=4):
    """Run forward stepwise on a simulated instance and collect p-values.

    At each step the selective (unsaturated, sampled) p-value, the
    saturated-model p-value and the nominal two-sided z-test p-value of the
    variable added last are recorded.

    Returns
    -------
    X, y, beta, active, sigma : the simulated instance
    pval : array with one row ``(variable, selective, saturated, nominal)``
        per step
    completion_index : 1-based step at which every truly active variable
        had been selected, or None if that never happened. It was
        previously left unbound in that case, which made the ``return``
        raise UnboundLocalError.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho)
    FS = forward_stepwise(X, y, covariance=sigma**2 * np.identity(n))

    from scipy.stats import norm as ndist

    pval = []
    completed_yet = False
    completion_index = None
    for i in range(min(n, p)):
        FS.next()
        var_select, pval_select = FS.model_pivots(i+1, alternative='twosided',
                                                  which_var=[FS.variables[-1]],
                                                  saturated=False,
                                                  burnin=2000,
                                                  ndraw=8000)[0]
        pval_saturated = FS.model_pivots(i+1, alternative='twosided',
                                         which_var=[FS.variables[-1]],
                                         saturated=True)[0][1]

        # now, nominal ones
        LSfunc = np.linalg.pinv(FS.X[:,FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))

        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        # Only look for completion until it has been found once.
        if not completed_yet and set(active).issubset(np.array(pval)[:,0]):
            completed_yet = True
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
def moran_KP(w, u, sig2i):
    """
    Calculates Moran-flavoured tests

    Parameters
    ----------
    w       : W
              PySAL weights instance aligned with y (its ``.sparse``
              attribute is used), or an already sparse matrix
    u       : array
              nx1 array of naive residuals
    sig2i   : array
              nx1 array of individual variance

    Returns
    -------
    array of two floats: the statistic and its two-sided normal p-value
    """
    try:
        w = w.sparse
    except AttributeError:
        # Already a sparse matrix. Only the missing-attribute case is
        # tolerated; the previous bare except hid any other error too.
        pass

    moran_num = np.dot(u.T, (w * u))

    # Diagonal matrix of the individual variances.
    E = SP.lil_matrix(w.shape)
    E.setdiag(np.asarray(sig2i).ravel())
    E = E.asformat('csr')

    WE = w * E
    moran_den = np.sqrt(np.sum((WE * WE + (w.T * E) * WE).diagonal()))

    # The numerator is a 1x1 array; .item() extracts the scalar explicitly
    # (float() on a non-0-d array is deprecated in NumPy).
    moran = np.asarray(1.0 * moran_num / moran_den).item()
    return np.array([moran, norm.sf(abs(moran)) * 2.])
def combine_p_values(the_p_values, method='z', default_quantile=7.):
    """Combines p-values from repeat measurements into a single p-value.

    the_p_values: a list of p-values.
    method: String. 'z'|'fisher'.
        'z' for combining the (equally weighted) z-scores.
        'fisher' for using fisher's combined probability test.
    default_quantile: Float. Only used for z method. Magnitude substituted
        when the normal quantile of a p-value is infinite: +default_quantile
        for p == 0 and -default_quantile for p == 1.

    Returns the combined p-value. A single p-value, or a list summing to
    zero, is returned as its sum without combining.

    Raises ValueError for an unknown method (this previously surfaced as an
    UnboundLocalError at the return statement).
    """
    if len(the_p_values) == 1 or sum(the_p_values) == 0:
        return sum(the_p_values)

    chosen = method.lower()
    if chosen == 'z':
        the_quantiles = []
        for the_p in the_p_values:
            # isf(p) is the upper-tail quantile; unlike ppf(1 - p) it keeps
            # its precision for very small p.
            the_quantile = norm.isf(the_p)
            if isinf(the_quantile):
                # Keep the sign: p == 1 gives -inf and must not be turned
                # into strong evidence *against* the null.
                if the_quantile > 0:
                    the_quantile = default_quantile
                else:
                    the_quantile = -default_quantile
            the_quantiles.append(the_quantile)
        return norm.sf(sum(the_quantiles) / len(the_quantiles) ** 0.5)

    if chosen == 'fisher':
        # chi2.sf avoids the cancellation of 1 - cdf for small results.
        return chi2.sf(-2 * sum(map(log, the_p_values)),
                       2 * len(the_p_values))

    raise ValueError("method must be 'z' or 'fisher', got %r" % (method,))
def finalizeSampling(self):
    """Finish sampling: store the posterior median and check the final value.

    The autocorrelation diagnostics are switched off by a constant flag and
    never run; they are kept so they can be re-enabled.
    """
    from pyhrf.stats import acorr

    autocorr_enabled = False  # the diagnostics below are deliberately off
    if autocorr_enabled and self.smplHistory is not None:
        logger.info('Compute autocorrelation of %s samples, shape=%s',
                    self.name, str(self.smplHistory.shape))
        # Every second sample, starting at index nbSweeps.
        trajectory = self.smplHistory[self.samplerEngine.nbSweeps::2]
        self.autocorrelation = acorr(trajectory)
        root_n = trajectory.shape[0] ** .5
        threshold = 1.959963984540054 / root_n

        # +1 / -1 where the autocorrelation exceeds the threshold.
        flags = np.zeros(self.autocorrelation.shape, dtype=np.int32)
        flags[self.autocorrelation > threshold] = 1
        flags[self.autocorrelation < -threshold] = -1
        self.autocorrelation_test = flags
        self.autocorrelation_thresh = threshold

        from scipy.stats import norm
        # One-sided p-value in the direction of the observed sign.
        pvalue = np.zeros(self.autocorrelation.shape, dtype=np.float32)
        positive = self.autocorrelation > 0
        negative = self.autocorrelation < 0
        pvalue[positive] = norm.sf(self.autocorrelation[positive] * root_n)
        pvalue[negative] = norm.cdf(self.autocorrelation[negative] * root_n)
        self.autocorrelation_pvalue = pvalue

    logger.info('Compute posterior median for %s ', self.name)
    self.median = np.median(self.smplHistory, axis=0)
    self.check_final_value()
def fdr(self, theta):
    """Given a threshold theta, find the estimated FDR

    Parameters
    ----------
    theta : float or array of shape (n_samples)
            values to test

    Returns
    -------
    afp : value or array of shape(n); 0 for thresholds above the largest
          observation
    """
    from scipy.stats import norm
    self.fdrcurve()

    def _estimate(th):
        # Estimated FDR at one threshold: the larger of the FDR curve value
        # at the first sorted observation >= th and the ratio of expected
        # null exceedances to observed exceedances.
        if th > self.sorted_x[-1]:
            return 0
        first = np.where(self.sorted_x >= th)[0][0]
        expected = (self.p0 * norm.sf(th, self.mu, self.sigma) * self.n
                    / np.sum(self.x >= th))
        return np.maximum(self.sorted_fdr[first], expected)

    # Both branches share one helper. The array branch used to reference
    # ``st.norm`` while only ``norm`` is imported in this function.
    if np.isscalar(theta):
        return _estimate(theta)
    return np.array([_estimate(th) for th in theta])
def s_to_p(s):
    """Map a significance to the matching one-sided tail probability.

    Evaluates the standard-normal survival function at ``s``.

    Parameters
    ----------
    s : array_like
        Significance

    Returns
    -------
    p : ndarray
        One-sided tail probability

    See Also
    --------
    p_to_s, s_to_p_limit

    Examples
    --------
    >>> s_to_p(0)
    0.5
    >>> s_to_p(1)
    0.15865525393145707
    >>> s_to_p(3)
    0.0013498980316300933
    >>> s_to_p(5)
    2.8665157187919328e-07
    >>> s_to_p(10)
    7.6198530241604696e-24
    """
    from scipy.stats import norm

    tail_probability = norm.sf(s)
    return tail_probability
def pvalues(self):
    """
    (array) Two-sided p-values of the coefficients' z-statistics.

    Computed from ``self.zvalues`` under a standard Normal reference, i.e.
    the coefficients are assumed to be Normally distributed.
    """
    upper_tail = norm.sf(np.abs(self.zvalues))
    return upper_tail * 2
def test_z_score():
    """Check that z_score inverts the normal survival function."""
    # Round trip on random probabilities.
    probabilities = np.random.rand(10)
    recovered = norm.sf(z_score(probabilities))
    assert_array_almost_equal(recovered, probabilities)

    # check the numerical precision
    extreme_cases = [1.e-250, 1 - 1.e-16]
    for extreme in extreme_cases:
        assert_array_almost_equal(z_score(extreme), norm.isf(extreme))

    # A float32 input is compared against the quantile at 1e-300.
    single_precision = np.float32(1.e-100)
    assert_array_almost_equal(z_score(single_precision), norm.isf(1.e-300))
def rescore(df, tol_ms1=10, tol_ms2=20):
    """Re-score PSMs and write them to ``csvout``.

    Rows whose precursor mass error lies strictly within +/- ``tol_ms1`` ppm
    are kept. For each one the 'Morpheus Score' is replaced by the sum of
    three sub-scores (or 0 when the precursor p-value is below 1e-4):

    * S1: two-sided normal p-value of the precursor mass error,
    * S2: number of fragment matches within two standard deviations of the
      target MS2 mean, plus one for each consecutive (even, odd) pair of
      positions that both match,
    * S3: (target - decoy) / (target + decoy) ratio of the normal densities
      of the log intensity fraction.

    The header and every re-scored row are written through the module-level
    ``csvout`` writer; a summary line goes to stderr. Nothing is returned.
    """
    within_tol = ((df['Precursor m/z Error (ppm)'] < tol_ms1) &
                  (df['Precursor m/z Error (ppm)'] > -tol_ms1))
    # Copy so the column added below is written to our own frame rather than
    # to a slice of the caller's (pandas chained-assignment warning).
    df = df[within_tol].copy()

    ppmArray = []
    for _, row in df.iterrows():
        # SECURITY NOTE(review): eval() runs arbitrary code found in these
        # cells. It is only safe for trusted input; consider
        # ast.literal_eval if the cells are plain list literals.
        theoretical = eval(row['Theoretical Products'])
        matched = eval(row['Nearest Matches'])
        ppmArray.append([(obs - theo) * 1e6 / theo
                         for obs, theo in zip(matched, theoretical)])
    df['Nearest Matches (ppm)'] = ppmArray

    target_fits = _calc_parameters(df, decoy=False, tol_ms2=tol_ms2)
    decoy_fits = _calc_parameters(df, decoy=True, tol_ms2=tol_ms2)

    target_frac_mean, target_frac_std = target_fits[0]
    target_ms1_mean, target_ms1_std = target_fits[1]
    target_ms2_mean, target_ms2_std = target_fits[2]
    decoy_frac_mean, decoy_frac_std = decoy_fits[0]

    ms2_L_limit = target_ms2_mean - target_ms2_std * 2
    ms2_R_limit = target_ms2_mean + target_ms2_std * 2

    sys.stderr.write("MS1_SD:%.3f MS2_SD:%.3f\n" % (target_ms1_std, target_ms2_std))

    csvout.writerow(df.columns)
    for i, x in df.iterrows():
        # sub score S1: p-value of precursor mass error
        ms1Score = 2 * norm.sf(abs(x['Precursor m/z Error (ppm)'] - target_ms1_mean) / target_ms1_std)

        # sub score S2: counts of matched fragment peaks and complementary pairs.
        # A real list (not a lazy map object) so it can be counted and
        # indexed on Python 3 as well as Python 2.
        matching = [ms2_L_limit < p < ms2_R_limit
                    for p in x['Nearest Matches (ppm)']]
        matchingScore = matching.count(True)
        # Pairs start at even positions; stopping at len - 1 keeps an
        # odd-length list from indexing past its end.
        for b in range(0, len(matching) - 1, 2):
            if matching[b] and matching[b + 1]:
                matchingScore += 1

        # sub score S3: probability ratio of ion intensity fraction
        frac = numpy.log(x['Fraction of Intensity Matching'])
        frac_pdf_decoy = norm.pdf((frac - decoy_frac_mean) / decoy_frac_std)
        frac_pdf_target = norm.pdf((frac - target_frac_mean) / target_frac_std)
        fracScore = (frac_pdf_target - frac_pdf_decoy) / (frac_pdf_target + frac_pdf_decoy)

        # final PSM score
        if ms1Score >= 0.0001:
            x['Morpheus Score'] = matchingScore + fracScore + ms1Score
        else:
            x['Morpheus Score'] = 0
        csvout.writerow(x.values)
def test_sequentially_constrained():
    """Plot the ECDF of the first coordinate of draws from a constrained law.

    Draws 5000 samples under the constraint built from ``-I[:3]`` and offset
    ``-6``, maps the first coordinate through the Gaussian distribution
    truncated at 6, and plots its empirical CDF on [0, 1].
    """
    linear_part = -np.identity(10)[:3]
    offset = -6 * np.ones(3)
    con = constraints(linear_part, offset)
    draws = sample(con, 5000, temps=np.linspace(0, 200, 1001))

    grid = np.linspace(0, 1, 101)
    # Probability transform with respect to the upper tail beyond 6.
    tail_mass = ndist.sf(6)
    transformed = (ndist.cdf(draws[0]) - ndist.cdf(6)) / tail_mass
    ecdf = sm.distributions.ECDF(transformed)
    plt.plot(grid, ecdf(grid))
def slopes_z_stat(self):
    """Return ``(z statistic, two-sided p-value)`` for every slope.

    z is ``self.slopes`` (flattened to its length) divided by
    ``self.slopes_std_err``. The result is a list of tuples in coefficient
    order, computed once and stored in ``self._cache``.

    A real list is cached rather than ``dict.values()``: on Python 3 that
    is a view, which cannot be indexed.
    """
    if 'slopes_z_stat' not in self._cache:
        zStat = self.slopes.reshape(len(self.slopes),) / self.slopes_std_err
        self._cache['slopes_z_stat'] = [
            (z, norm.sf(abs(z)) * 2) for z in zStat
        ]
    return self._cache['slopes_z_stat']
def _dpln_pdf(x, alpha, beta, nu, tau2): A1 = np.exp(alpha * nu + alpha**2 * tau2/2) A2 = np.exp(-beta*nu + beta**2 * tau2/2) term1 = A1 * x**(-alpha-1) * \ norm.cdf((np.log(x)-nu-alpha * tau2)/np.sqrt(tau2)) term2 = A2*x**(beta-1) * \ norm.sf((np.log(x)-nu+beta*tau2)/np.sqrt(tau2)) return alpha*beta/(alpha+beta)*(term2+term1)
def z_score_combine(pvals, sigma):
    """Combine correlated p-values through their mean z-score.

    p-values equal to 1 are nudged to ``1 - 9e-16`` so their z-score stays
    finite. The mean z-score is divided by
    ``sqrt(L + 2 * sum(strict lower triangle of sigma)) / L``.

    Returns a dict with the combined p-value under 'p' and 'OK' set to True.
    """
    n_tests = len(pvals)
    probs = np.array(pvals, dtype=np.float64)
    probs[probs == 1] = 1.0 - 9e-16

    mean_z = np.mean(norm.isf(probs, loc=0, scale=1))
    off_diagonal_sum = np.tril(sigma, k=-1).sum()
    scale = 1.0 / n_tests * np.sqrt(n_tests + 2 * off_diagonal_sum)

    return {'p': norm.sf(mean_z / scale), 'OK': True}
def zscore_cluster(formula, methylations, covs, coef, robust=False):
    """Summarise a cluster with the correlation-corrected mean z-score.

    Takes the result of ``_combine_cluster``, replaces its 'p' entry by the
    combined p-value (mean z-score over
    ``sqrt(L + 2 * sum(strict lower triangle of corr)) / L``), averages 't'
    and 'coef', and removes the 'corr' entry before returning the dict.

    ``robust`` is accepted but not used here.
    """
    result = _combine_cluster(formula, methylations, covs, coef)

    n_probes = len(result["p"])
    mean_z = np.mean(norm.isf(result["p"]))
    lower_sum = np.tril(result["corr"], k=-1).sum()
    scale = 1.0 / n_probes * np.sqrt(n_probes + 2 * lower_sum)

    result["p"] = norm.sf(mean_z / scale)
    result["t"] = result["t"].mean()
    result["coef"] = result["coef"].mean()
    del result["corr"]
    return result
def inteFun(p):
    '''Function for integration

    Returns the power at proportion ``p`` weighted by the normal density of
    ``p`` with mean ``aprop`` and standard deviation ``sd``. ``ZbOfProp``,
    ``aprop`` and ``sd`` are taken from the enclosing scope.
    '''
    # norm.cdf(z) is the same quantity as 1 - norm.sf(z) but keeps its
    # accuracy when the power is tiny, where the subtraction cancels
    # almost all significant digits.
    power = norm.cdf(ZbOfProp(p))
    return power * norm.pdf(p, loc=aprop, scale=sd)
def p_label(bhattacharya, dist_truth, dist_measuring):
    """Build a two-line label: ``BCC`` score and mean two-sided p-value.

    ``BCC`` is ``1 - bhattacharya``. Each measured value is standardised by
    the mean and standard deviation of ``dist_truth``; the two-sided normal
    p-values are averaged.
    """
    centre = np.mean(dist_truth)
    spread = np.std(dist_truth)
    standardised = (dist_measuring - centre) / spread
    mean_p = np.mean(norm.sf(abs(standardised)) * 2)  # twosided

    first_line = "{:s}\n".format("BCC: {:.3g}".format(1 - bhattacharya))
    second_line = r"$<p_{\mathrm{i.i.d.}}>$" + ":{:.2g}".format(mean_p)
    return first_line + second_line
def z_stat(self):
    """Return ``(z statistic, two-sided p-value)`` for every coefficient.

    z is ``self.betas`` (flattened to its length) divided by the square root
    of the diagonal of ``self.vm``. The result is a list of tuples in
    coefficient order, computed once and stored in ``self._cache``.

    A real list is cached rather than ``dict.values()``: on Python 3 that
    is a view, which cannot be indexed.
    """
    if 'z_stat' not in self._cache:
        variance = self.vm.diagonal()
        zStat = self.betas.reshape(len(self.betas),) / np.sqrt(variance)
        self._cache['z_stat'] = [
            (z, norm.sf(abs(z)) * 2) for z in zStat
        ]
    return self._cache['z_stat']
def get_zI(I, ei, vi):
    """
    Standardized I

    Returns the absolute standardized statistic ``|I - ei| / sqrt(vi)``
    together with its two-sided p-value, as provided in the GeoDa family.
    """
    standardized = (I - ei) / np.sqrt(vi)
    magnitude = abs(standardized)
    two_sided_p = norm.sf(magnitude) * 2.
    return (magnitude, two_sided_p)
def testUnifSpaceParamsWithShiftAndStretch(self):
    """Weights of a 15-point grid on [-7, 7] match N(loc=1, scale=2) masses.

    Interior points must carry the probability of the unit-width cell
    centred on them; the two end points carry the whole remaining tail.
    """
    loc, scale = 1, 2
    pts, weights = gt.unif_spaced_param(15, -7.0, 7.0, 1, 2, False)
    for pt, weight in zip(pts, weights):
        if np.isclose(pt, -7.0):
            # Leftmost point: everything below -6.5.
            expected = norm.cdf(-6.5, loc=loc, scale=scale)
        elif np.isclose(pt, 7.0):
            # Rightmost point: everything above 6.5.
            expected = norm.sf(6.5, loc=loc, scale=scale)
        else:
            expected = (norm.cdf(pt+0.5, loc=loc, scale=scale)
                        - norm.cdf(pt-0.5, loc=loc, scale=scale))
        self.assertTrue(np.isclose(weight, expected))
def ci_test_gauss(data_matrix, x, y, s, **kwargs):
    """Gaussian conditional-independence test of ``x`` and ``y`` given ``s``.

    Requires the keyword argument ``corr_matrix``. The z statistic comes
    from ``zstat`` using that matrix and the number of rows of
    ``data_matrix``; the two-sided normal p-value is returned.
    """
    assert 'corr_matrix' in kwargs
    corr = kwargs['corr_matrix']
    n_samples = data_matrix.shape[0]
    statistic = zstat(x, y, list(s), corr, n_samples)
    return 2.0 * norm.sf(np.absolute(statistic))
def test_bimodality(x, bins=30, kde=True, plot=False):
    """Test for bimodal distribution.

    A density estimate of ``x`` (Gaussian KDE, or a histogram when ``kde``
    is False) is evaluated on ``bins`` grid points. The grid is split at the
    lowest density among four points around the centre; the statistic
    compares the peak of the upper part with the mean of the upper part.

    Returns ``(t_stat, p_val, means)`` where ``means`` holds the locations
    of the lower and upper peaks. With ``plot`` the density, both peak
    locations and the upper-part mean are drawn.
    """
    from scipy.stats import gaussian_kde, norm

    lb, ub = np.min(x), np.percentile(x, 99.9)
    if ub <= lb:
        grid_end = ub
    else:
        grid_end = np.max(x)
    grid = np.linspace(lb, grid_end, bins)

    if kde:
        density = gaussian_kde(x)(grid)
    else:
        density = np.histogram(x, bins=grid, density=True)[0]

    # Split at the lowest density among four points around the centre.
    split = int(bins / 2) - 2
    split += np.argmin(density[split : split + 4])

    lower_density, upper_density = density[:split], density[split:]
    peak_0 = lower_density.argmax()
    peak_1 = upper_density.argmax()

    kde_peak = upper_density[peak_1]
    kde_mid = upper_density.mean()

    # The denominator is clipped from below at 1.
    denominator = np.clip(np.std(density) / np.sqrt(bins), 1, None)
    t_stat = (kde_peak - kde_mid) / denominator
    p_val = norm.sf(t_stat)

    def _peak_location(segment, peak):
        # Midpoint between the peak grid point and its right neighbour
        # (or the peak itself when it is the last point of the segment).
        neighbour = min(peak + 1, len(segment) - 1)
        return (segment[peak] + segment[neighbour]) / 2

    means = [
        _peak_location(grid[:split], peak_0),
        _peak_location(grid[split:], peak_1),
    ]

    if plot:
        color = "grey"
        if kde:
            pl.plot(grid, density, color=color)
            pl.fill_between(grid, 0, density, alpha=0.4, color=color)
        else:
            pl.hist(x, bins=grid, alpha=0.4, density=True, color=color)
        pl.axvline(means[0], color=color)
        pl.axvline(means[1], color=color)
        pl.axhline(kde_mid, alpha=0.2, linestyle="--", color=color)
        pl.show()

    return t_stat, p_val, means  # ~ t_test (reject unimodality if t_stat > 3)
def spiegelhalter(y_true, y_score):
    """Two-sided p-value of Spiegelhalter's z-test for calibration.

    Parameters
    ----------
    y_true : array-like of observed binary outcomes (ndarray, pandas object
        or plain sequence; non-ndarray input is flattened)
    y_score : array-like of predicted probabilities

    Returns
    -------
    The p-value (a higher value means a better goodness of fit), or 0 if
    the computation raises. This best-effort fallback is kept from the
    original.
    """
    import numpy as np
    from scipy.stats import norm

    try:
        if not isinstance(y_true, np.ndarray):
            # np.asarray handles pandas objects and plain lists alike;
            # lists used to fail on ``.values`` and silently return 0.
            y_true = np.asarray(y_true).ravel()
        if isinstance(y_score, (list, tuple)):
            # ``2 * list`` would repeat the list instead of scaling it.
            y_score = np.asarray(y_score, dtype=float)

        top = np.sum((y_true - y_score) * (1 - 2 * y_score))
        bot = np.sum((1 - 2 * y_score)**2 * y_score * (1 - y_score))
        sh = top / np.sqrt(bot)

        # https://en.wikipedia.org/wiki/Z-test
        # Two-tailed test
        # Re: p-value, higher the better Goodness-of-Fit
        return norm.sf(np.abs(sh)) * 2
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return 0
def age_stratification(sextable, sex_assign):
    """Print a one-sided two-proportion z-test for male bias in older samples.

    Rows of ``sextable`` with a non-None age (column 8) are sorted by age
    and split at ``mid_age``, the integer age of the row at position
    ``1 + len // 2`` of the sorted list. Sexes are read from ``sex_assign``
    (0 = male, 1 = female, anything else ignored). The function prints the
    group counts and the z statistic with its p-value; it returns None.

    Raises IndexError when fewer than three rows have an age, and
    ZeroDivisionError when a group is empty or the pooled proportion is
    0 or 1.
    """
    # Drop unknown ages *before* sorting: None cannot be ordered against
    # numbers on Python 3, and ``filter`` would return a lazy iterator.
    by_age = sorted((row for row in sextable if row[8] is not None),
                    key=operator.itemgetter(8))
    # Floor division keeps the index an integer on Python 3.
    mid_age = int(by_age[1 + len(by_age) // 2][8])

    young_m = young_f = old_m = old_f = 0
    # NOTE(review): ``i`` indexes the filtered, age-sorted rows, so this
    # assumes ``sex_assign`` is given in that same order. Confirm against
    # the caller.
    for i, row in enumerate(by_age):
        sex = sex_assign[i]
        if row[8] < mid_age:
            if sex == 0:
                young_m += 1
            elif sex == 1:
                young_f += 1
        else:
            if sex == 0:
                old_m += 1
            elif sex == 1:
                old_f += 1

    # two proportion z-test, normal approx., one sided
    n1 = young_m + young_f
    n2 = old_m + old_f
    p1 = float(young_m) / n1
    p2 = float(old_m) / n2
    # Pooled proportion = all males / all samples. The previous expression
    # weighted each group's proportion by the *other* group's size.
    phat = float(young_m + old_m) / (n1 + n2)
    z = (p2 - p1) / math.sqrt(phat * (1.0 - phat) * (1.0 / n1 + 1.0 / n2))
    pval = 1.0 if z < 0 else norm.sf(z)

    print(
        "{} ({} m, {} f) are younger than {} yBP\n{} ({} m, {} f) are the same age or older than {} yBP"
        .format(n1, young_m, young_f, mid_age, n2, old_m, old_f, mid_age))
    print(
        "Do older samples have a greater male bias? z={:.2f}, p={:.2f}".format(
            z, pval))
def scatter_plot(args, ps_tc_results, mpbs_name_list, conditions):
    """Score differential TF activity, annotate the results and plot them.

    For every motif the activity score of each condition is the sum of the
    first two entries of its result for that condition; the reported score
    is condition 2 minus condition 1. Scores are z-scored across motifs and
    turned into two-sided normal p-values. ``[score, z, p]`` is appended to
    each entry of ``ps_tc_results`` (modified in place and returned).

    A jittered scatter plot is saved as ``<output_prefix>_statistics.pdf``
    in ``args.output_location``; motifs with p below ``args.fdr`` are drawn
    in red and labelled.
    """
    tf_activity_score1 = np.zeros(len(mpbs_name_list))
    tf_activity_score2 = np.zeros(len(mpbs_name_list))

    for i, mpbs_name in enumerate(mpbs_name_list):
        tf_activity_score1[i] = float(ps_tc_results[i][0][0]) + float(
            ps_tc_results[i][1][0])
        tf_activity_score2[i] = float(ps_tc_results[i][0][1]) + float(
            ps_tc_results[i][1][1])

    tf_activity_score = np.subtract(tf_activity_score2, tf_activity_score1)
    z_score = zscore(tf_activity_score)
    p_values = norm.sf(abs(z_score)) * 2

    # add TF activity score, z score and p values to the result dictionary
    for i, mpbs_name in enumerate(mpbs_name_list):
        ps_tc_results[i].append(
            [tf_activity_score[i], z_score[i], p_values[i]])

    # plot TF activity score; the x position is random jitter only
    x_axis = np.random.uniform(low=-0.1, high=0.1, size=len(p_values))

    fig, ax = plt.subplots(figsize=(10, 12))
    for i, mpbs_name in enumerate(mpbs_name_list):
        if p_values[i] < args.fdr:
            ax.scatter(x_axis[i], tf_activity_score[i], c="red")
            ax.annotate(mpbs_name, (x_axis[i], tf_activity_score[i]),
                        alpha=0.6)
        else:
            ax.scatter(x_axis[i], tf_activity_score[i], c="black", alpha=0.6)

    ax.margins(0.05)
    ax.set_xticks([])

    # The backslash is escaped explicitly: "\l" is an invalid escape
    # sequence in a normal string literal. The runtime text is unchanged.
    ax.set_ylabel("Activity Score \n {} $\\longleftrightarrow$ {}".format(
        conditions[0], conditions[1]), rotation=90, fontsize=20)

    figure_name = os.path.join(args.output_location,
                               "{}_statistics.pdf".format(args.output_prefix))
    fig.savefig(figure_name, format="pdf", dpi=300)

    return ps_tc_results
def generate_intervals(self):
    """Least-squares intervals and p-values on the second data split.

    Fits ordinary least squares of ``self.Y2`` on the ``self.active_set``
    columns of ``self.X2`` and returns
    ``(active_set, lower, upper, pval)``: normal-quantile confidence bounds
    at level ``self.confidence`` and two-sided p-values.

    Returns four empty lists when the active set is empty or has at least
    as many variables as there are rows.
    """
    design = self.X2[:, self.active_set]
    response = self.Y2
    n_active = len(self.active_set)

    if not (n_active > 0 and n_active < design.shape[0]):
        return [], [], [], []

    gram_inv = np.linalg.inv(design.T.dot(design))
    coef = gram_inv.dot(design.T.dot(response))
    residual = response - design.dot(coef)
    n_rows = design.shape[0]
    sigma_hat = np.sqrt((residual**2).sum() / (n_rows - n_active))

    alpha = 1 - self.confidence
    quantile = ndist.ppf(1 - alpha / 2)
    std_err = np.sqrt(sigma_hat**2 * np.diag(gram_inv))

    upper = coef + quantile * std_err
    lower = coef - quantile * std_err
    pval = 2 * ndist.sf(np.fabs(coef) / std_err)
    return self.active_set, lower, upper, pval
def calc_delong(preds1, preds2, stat, auc1=None, auc2=None):
    """Calculates the one-sided version of DeLong's test statistic.

    Args:
        preds1, preds2 (np.array)
            Vectors of continuous predicted labels. This function tests
            to what extent we can reject the hypothesis that `preds1`
            does not better predict the ground truth labels than `preds2`.
        stat (np.array): The ground truth binary class labels (boolean).
        auc1, auc2 (float, optional)
            Pre-computed AUCs can be given if possible which will save time.

    Returns:
        delong_val (float): upper-tail normal p-value of the standardised
            AUC difference.
    """
    def _placement(preds):
        # 1 where a positive outranks a negative, 0.5 for ties.
        pos, neg = preds[stat], preds[~stat]
        mat = np.greater.outer(pos, neg).astype(float)
        mat += 0.5 * np.equal.outer(pos, neg).astype(float)
        return mat

    strc1 = _placement(preds1)
    strc2 = _placement(preds2)
    if auc1 is None:
        auc1 = strc1.mean()
    if auc2 is None:
        auc2 = strc2.mean()

    mut_n, wt_n = strc1.shape
    aucs = [auc1, auc2]
    # Per-positive (axis=1) and per-negative (axis=0) mean placements.
    components = [
        (strc1.mean(axis=1), strc1.mean(axis=0)),
        (strc2.mean(axis=1), strc2.mean(axis=0)),
    ]

    cov_pos = np.empty((2, 2))
    cov_neg = np.empty((2, 2))
    for i in range(2):
        for j in range(2):
            cov_pos[i, j] = ((components[i][0] - aucs[i])
                             * (components[j][0] - aucs[j])).sum() / (mut_n - 1)
            cov_neg[i, j] = ((components[i][1] - aucs[i])
                             * (components[j][1] - aucs[j])).sum() / (wt_n - 1)

    smat = cov_pos / strc1.shape[0] + cov_neg / strc1.shape[1]
    z_scr = (auc1 - auc2) / np.sqrt(smat[0, 0] + smat[1, 1] - 2 * smat[1, 0])
    return norm.sf(z_scr)
def dailyfc_visual(files):
    """Save one functional-connectivity graph per recording file.

    For every file with at least 80 entries along the third axis of its LFP
    data, the ciCOH matrix is computed. Its p-values use a null
    distribution obtained by permuting the channel pair with the largest
    ciCOH, are FDR-corrected, and the significant ciCOH values are drawn and
    saved as ``<cond>_all_<date>.png`` in ``savefolder``.
    """
    for onefile in files:
        lfpdata, chnAreas, fs = lfp_extract([onefile])
        if lfpdata.shape[2] < 80:
            continue
        print(onefile)

        ciCOHs = calc_ciCOHs_rest(lfpdata)

        # permutation test: use the lfp data whose ciCOHs are the largest to get distribution
        [i, j] = np.unravel_index(np.argmax(ciCOHs), shape=ciCOHs.shape)
        strongest_a, strongest_b = lfpdata[i, :, :], lfpdata[j, :, :]
        _, mu, std = pval_permciCOH_rest(strongest_a, strongest_b,
                                         ciCOHs[i, j], shuffleN=1000)
        pvals = norm.sf(abs(ciCOHs), loc=mu, scale=std) * 2

        # multiple comparison correction, get weights
        reject, pval_corr = fdr_correction(pvals, alpha=0.05, method='indep')
        [rows, cols] = np.where(reject == True)
        weight = np.zeros(ciCOHs.shape)
        if len(rows) > 0:
            weight[rows, cols] = ciCOHs[rows, cols]

        # visual and save: the file name carries the date and the condition
        filename = os.path.basename(onefile)
        datestr = re.search('[0-9]{8}', filename).group()
        cond = re.search('_[a-z]*_[0-9]{8}', filename).group()[1:-9]
        save_prefix = 'all'
        graph_name = '_'.join([cond, save_prefix, datestr]) + '.png'
        saveFCGraph = os.path.join(savefolder, graph_name)

        channel_info = assign_coord2chnArea(area_coord_file=area_coord_file,
                                            chnAreas=chnAreas)
        weight_visual_save(weight,
                           chnInf=channel_info,
                           savefile=saveFCGraph,
                           texts=None,
                           threds_edge=None)
def fdr(zscores, q=.1, cV=1, invert_zscores=False, mask=None):
    """
    FDR threshold and corrected/adjusted p-values for an array of z-scores.

    Adapted from
    https://brainder.org/2011/09/05/fdr-corrected-fdr-adjusted-p-values/
    using a default value of cV

    Parameters
    ----------
    zscores : ndarray of z-scores; exact zeros are excluded
    q : allowed proportion of false positives
    cV : constant scaling the threshold line
    invert_zscores : if True, test the negated z-scores
    mask : optional boolean array selecting the entries to test; it is no
        longer modified in place

    Returns
    -------
    thr : largest p-value under the threshold line, or None
    zthr : z-score at that threshold (original sign), or None
    pvals : one-sided p-values of the tested entries, in ascending order
    thrline : threshold line evaluated at each rank
    pcor : FDR-corrected p-values (same ascending order)
    padj : FDR-adjusted, monotone p-values capped at 1 (same order)
    """
    if mask is None:
        mask = np.ones(zscores.shape, dtype=bool)

    inv = -1 if invert_zscores else 1
    zscores = inv * zscores
    # Build a new boolean mask instead of ``mask *= ...``, which altered
    # the caller's array and kept a non-bool dtype unusable for masking.
    mask = np.asarray(mask, dtype=bool) & (zscores != 0)
    zscores = zscores[mask]

    pvals = norm.sf(zscores)
    oidx = np.argsort(pvals)
    pvals = pvals[oidx]

    V = pvals.size
    idx = np.arange(1, V + 1)
    thrline = idx * q / (V * cV)

    select = pvals <= thrline
    if select.any():
        thr = np.max(pvals[select])
        zthr = zscores[oidx][select][-1] * inv
    else:
        thr = None
        zthr = None

    pcor = pvals * V * cV / idx
    # Running minimum from the largest rank down, starting from a cap of 1.
    padj = np.minimum.accumulate(np.minimum(pcor, 1)[::-1])[::-1]

    return thr, zthr, pvals, thrline, pcor, padj
def __soft_shuffle(self, aln, shuffle):
    """
    Soft shuffle an alignment and score the original against the shuffles.

    The consensus energy (mfe minus covariance score, from
    ``__rna_alifold``) of the query alignment is compared with the energies
    of ``shuffle`` column-shuffled versions of it. Each shuffled version is
    made from the original by a random number of column swaps, between
    ``int(0.1 * width)`` and that value plus
    ``int(0.45 * (width - int(0.1 * width)))``.

    Keyword arguments:
    aln -- query alignment that has to be shuffled (records with ``.seq``)
    shuffle -- determines how many different sequences are generated

    Returns the z-score of the original energy within all energies and its
    two-sided normal p-value.
    """
    sequences = [str(record.seq) for record in aln]
    mfe, covar, structure = self.__rna_alifold(sequences)
    rows = [list(sequence) for sequence in sequences]
    width = len(rows[0])
    min_swaps = int(width * 0.1)

    energies = [mfe - covar]
    for _ in range(0, shuffle):
        n_swaps = random.randint(min_swaps, min_swaps + int(
            (width - min_swaps) * 0.45))
        # Start every shuffle from the original alignment, column-major.
        columns = np.array(rows).T
        for _swap in range(0, n_swaps):
            first, second = random.sample(range(width), 2)
            # Fancy indexing copies the right-hand side, so this swaps.
            columns[[first, second]] = columns[[second, first]]
        shuffled = ["".join(row) for row in columns.T]
        mfe, covar, structure = self.__rna_alifold(shuffled)
        energies.append(mfe - covar)

    z = zscore(np.array(energies))[0]
    p_values = norm.sf(abs(z)) * 2
    return z, p_values
def score(self, X, nbhds, nn_matrix=None):
    """Signed information scores from neighbourhood rank-sum tests.

    Values of ``X`` are ranked per column and the ranks are summed over
    each neighbourhood through the adjacency matrix. The sums are converted
    to a U statistic, then to a two-sided normal p-value (optionally
    adjusted by ``self.corrector``), and finally to ``-log(p)``. The score
    is negated where the U statistic was below its expected value.

    ``nn_matrix`` is built with ``to_sparse_adjacency`` when not supplied.
    All neighbourhoods are taken to have the size of the first one.
    Returns a ``csr_matrix`` of scores.
    """
    k = len(nbhds[0])
    super().score(X, nbhds)  # handle multi-test factor determination

    if nn_matrix is None:
        nn_matrix = to_sparse_adjacency(nbhds, n_cells=X.shape[0])

    # Wilcoxon rank sum test: rank per column, then sum per neighbourhood.
    ranks = rankdata(X.todense(), axis=0)
    rank_sums = nn_matrix @ ranks

    n1 = k
    n2 = X.shape[0] - k
    sd = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
    expected_u = n1 * n2 / 2.0

    u_stat = rank_sums - ((n1 * (n1 + 1)) / 2.0)
    below_expected = (u_stat < expected_u)  # remember where it was negative
    larger_u = np.maximum(u_stat, n1 * n2 - u_stat)
    z_values = ((larger_u - expected_u) / sd)
    p_values = 2 * norm.sf(np.abs(z_values))

    if self.corrector is not None:
        p_values = self.corrector.correct(p_values)

    info = -1 * np.log(p_values)  # convert to info scores
    info[below_expected] *= -1
    return (csr_matrix(info))
def test_multi_cluster_stats():
    """cluster_stats must find both planted clusters in random noise."""
    shape = (9, 10, 11)
    data = np.random.randn(*shape)
    # p-value threshold placed one unit above the largest noise value.
    threshold = norm.sf(data.max() + 1)
    # Plant a 2x2x2 block first, then a single voxel above that block.
    data[2:4, 5:7, 6:8] = np.maximum(10, data.max() + 2)
    data[6:7, 8:9, 9:10] = np.maximum(11, data.max() + 1)

    affine = np.eye(4)
    stat_img = nib.Nifti1Image(data, affine)
    mask_img = nib.Nifti1Image(np.ones(shape), affine)

    # test 1
    clusters, _ = cluster_stats(stat_img, mask_img, threshold,
                                height_control='fpr', cluster_th=0)
    assert_true(len(clusters) == 2)
    single_voxel = clusters[1]
    assert_true(single_voxel['size'] == 1)
    assert_array_almost_equal(single_voxel['z_score'], 11)
    assert_array_almost_equal(single_voxel['maxima'], np.array([[6, 8, 9]]))
def percentile_from_sigma(sigma, lower):
    """
    Convert a limit in standard deviations into a percentile.

    The upper percentile is the standard-normal CDF at ``sigma`` (e.g.
    sigma == 2 gives about 0.977); the lower percentile is the matching
    survival function value (about 0.023).

    Arguments:
        sigma {float} -- Number of standard deviations
        lower {bool} -- If lower == True returns the lower percentile, if
            False the upper percentile will be returned

    Returns:
        float -- The percentile as a value between 0 and 1
    """
    return norm.sf(sigma) if lower else norm.cdf(sigma)
def gen_correlated(sigma, n, observed=None):
    """
    generate autocorrelated data according to the matrix
    sigma. if `observed` is None, then data will be sampled from
    the uniform distribution. Otherwise, it will be sampled
    from `observed`, which is then *all* observed p-values.

    Yields ``n`` rows, each an array with one value per row of ``sigma``.
    The values are the sampled p-values turned into normal quantiles
    (``qnorm``), multiplied by the Cholesky factor of ``sigma`` (``chol``)
    and mapped back through the normal CDF.
    """
    C = np.asarray(chol(sigma))
    width = sigma.shape[0]
    if observed is None:
        X = np.random.uniform(0, 1, size=(n, width))
    else:
        assert n * width < observed.shape[0]
        # randint's upper bound is exclusive. This draws the same indices
        # as the deprecated random_integers(0, len(observed) - 1, ...).
        idxs = np.random.randint(0, len(observed), size=width * n)
        X = observed[idxs].reshape((n, width))

    Q = np.asarray(qnorm(X))
    # norm.cdf is the same quantity as 1 - norm.sf, without the subtraction.
    for row in norm.cdf(np.dot(Q, C)):
        yield row
def pval_perm_dynciCOH_SKT(dynciCOH, lfptrials):
    """
    p-values for dynamic ciCOHs from a permutation-based null distribution

    The null mean and standard deviation are estimated with
    ``permdist_dynciCOH_SKT`` (100 shuffles) on the channel pair holding
    the largest dynamic ciCOH; every ciCOH value is then given a two-sided
    normal p-value under that distribution.

    Args:
        dynciCOH: dynamic ciCOHs [nchns * nchns * ntemp]
        lfptrials: the lfp trial data used for calculating the dynciCOH

    Return:
        pvals: p-value for each value in dynciCOH, shape = dynciCOH.shape
    """
    # Channel pair (and time index, unused) of the largest ciCOH.
    chn_a, chn_b, _time_idx = np.unravel_index(np.argmax(dynciCOH),
                                               shape=dynciCOH.shape)
    trials_a = lfptrials[chn_a, :, :]
    trials_b = lfptrials[chn_b, :, :]

    mu, std = permdist_dynciCOH_SKT(trials_a, trials_b, shuffleN=100)
    return norm.sf(abs(dynciCOH), loc=mu, scale=std) * 2
def plot_1b(eb_n0_dB: array):
    """
    Plot capacity vs Eb/N0 for 1 bit hard quantization.

    The crossover probability is the normal tail at ``sqrt(2 * Eb/N0)`` and
    the capacity is one minus the binary entropy of that probability.
    Returns the matplotlib axes holding the curve.
    """
    linear_ratio = 10**(eb_n0_dB / 10)
    crossover = norm.sf(sqrt(2 * linear_ratio))
    binary_entropy = (-crossover * log2(crossover)
                      - (1 - crossover) * log2(1 - crossover))
    capacity = 1 - binary_entropy

    ax = plt.subplot(111)
    ax.plot(eb_n0_dB, capacity, label="1 bit")
    plt.ticklabel_format(style='plain', axis='x', scilimits=(0, 0))
    ax.set_xlim(min(eb_n0_dB), max(eb_n0_dB))
    ax.set_xlabel("Eb/N0 [dB]")
    ax.set_ylabel("Channel capacity [bits per channel use]")
    ax.grid()
    return ax
def bh_graph(n=5000, alpha=0.1):
    """Plot the mean and spread of a rejection ratio against rho.

    For each rho in 0, 0.05, ..., 1 the ratio is computed over ``n``
    simulated data sets: the number of rejected indices below ``m0`` divided
    by the number of rejections (at least 1), using ``fdr_bh`` at level
    ``alpha`` on upper-tail normal p-values.
    """
    rhos = np.arange(0, 1.05, 0.05)
    means = np.zeros(len(rhos))
    stds = np.zeros(len(rhos))

    for position, rho in enumerate(rhos):
        print(rho)
        ratios = np.zeros(n)
        for trial in range(n):
            sample = generate_data(rho=rho)
            rejected = fdr_bh(norm.sf(sample), alpha)
            below_m0 = rejected[rejected < m0]
            ratios[trial] = len(below_m0) / max(len(rejected), 1)
        means[position] = np.mean(ratios)
        stds[position] = np.std(ratios)

    plt.errorbar(rhos, means, stds, linestyle='None', marker='^')
    plt.xticks(rhos)
    plt.title("mean and standard deviation as a function of rho")
    plt.show()
def test_BH_procedure():
    """Step-up selection with BH cutoffs must agree with BHfilter.

    The cutoffs come from a ``stepup.BH`` instance built on 100 standard
    normal draws. On 50 fresh draws with the first 20 coordinates shifted by
    3, the indices selected by ``BHfilter`` on two-sided p-values (q = 0.2)
    must equal those selected by ``stepup_selection``.
    """
    def BH_cutoff():
        noise = np.random.standard_normal(100)
        procedure = stepup.BH(noise, np.identity(100), 1.)
        return procedure.stepup_Z / np.sqrt(2)

    BH_cutoffs = BH_cutoff()

    for _ in range(50):
        Z = np.random.standard_normal(100)
        Z[:20] += 3

        from_pvalues = sorted(BHfilter(2 * ndist.sf(np.fabs(Z)), q=0.2))
        from_stepup = sorted(stepup_selection(Z, BH_cutoffs)[1])
        np.testing.assert_allclose(from_pvalues, from_stepup)
def cpt_ppm_a_norm(mean, variance, alpha=0.):
    """
    Compute a Posterior Probability Map (fixed alpha) by assuming a Gaussian
    distribution.

    Parameters
    ----------
    mean : array_like
        mean value(s) of the Gaussian distribution(s)
    variance : array_like
        variance(s) of the Gaussian distribution(s)
    alpha : array_like, optional
        quantile value(s) (default=0)

    Returns
    -------
    ppm : array_like
        Posterior Probability Map evaluated at alpha, i.e. the upper-tail
        probability P(X > alpha) for X ~ N(mean, variance)
    """
    # np.sqrt accepts any array_like (scalars, lists, tuples, arrays);
    # `variance ** .5` raised TypeError for plain Python sequences.
    return norm.sf(alpha, loc=mean, scale=np.sqrt(variance))
def test_independent_estimator(n=100, n1=50, q=0.2, signal=3, p=100):
    """
    Check stepup.BH, run on the full-sample mean with a data-splitting
    perturbation, against quantities computed directly from the data.

    Verified in order:
      * marginal_targets(selected) reproduces the hand-built target,
        its covariance and the cross-covariance;
      * the 'unbiased' column of selective_MLE equals the mean of the
        held-out rows Z[n1:] on the selected coordinates;
      * the selected set equals what BHfilter selects from the two-sided
        p-values of the first-half mean.

    Parameters: n rows, of which the first n1 are used for selection; p
    columns, the first 10 carrying a shift of signal / sqrt(n); q is the
    level passed to both BH procedures.
    """
    def relative_error(actual, expected):
        return np.linalg.norm(actual - expected) / np.linalg.norm(expected)

    Z = np.random.standard_normal((n, p))
    Z[:, :10] += signal / np.sqrt(n)
    Z1 = Z[:n1]

    Zbar = np.mean(Z, 0)
    Zbar1 = np.mean(Z1, 0)
    perturb = Zbar1 - Zbar
    frac = n1 * 1. / n

    BH_select = stepup.BH(Zbar,
                          np.identity(p) / n,
                          np.sqrt((1 - frac) / (n * frac)),
                          q=q)
    selected = BH_select.fit(perturb=perturb)

    observed_target = Zbar[selected]
    cov_target = np.identity(selected.sum()) / n
    cross_cov = -np.identity(p)[selected] / n

    (observed_target1,
     cov_target1,
     cross_cov1,
     _) = BH_select.marginal_targets(selected)

    assert relative_error(observed_target1, observed_target) < 1.e-7
    assert relative_error(cov_target1, cov_target) < 1.e-7
    assert relative_error(cross_cov1, cross_cov) < 1.e-7

    result = BH_select.selective_MLE(observed_target, cov_target, cross_cov)[0]
    # The original rebound `Z` to result['Zvalue'] here, which clobbered the
    # data matrix needed for the hold-out mean below. The Z-values are not
    # used by this test, so they are no longer fetched.
    ind_unbiased_estimator = result['unbiased']
    Zbar2 = Z[n1:].mean(0)[selected]

    assert relative_error(ind_unbiased_estimator, Zbar2) < 1.e-6

    # Same level q for both procedures, so the comparison also holds for a
    # non-default q.
    np.testing.assert_allclose(
        sorted(np.nonzero(selected)[0]),
        sorted(BHfilter(2 * ndist.sf(np.fabs(np.sqrt(n1) * Zbar1)), q=q)))
def computePvalueProportion(att_name, att_value, current_file, top_K, round_default=2):
    """
    Compute p-value using Proportion oracle, i.e., z-test method of 4.1.3 in
    "A survey on measuring indirect discrimination in machine learning".
    Intended for a binary attribute only.

    Attributes:
        att_name: sensitive attribute name
        att_value: value of protected group of above attribute
        current_file: file name that stored the data (with out ".csv" suffix);
            "<current_file>_weightsum.csv" is the file actually read
        top_K: threshold to decide the positive outcome. Ranked inside top_K
            is positive outcome. Otherwise is negative outcome.
        round_default: threshold of round function for the returned p-value
    Return:
        rounded p-value (upper tail of the z statistic
        (rate of the other group - rate of the protected group) / SE)
    """
    ranking = pd.read_csv(current_file + "_weightsum.csv")
    top_rows = ranking[0:top_K]

    # Group sizes over the whole file and inside the first top_K rows.
    n_protected = int((ranking[att_name] == att_value).sum())
    n_other = len(ranking) - n_protected
    n_protected_top = int((top_rows[att_name] == att_value).sum())
    n_other_top = top_K - n_protected_top

    # Share of each group that lands in the top_K rows.
    rate_protected = n_protected_top / n_protected
    rate_other = n_other_top / n_other

    std_error = sqrt((rate_protected * (1 - rate_protected) / n_protected)
                     + (rate_other * (1 - rate_other) / n_other))
    z_stat = (rate_other - rate_protected) / std_error
    return round(norm.sf(z_stat), round_default)
def fitted_endog(self):
    """
    E(y|x, cond): non-linear fitted endog variables (expectations of the
    censored outcome).

    The branch is chosen from the smallest and largest code found in
    self.model.cens:
        min -1, max 0 : left bound only       (y > left value)
        min 0,  max 1 : right bound only      (y < right value)
        min -1, max 1 : left and right bounds (left value < y < right value)
    For any other combination a warning is issued and None is returned.

    With z_b = (Xb - b) / sigma, Phi the normal cdf and phi the normal pdf:
        left only  : l + (Xb - l) * Phi(z_l) + sigma * phi(z_l)
        right only : r + (Xb - r) * (1 - Phi(z_r)) - sigma * phi(z_r)
        both       : l + (Xb - l) * Phi(z_l) - (Xb - r) * Phi(z_r)
                       + sigma * (phi(z_l) - phi(z_r))

    sigma is exp(self.params[-1]) and Xb is self.fittedvalues.
    """
    sigma = np.exp(self.params[-1])
    Xb = self.fittedvalues
    _l = self.model.left
    _r = self.model.right

    cens_codes = self.model.cens.unique()
    check_left = cens_codes.min()
    check_right = cens_codes.max()

    def scaled_pdf(bound):
        # sigma * phi((Xb - bound) / sigma). scipy's
        # norm.pdf(x, loc=bound, scale=sigma) already divides by sigma, so
        # multiplying that form by sigma would drop the sigma factor.
        return sigma * norm.pdf((Xb - bound) / sigma)

    if check_left == -1 and check_right == 0:
        first_term = (Xb - _l) * norm.cdf(Xb, loc=_l, scale=sigma)
        return _l + first_term + scaled_pdf(_l)

    if check_left == 0 and check_right == 1:
        first_term = (Xb - _r) * norm.sf(Xb, loc=_r, scale=sigma)
        return _r + first_term - scaled_pdf(_r)

    if check_left == -1 and check_right == 1:
        first_term = (Xb - _l) * norm.cdf(Xb, loc=_l, scale=sigma)
        second_term = (Xb - _r) * norm.cdf(Xb, loc=_r, scale=sigma)
        third_term = scaled_pdf(_l) - scaled_pdf(_r)
        return _l + first_term - second_term + third_term

    warnings.warn(
        '\n\n**********************************************************************\n\n'
        + 'Equivalent to fitted_endog of uncensored Maximum Likelihood Estimation\n\n'
        + '**********************************************************************\n'
    )
    # No censored observations matched: nothing is computed here.
    return None
def pval_permciCOH_rest(lfp1, lfp2, actciCOH, shuffleN=1000):
    """
    Permutation p-value for a single ciCOH value.

    Arg:
        lfp1, lfp2: ntemp * nsegs(ntrials)

        shuffleN: the total shuffle times

        actciCOH: an actual ciCOH value (positive or negative)

    Return:
        pval: the p-value based on the permutation test, i.e. the upper tail
            of |actciCOH| under the fitted normal distribution

        mu, std: the mu and std of the fitted normal distribution
    """
    fixed_lfp = lfp1.copy()
    shuffled_lfp = lfp2.copy()
    null_ciCOHs = np.zeros(shape=(shuffleN, ))

    for k in range(shuffleN):
        # np.random.shuffle permutes along the first axis, so bring the
        # segment axis to the front, shuffle, and swap the axes back.
        by_segment = np.transpose(shuffled_lfp, axes=(1, 0))
        np.random.shuffle(by_segment)
        shuffled_lfp = np.transpose(by_segment, axes=(1, 0))

        # Stack both signals along a new leading axis (2 * ntemp * nsegs).
        pair = np.stack((fixed_lfp, shuffled_lfp), axis=0)
        null_ciCOHs[k] = calc_ciCOHs_rest(pair)[0, 1]

    # Fit a normal distribution to the permuted values.
    mu, std = norm.fit(null_ciCOHs)
    return norm.sf(abs(actciCOH), loc=mu, scale=std), mu, std
def __estimate_m(self, workload_pred, workload_std=0.0):
    """
    Return the first m in 1..self.mst_model.m_max that is judged sufficient
    for the predicted workload, or m_max if none is.

    For each m, delta = self.mst_model.predict(m) - workload_pred.
    The variance is the sum of self.mst_model.std**2 (when
    conf['mst_uncertainty_aware']) and workload_std**2 (when
    conf['forecast_uncertainty_aware']).

    * variance > 0: m is accepted when P(N(delta, variance) > 0) is at
      least conf['rho'].
    * otherwise: m is accepted when delta >= 0.
    """
    model = self.mst_model
    for m in range(1, model.m_max+1):
        delta = model.predict(m) - workload_pred

        variance = 0.0
        if self.conf['mst_uncertainty_aware']:
            variance += model.std**2
        if self.conf['forecast_uncertainty_aware']:
            variance += workload_std**2

        if variance > 0:
            # survival function at 0: probability that the margin is positive
            margin_prob = norm.sf(x=0, loc=delta, scale=np.sqrt(variance))
            if margin_prob >= self.conf['rho']:
                break
        elif delta >= 0:
            # no uncertainty considered: a non-negative margin is enough
            break
    return m
def sample_path_u(prob, steps):
    """
    Simulate one random walk and return the associated probability path.

    The walk starts at 0 and takes standard-normal increments. The
    threshold t is the upper prob/2 quantile of N(0, steps). While the walk
    is below t, each step appends
        min(2 * P(N(position, remaining steps) > t), 1)
    where on the final step the value is 0 if the walk is still below t and
    1 otherwise. Once the walk is at or above t it stops moving and 1 is
    appended for every remaining step.

    Returns a list of length steps + 1 whose first entry is `prob`.
    """
    t = norm.isf(prob / 2, loc=0, scale=np.sqrt(steps))
    path = [0]
    prob_path = [prob]
    for k in range(steps):
        if not path[-1] < t:
            # threshold already reached: the walk is frozen
            prob_path.append(1)
            continue

        # .item() pulls the single draw out as a Python float; float() on a
        # size-1 array is deprecated since NumPy 1.25.
        path.append(path[-1] + norm.rvs(loc=0, scale=1, size=1).item())

        steps_left = steps - k - 1
        if steps_left > 0:
            tail = 2 * norm.sf(t, loc=path[-1], scale=np.sqrt(steps_left))
            prob_path.append(float(np.minimum(tail, 1)))
        else:
            prob_path.append(0 if path[-1] < t else 1)
    return prob_path
def compute_z(self, gene2zscore, fdr=0.05):
    """
    Compute a z-score and two-sided p-value for every entry of
    self.pca_models, adjust the p-values, and cache everything on self.

    Sets self.res_z, self.res_p, self.res_adjp, self.adj_asso_components
    (indices whose adjusted p-value is below `fdr`) and self.is_computed.

    Returns (res_z, res_p, res_adjp).
    """
    # NOTE(review): multipletests is called without a method argument, so
    # the adjustment is whatever the library default is; `fdr` is only used
    # as the cutoff on the adjusted p-values - confirm this is intended.
    z_scores = (
        compute_gwas_z(pca_genes, pca_weight, sig_1k, gene2zscore, self.gene2var)
        for _, pca_genes, pca_weight, sig_1k in self.pca_models
    )
    z_and_p = [(z, norm.sf(abs(z)) * 2) for z in z_scores]

    res_z = np.array([z for z, _ in z_and_p])
    res_p = np.array([p for _, p in z_and_p])
    res_adjp = multi.multipletests(res_p)[1]

    self.res_z = res_z
    self.res_p = res_p
    self.res_adjp = res_adjp
    self.adj_asso_components = np.where(res_adjp < fdr)[0]
    self.is_computed = True
    return res_z, res_p, res_adjp
def HSIC_U_statistic_test(x, y, blocksize=50, nblocks=10):
    """
    Block-based HSIC test between x and y.

    The HSIC U-statistic is computed on `nblocks` consecutive blocks of
    `blocksize` samples and averaged. Its z-score against the null variance
    from null_samplesHsic is turned into an upper-tail normal p-value, which
    is printed with the label "perm-pv".

    Returns r = HSIC(kx, ky) / sqrt(HSIC(kx, kx) * HSIC(ky, ky)) computed on
    the full-sample kernels.
    """
    n = len(x)

    Btest = np.zeros(nblocks)
    for b in range(nblocks):
        start = b * blocksize
        block = slice(start, start + blocksize)
        Btest[b] = HSIC_U_statistic(kernelGausiano(x[block]),
                                    kernelGausiano(y[block]))
    block_average = sum(Btest) / float(nblocks)

    # Full-sample kernels: used for the null variance and for the ratio r.
    kx = kernelGausiano(x)
    ky = kernelGausiano(y)

    null_variance = blocksize**2 * np.var(null_samplesHsic(kx, ky, nblocks))
    z_score = np.sqrt(n * nblocks) * block_average / np.sqrt(null_variance)
    print("perm-pv", normaldist.sf(z_score))

    cross = HSIC_U_statistic(kx, ky)
    self_product = HSIC_U_statistic(kx, kx) * HSIC_U_statistic(ky, ky)
    # test normaldist.sf(ustatistic) < alpha?
    return cross / (np.sqrt(self_product))
def fitted_endog(self):
    """
    E(y|x, cond): non-linear fitted endog variables for the truncated model.

    The branch is chosen from self.model.left / self.model.right:
        finite left, right = +inf : left-truncated   (y > left value)
        left = -inf, finite right : right-truncated  (y < right value)
        both finite               : left value < y < right value
    When neither bound is finite a warning is issued and None is returned.

    With z_b = (Xb - b) / sigma, Phi the normal cdf and phi the normal pdf:
        left only  : Xb * Phi(z_l) + sigma * phi(z_l)
        right only : Xb * (1 - Phi(z_r)) - sigma * phi(z_r)
        both       : Xb * (Phi(z_l) - Phi(z_r)) + sigma * (phi(z_l) - phi(z_r))

    sigma is exp(self.params[-1]) and Xb is self.fittedvalues.

    NOTE(review): these expressions are not divided by the probability of
    the truncation region, so they look like E[y * 1{cond}] rather than
    E[y | cond] - confirm which is intended. The original author also
    flagged this attribute as being of doubtful usefulness.
    """
    sigma = np.exp(self.params[-1])
    Xb = self.fittedvalues
    _l = self.model.left
    _r = self.model.right

    left_is_finite = not np.isneginf(_l)
    right_is_finite = not np.isposinf(_r)

    def scaled_pdf(bound):
        # sigma * phi((Xb - bound) / sigma). scipy's
        # norm.pdf(x, loc=bound, scale=sigma) already divides by sigma, so
        # multiplying that form by sigma would drop the sigma factor.
        return sigma * norm.pdf((Xb - bound) / sigma)

    if left_is_finite and not right_is_finite:
        first_term = Xb * norm.cdf(Xb, loc=_l, scale=sigma)
        return first_term + scaled_pdf(_l)

    if right_is_finite and not left_is_finite:
        first_term = Xb * norm.sf(Xb, loc=_r, scale=sigma)
        return first_term - scaled_pdf(_r)

    if left_is_finite and right_is_finite:
        first_term = Xb * norm.cdf(Xb, loc=_l, scale=sigma)
        second_term = Xb * norm.cdf(Xb, loc=_r, scale=sigma)
        third_term = scaled_pdf(_l) - scaled_pdf(_r)
        return first_term - second_term + third_term

    warnings.warn(
        '\n\n**********************************************************************\n\n'
        + 'Equivalent to untruncated Maximum Likelihood Estimation\n\n'
        + '**********************************************************************\n'
    )
    # No finite truncation bound: nothing is computed here.
    return None
def test_bimodality(x, bins=30, kde=True, plot=False):
    """
    Heuristic bimodality check on the density of x.

    The density is evaluated on `bins` grid points between min(x) and the
    99th percentile of x, either with a Gaussian KDE (kde=True) or with a
    normalised histogram over that grid. The grid is split at the lowest
    density among four points around the centre; the statistic compares the
    peak of the upper part with the mean density of the upper part.

    Returns (t_stat, p_val, means): the statistic, its upper-tail normal
    p-value, and the locations of the lower and upper peak. With plot=True
    the density, both peak locations and the mean level are also drawn.
    """
    from scipy.stats import gaussian_kde, norm

    grid = np.linspace(np.min(x), np.percentile(x, 99), bins)
    if kde:
        kde_grid = gaussian_kde(x)(grid)
    else:
        kde_grid = np.histogram(x, bins=grid, density=True)[0]

    # Split point: lowest density among four grid points around the centre.
    window_start = int(bins / 2) - 2
    idx = window_start + np.argmin(kde_grid[window_start:window_start + 4])

    lower_density, upper_density = kde_grid[:idx], kde_grid[idx:]
    peak_0 = lower_density.argmax()
    peak_1 = upper_density.argmax()

    kde_peak = upper_density[peak_1]
    kde_mid = upper_density.mean()

    t_stat = (kde_peak - kde_mid) / (np.std(kde_grid) / np.sqrt(bins))
    p_val = norm.sf(t_stat)

    def peak_location(points, peak):
        # Midpoint between the peak's grid point and its right neighbour,
        # clamped to the last available point.
        neighbour = min(peak + 1, len(points) - 1)
        return (points[peak] + points[neighbour]) / 2

    means = [peak_location(grid[:idx], peak_0),
             peak_location(grid[idx:], peak_1)]

    if plot:
        color = 'grey'
        if kde:
            pl.plot(grid, kde_grid, color=color)
            pl.fill_between(grid, 0, kde_grid, alpha=.4, color=color)
        else:
            pl.hist(x, bins=grid, alpha=.4, density=True, color=color)
        for location in means:
            pl.axvline(location, color=color)
        pl.axhline(kde_mid, alpha=.2, linestyle='--', color=color)
        pl.show()

    # ~ t_test (reject unimodality if t_stat > 3)
    return t_stat, p_val, means
def usThemHelp(trait,parms):
    """
    Pair each published eQTL p-value for `trait` with the smallest stored
    p-value found near the same position, and write the pairs out.

    Rows of data/ail_paper-Trans.csv are kept when eqtl_tissue is 'hip' and
    target_gene_chrom equals the number in `trait` (text after the first
    three characters), then matched to the columns of the stored score
    matrices through traitData['trait'].

    For every remaining row whose eqtl_chrom is one of the SNP chromosomes
    (parms['snpChr'] without `trait`), the minimum is taken over SNPs of
    that chromosome whose 'Mbp' value lies within 1e6 of eqtl_pos_bp. When
    parms['wald'] is set, the stored values are first converted to
    two-sided normal p-values.

    The array of [eqtl_pvalue, minimum] pairs is written with DBWrite to
    name + 'usThem/' + trait; nothing is returned.
    """
    local = parms['local']
    name = parms['name']
    wald = parms['wald']
    snpChr = [chrom for chrom in parms['snpChr'] if chrom != trait]

    snpData = DBLocalRead(name+'process/snpData', parms)
    snpData = snpData[snpData['chr'] != trait]
    traitData = DBLocalRead(name+'process/traitData', parms)
    traitData = traitData[traitData['chr'] == trait]

    ail_paper = pd.read_csv(local+'data/ail_paper-Trans.csv', header=0)
    keep_rows = ((ail_paper['eqtl_tissue'] == 'hip')
                 & (ail_paper['target_gene_chrom'] == int(trait[3:])))
    keep_cols = ['eqtl_pos_bp', 'eqtl_chrom', 'eqtl_pvalue', 'target_gene']
    ail_paper = ail_paper[keep_rows][keep_cols].reset_index(drop=True)

    # 'loc' is the position of each gene among the traits of this
    # chromosome; it selects the matching columns of the score matrices.
    gene_columns = pd.DataFrame({'target_gene': traitData['trait'],
                                 'loc': np.arange(len(traitData))})
    ail_paper = ail_paper.merge(gene_columns, on='target_gene')
    traitList = ail_paper['loc'].values.flatten()

    pval = {}
    for snp in snpChr:
        snpChrom = int(snp[3:])
        print('loading pvals from snp '+snp+' trait '+trait)
        scores = DBRead(name+'score/p-'+snp+'-'+trait, parms)[:, traitList]
        pval[snpChrom] = 2*norm.sf(np.abs(scores)) if wald else scores

    ans = []
    for ind, eqtl in ail_paper.iterrows():
        eqtl_chrom = int(eqtl['eqtl_chrom'])
        chrom_label = 'chr'+str(eqtl_chrom)
        if chrom_label not in snpChr:
            continue

        positions = snpData[snpData['chr'] == chrom_label]['Mbp']
        centre = eqtl['eqtl_pos_bp']
        nearby = (positions < centre+1e6) & (positions > centre-1e6)

        # column `ind` of the score matrix belongs to this merged row
        best = np.min(pval[eqtl_chrom][nearby, ind].flatten())
        ans.append([eqtl['eqtl_pvalue'], best])

    DBWrite(np.array(ans), name+'usThem/'+trait, parms)
def play(myScore, theirScore, isLast):
    """
    Return the largest number of dice judged safe to roll.

    Starting from one die, the count is raised while the normal-approximation
    probability of the roll total exceeding 100 - myScore stays below the
    module-level `risk`. The total of k dice is modelled with mean
    dieMean * k and variance dieVariance * k. Whenever the z-score is above
    1.687 the probability is replaced by the fixed value 0.0000001.

    `theirScore` and `isLast` are accepted but not used.
    """
    remainingScore = 100 - myScore
    if verbose == True:
        print('remaining: ', remainingScore)

    maxSafe = 0
    while True:
        check = maxSafe + 1
        thisDeviation = numpy.sqrt(dieVariance * check)
        zScore = (remainingScore - dieMean * check) / thisDeviation

        overshootOdds = 0.0000001 if zScore > 1.687 else norm.sf(zScore)
        if verbose == True:
            print('checked: ', check, ' overshoot prob: ', overshootOdds, zScore)

        if not overshootOdds < risk:
            # `check` dice are too risky; the previous count stands
            return maxSafe
        maxSafe = check