def HybridNormalGPDPDF(xs, u, mu, sigma, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit semi-parametric PDF to.
        u: threshold to move from Gaussian PDF fit in center to GPD tail fitting.
        mu: mean of the data.
        sigma: standard deviation of the data.
        shape: GPD least squares estimated shape parameter.
        loc: GPD least squares estimated location parameter.
        scale: GPD least squares estimated scale parameter.
    Returns:
        an array that would result from xs.apply(semiparametric_fittedfunction)
        or F_n(xs) where F_n is the PDF fit.
    '''
    out = list()
    l = (mu - abs(u - mu))
    h = (mu + abs(u - mu))
    #print('u = %.10f,l = %.10f,h = %.10f'%(u,l,h))
    for x in xs:
        if x < l:
            out.append(norm.cdf(l, mu, sigma) *
                       genpareto.pdf(l - x, shape, loc=loc, scale=scale))
        elif x >= h:
            out.append((1 - norm.cdf(h, mu, sigma)) *
                       genpareto.pdf(x - h, shape, loc=loc, scale=scale))
        else:
            out.append(norm.pdf(x, mu, sigma))
    return out
def loss_function(abg):
    # #SIMPLE: Penalize negative a's, we want a positive, b/c for a<0, the algorithm is different:
    # if min(abg)<.0 or max(abg) > 5.:
    #     return 1e6
    error = .0
    for (phi_m, phi_idx) in zip(phis, xrange(N_phi)):
        Is = bins[phi_m]['Is']
        uniqueIs = bins[phi_m]['unique_Is']
        a, b, g = abg[0], abg[1], abg[2]
        movingThreshold = getMovingThreshold(a, g, phi_m)

        LHS_numerator = movingThreshold(uniqueIs[1:]) * sqrt(2.)
        LHS_denominator = b * sqrt(1 - exp(-2 * uniqueIs[1:]))
        LHS = 1 - norm.cdf(LHS_numerator / LHS_denominator)

        RHS = zeros_like(LHS)
        N = len(Is)
        for rhs_idx in xrange(1, len(uniqueIs)):
            t = uniqueIs[rhs_idx]
            lIs = Is[Is < t]
            taus = t - lIs
            numerator = (movingThreshold(t) - movingThreshold(lIs) * exp(-taus)) * sqrt(2.)
            denominator = b * sqrt(1. - exp(-2 * taus))
            RHS[rhs_idx - 1] = sum(1. - norm.cdf(numerator / denominator)) / N

        # error += sum(abs(LHS - RHS));
        error += sum((LHS - RHS)**2)
        # error += max(abs(LHS - RHS))
    return error
def fun(params):
    """
    Negative log-likelihood of z-scores.

    The function has three arguments, packed into a vector:

    mean : location parameter
    logscale : log of the scale parameter
    logitprop : logit of the proportion of true nulls

    The implementation follows section 4 from Efron 2008.
    """

    d, s, p = xform(params)

    # Mass within the central region
    central_mass = (norm.cdf((null_ub - d) / s) -
                    norm.cdf((null_lb - d) / s))

    # Probability that a Z-score is null and is in the central region
    cp = p * central_mass

    # Binomial term
    rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

    # Truncated Gaussian term for null Z-scores
    zv = (zscores0 - d) / s
    rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
    rval -= n_zs0 * np.log(central_mass)

    return -rval
def plotlmmse():
    nsample = 10**5
    snrlst = range(0, 19)

    plt.subplot(211)
    pe = []
    for snr in snrlst:
        coeff = lmmse_coeff(tap1, 41, snr)
        delta_square = 10**(-snr / 10.) * sum(coeff**2)
        pe.append(1 - norm.cdf(sqrt((1 - 0.7)**2 / delta_square)))
    plt.semilogy(snrlst, pe,
                 snrlst, equalizer(2, 41, snrlst, nsample, 'lmmse'), "-.")
    plt.legend(("Theoretical curve", "Simulated curve"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances for Channel 1")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')

    plt.subplot(212)
    pe = []
    for snr in snrlst:
        coeff = lmmse_coeff(tap2, 41, snr)
        delta_square = 10**(-snr / 10.) * sum(coeff**2)
        pe.append(1 - norm.cdf(sqrt((1 - 0.41)**2 / delta_square)))
    plt.semilogy(snrlst, pe,
                 snrlst, equalizer(2, 41, snrlst, nsample, 'lmmse'), "-.")
    plt.legend(("Theoretical curve", "Simulated curve"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances for Channel 2")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')
    plt.show()
def DemoivreLaplaceApprox(k, l, n, p):
    # return [np, binP], not accurate
    np = float(n) * p
    base = sqrt(np * (1 - p))
    lf = float(l)
    kf = float(k)
    return [np, norm.cdf((lf + 0.5 - np) / base) - norm.cdf((kf - 0.5 - np) / base)]
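# Illustrative check (not part of the original snippet): the De Moivre-Laplace
# normal approximation with continuity correction should be close to the exact
# binomial probability P(k <= X <= l) for X ~ Binomial(n, p). The values below are
# arbitrary, and scipy.stats.binom is assumed to be available; sqrt and norm must
# already be imported as the function above expects.
from scipy.stats import binom

n_trials, p_success, k_lo, l_hi = 100, 0.3, 25, 35
approx = DemoivreLaplaceApprox(k_lo, l_hi, n_trials, p_success)[1]
exact = binom.cdf(l_hi, n_trials, p_success) - binom.cdf(k_lo - 1, n_trials, p_success)
print("approx = %.5f, exact = %.5f" % (approx, exact))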
def BlackScholes(reTime, rf, S, K, sigma):
    # denominator of d1 must be sigma*sqrt(T); the original "/sigma*sqrt(reTime)"
    # divided by sigma and then multiplied by sqrt(reTime)
    d1 = (log(S / K) + (rf + sigma**2 / 2) * reTime) / (sigma * sqrt(reTime))
    d2 = d1 - sigma * sqrt(reTime)
    call_BS = (S * norm.cdf(d1, 0, 1) - K * exp(-rf * reTime) * norm.cdf(d2, 0, 1))
    put_BS = K * exp(-rf * reTime) * norm.cdf(-d2, 0, 1) - S * norm.cdf(-d1, 0, 1)
    delta = norm.cdf(d1, 0, 1)
    gamma = norm.pdf(d1, 0, 1) / (S * sigma * sqrt(reTime))
    vega = S * norm.pdf(d1) * np.sqrt(reTime)
    # note: only the volatility-decay term; the -rf*K*exp(-rf*reTime)*N(d2)
    # contribution of the full call theta is omitted here
    theta = -.5 * S * norm.pdf(d1) * sigma / np.sqrt(reTime)
    return {'call_BS': call_BS, 'put_BS': put_BS, 'delta': delta,
            'gamma': gamma, 'vega': vega, 'theta': theta}
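# Illustrative usage (not from the original source): price a one-year at-the-money
# option with spot 100, strike 100, 5% rate and 20% volatility. Assumes the imports
# the function relies on (log/exp/sqrt, numpy as np, scipy.stats norm) are in scope.
greeks = BlackScholes(reTime=1.0, rf=0.05, S=100.0, K=100.0, sigma=0.2)
print("call = %.4f, put = %.4f, delta = %.4f"
      % (greeks['call_BS'], greeks['put_BS'], greeks['delta']))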
def QQPlot(DataValues, Alpha_CI=0.95, DataLabel='Data', FigFile='QQPlot.png'):
    ### Based on: https://www.tjmahr.com/quantile-quantile-plots-from-scratch/
    ### Itself based on Fox book: Fox, J. (2015)
    ### Applied Regression Analysis and Generalized Linear Models.
    ### Sage Publications, Thousand Oaks, California.

    # Data analysis
    N = len(DataValues)
    X_Bar = np.mean(DataValues)
    S_X = np.std(DataValues, ddof=1)

    # Sort data to get the rank
    Data_Sorted = np.zeros(N)
    Data_Sorted += DataValues
    Data_Sorted.sort()

    # Compute quantiles
    EmpiricalQuantiles = np.arange(0.5, N + 0.5) / N
    TheoreticalQuantiles = norm.ppf(EmpiricalQuantiles, X_Bar, S_X)
    ZQuantiles = norm.ppf(EmpiricalQuantiles, 0, 1)

    # Compute data variance
    DataIQR = np.quantile(DataValues, 0.75) - np.quantile(DataValues, 0.25)
    NormalIQR = np.sum(np.abs(norm.ppf(np.array([0.25, 0.75]), 0, 1)))
    Variance = DataIQR / NormalIQR
    Z_Space = np.linspace(min(ZQuantiles), max(ZQuantiles), 100)
    Variance_Line = Z_Space * Variance + np.median(DataValues)

    # Compute alpha confidence interval (CI)
    Z_SE = np.sqrt(norm.cdf(Z_Space) * (1 - norm.cdf(Z_Space)) / N) / norm.pdf(Z_Space)
    Data_SE = Z_SE * Variance
    Z_CI_Quantile = norm.ppf(np.array([(1 - Alpha_CI) / 2]), 0, 1)

    # Create point in the data space
    Data_Space = np.linspace(min(TheoreticalQuantiles), max(TheoreticalQuantiles), 100)

    # QQPlot
    BorderSpace = max(0.05 * abs(Data_Sorted.min()), 0.05 * abs(Data_Sorted.max()))
    Y_Min = Data_Sorted.min() - BorderSpace
    Y_Max = Data_Sorted.max() + BorderSpace

    Figure, Axes = plt.subplots(1, 1, figsize=(5.5, 4.5), dpi=100)
    Axes.plot(TheoreticalQuantiles, Data_Sorted, linestyle='none', marker='o',
              mew=0.5, fillstyle='none', color=(0, 0, 0), label=DataLabel)
    Axes.plot(Data_Space, Variance_Line, linestyle='--', color=(1, 0, 0),
              label='Variance :' + str(format(np.round(Variance, 2), '.2f')))
    Axes.plot(Data_Space, Variance_Line + Z_CI_Quantile * Data_SE,
              linestyle='--', color=(0, 0, 1),
              label=str(int(100 * Alpha_CI)) + '% CI')
    Axes.plot(Data_Space, Variance_Line - Z_CI_Quantile * Data_SE,
              linestyle='--', color=(0, 0, 1))
    plt.xlabel('Theoretical quantiles (-)')
    plt.ylabel('Empirical quantiles (-)')
    plt.ylim([Y_Min, Y_Max])
    plt.legend(loc='upper center', ncol=3, bbox_to_anchor=(0.5, 1.15), prop={'size': 10})
    plt.savefig(FigFile)
    plt.show()
    plt.close(Figure)

    return Variance
def testSolveIWithDispersionMatchesMorrisk0Disp(self):
    """The solveIWithDispersionDimensional method matches results from
    Morris et al 2015.
    """
    time = np.linspace(0, 7, int(1.4e5))  # 1000 pts per second
    time_step = time[1] - time[0]
    num_time_pts = len(time)
    dE = 0
    freq = 0
    Ru = 0
    Cdl = 0
    Cdl1 = 0
    Cdl2 = 0
    Cdl3 = 0
    EStart = -0.2
    ERev = 0.5
    temp = 293
    nu = 0.1
    area = 1
    coverage = 1e-11
    k_0BinsUnscaled = np.linspace(-7, 7, 15)

    # Define wK in terms of bin widths
    leftBinEnds = np.empty(15)
    leftBinEnds[1:] = np.linspace(-6.5, 6.5, 14)
    leftBinEnds[0] = -np.inf
    rightBinEnds = np.empty(15)
    rightBinEnds[:-1] = np.linspace(-6.5, 6.5, 14)
    rightBinEnds[-1] = np.inf
    wK = norm.cdf(rightBinEnds, loc=0, scale=2) -\
         norm.cdf(leftBinEnds, loc=0, scale=2)

    expWidth = np.array([0.124, 0.128, 0.134, 0.141, 0.150, 0.161, 0.172,
                         0.185, 0.198, 0.211])

    self.assertEqual(np.sum(wK), 1)

    for m, ew in zip(range(1, 11), expWidth):
        k_0Vals = 0.1 * 2**(0.1 * m * k_0BinsUnscaled)
        bins = [(0, k_0, w) for k_0, w in zip(k_0Vals, wK)]
        I, amt = st.solve_reaction_disp_dim_bins(time_step, num_time_pts, dE,
                                                 freq, Ru, Cdl, Cdl1, Cdl2,
                                                 Cdl3, EStart, ERev, temp, nu,
                                                 area, coverage, bins)
        width = st.half_maximum_width(I, time, nu)
        err = abs(width - ew)
        # Rounding error + 2*step size + error in I
        # (estimated at 1*stepsize)
        self.assertLess(err, 7e-4)
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset.
    group_2 : array_like
        Second dataset.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened. If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis),
                      ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1 ** 2 + std_2 ** 2)
    return 1 - norm.cdf(W)
def main():
    gamma = np.arange(2, 6.01, 0.01)
    delta = np.arange(4, 12.01, 0.01)
    c = np.zeros((len(gamma), len(delta)))

    # Calculate the expectation for coverage area
    for i in range(len(gamma)):
        for j in range(len(delta)):
            b = 10 * gamma[i] * log10(e) / delta[j]
            c[i, j] = 0.5 + exp(1 / b**2) * norm.cdf(-2 / b)
    print np.max(c), np.min(c)

    # Plotting
    fig = plt.figure()
    ax = fig.gca(projection='3d')
    X, Y = np.meshgrid(delta, gamma)
    surf = ax.plot_surface(X, Y, c, rstride=5, cstride=5, cmap=cm.jet,
                           linewidth=1, antialiased=True)
    ax.set_zlim3d(np.min(c), np.max(c))
    ax.set_xlabel('delta')
    ax.set_ylabel('gamma')
    ax.set_zlabel('coverage')
    fig.colorbar(surf)
    plt.show()
def generate_ordinal():

    ## Regression coefficients
    beta = np.zeros(5, dtype=np.float64)
    beta[2] = 1
    beta[4] = -1

    rz = 0.5

    OUT = open("gee_ordinal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, 5))
        for j in range(5):
            x[:, j] += np.random.normal()
        pr = np.dot(x, beta)
        pr = np.array([1, 0, -0.5]) + pr[:, None]
        pr = 1 / (1 + np.exp(-pr))

        z = rz*np.random.normal() +\
            np.sqrt(1-rz**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:, None] > pr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()
def weightedUtest(g1, w1, g2, w2):
    """ Determines the confidence level of the assertion:
    'The values of g2 are higher than those of g1'.

    (adapted from the scipy.stats version)

    Twist: here the elements of each group have associated weights,
    corresponding to how often they are present
    (i.e. two identical entries with weight w are equivalent to
    one entry with weight 2w).

    Reference: "Studies in Continuous Black-box Optimization", Schaul, 2011
    [appendix B].

    TODO: make more efficient for large sets.
    """
    from scipy.stats.distributions import norm
    import numpy

    n1 = sum(w1)
    n2 = sum(w2)
    u1 = 0.
    for x1, wx1 in zip(g1, w1):
        for x2, wx2 in zip(g2, w2):
            if x1 == x2:
                u1 += 0.5 * wx1 * wx2
            elif x1 > x2:
                u1 += wx1 * wx2
    mu = n1 * n2 / 2.
    sigu = numpy.sqrt(n1 * n2 * (n1 + n2 + 1) / 12.)
    z = (u1 - mu) / sigu
    conf = norm.cdf(z)
    return conf
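# Illustrative usage (not from the original source): with unit weights the call
# reduces to an ordinary, unweighted Mann-Whitney-style comparison of the two
# groups. The sample values below are arbitrary.
g1, w1 = [1.0, 2.0, 3.0], [1, 1, 1]
g2, w2 = [2.5, 3.5, 4.0], [1, 1, 1]
print(weightedUtest(g1, w1, g2, w2))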
def Z_test(x1, x2, Alpha=0.95):

    ResultsTable = pd.DataFrame()

    # Compute standard deviation and number of observation
    S_x1 = x1.std(ddof=1)
    S_x2 = x2.std(ddof=1)
    N_x1 = len(x1)
    N_x2 = len(x2)

    # Test statistic and p value
    Z = (x1.mean() - x2.mean()) / np.sqrt(S_x1**2 / N_x1 + S_x2**2 / N_x2)
    p = 2 * (1 - norm.cdf(abs(Z)))

    # Rejection range
    MinValue = norm.ppf((1 - Alpha) / 2)
    MaxValue = norm.ppf(1 - (1 - Alpha) / 2)
    RejectionRange = np.array([[-np.inf, round(MinValue, 3)],
                               [round(MaxValue, 3), np.inf]])

    Results = {
        'Test statistic': round(Z, 3),
        'p value': round(p, 9),
        'Significance level (%)': Alpha * 100,
        'Rejection range': RejectionRange
    }
    ResultsTable = ResultsTable.append(Results, ignore_index=True)

    return ResultsTable
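# Illustrative usage (not from the original source): compare two simulated samples.
# Assumes numpy as np, pandas as pd and scipy.stats norm are imported as the function
# above requires; note that pd.DataFrame.append was removed in pandas 2.x, so this
# sketch targets the older pandas the snippet was written for.
x1 = pd.Series(np.random.normal(loc=0.0, scale=1.0, size=50))
x2 = pd.Series(np.random.normal(loc=0.5, scale=1.0, size=50))
print(Z_test(x1, x2, Alpha=0.95))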
def compare_medians_ms(group_1, group_2, axis=None):
    """Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : {sequence}
        First dataset.
    group_2 : {sequence}
        Second dataset.
    axis : {integer}
        Axis along which the medians are estimated. If None, the arrays are
        flattened.

    Returns
    -------
    A (p,) array of comparison values.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis),
                      ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
def skewtest(a, axis=-1):
    """Tests whether the skew is significantly different from a normal
    distribution.

    Axis can equal None (ravel array first), an integer (the axis over
    which to operate), or a sequence (operate over multiple axes).

    NOTE: This function is mostly copied from scipy.stats.stats, but corrects
    for a major bug: the pvalue returned by SciPy is not valid when the
    skewness is negative! The return values also are slightly different: the
    skew is actually returned while the z-score is not.

    Returns: skewness and 2-tail z-probability
    """
    a, axis = _chk_asarray(a, axis)
    if axis is None:
        a = ravel(a)
        axis = 0
    skewness = skew(a, axis)
    n = float(a.shape[axis])
    if n < 8:
        print "skewtest only valid for n>=8 ... continuing anyway, n=", n
    y = skewness * sqrt(((n + 1) * (n + 3)) / (6.0 * (n - 2)))
    beta2 = (3.0 * (n * n + 27 * n - 70) * (n + 1) * (n + 3)) / ((n - 2.0) * (n + 5) * (n + 7) * (n + 9))
    W2 = -1 + sqrt(2 * (beta2 - 1))
    delta = 1 / sqrt(log(sqrt(W2)))
    alpha = sqrt(2.0 / (W2 - 1))
    y = where(equal(y, 0), 1, y)
    Z = delta * log(y / alpha + sqrt((y / alpha)**2 + 1))
    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2 * (1 - norm.cdf(abs(Z)))
    assert pvalue >= 0.0 and pvalue <= 1.0
    return skewness, pvalue
def MannWhitneyUTest(x, y):

    Nx = len(x)
    Ny = len(y)

    XData = pd.DataFrame({'Values': x, 'Group': 'Control'}, index=range(len(x)))
    YData = pd.DataFrame({'Values': y, 'Group': 'Test'}, index=range(len(y)))

    Pool = XData.append(YData, ignore_index=True)
    Pool['Ranks'] = Pool['Values'].rank(method='average')

    R1 = Pool[Pool['Group'] == 'Control']['Ranks'].sum()

    U1 = R1 - (Nx * (Nx + 1)) / 2
    U2 = Nx * Ny - U1

    U = max(U1, U2)

    UMean = Nx * Ny / 2
    UStd = np.sqrt((Nx * Ny * (Nx + Ny + 1)) / 12)

    # Transform into the z space
    from scipy.stats.distributions import norm
    z = (U - UMean) / UStd
    p = 2 * (1 - norm.cdf(abs(z)))

    return U, p
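# Illustrative check (not from the original source): the normal-approximation p value
# above should roughly agree with scipy.stats.mannwhitneyu for moderate sample sizes
# (scipy applies tie and continuity corrections, so small differences are expected).
# Assumes numpy as np and pandas as pd are imported as the function requires.
from scipy.stats import mannwhitneyu

x = list(np.random.normal(0.0, 1.0, 30))
y = list(np.random.normal(0.7, 1.0, 30))
U, p = MannWhitneyUTest(x, y)
print(U, p)
print(mannwhitneyu(x, y, alternative='two-sided'))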
def copula(num_samples, rho_mat, mu_mat, methods):
    """Copula procedure to generate an OTU table with corrs close to rho_mat.

    Inputs:
     num_samples - int, number of samples.
     rho_mat - 2d arr, symmetric positive definite matrix which specifies the
     correlation or covariation between the otu's in the table.
     mu_mat - 1d arr w/ len(num_otus), mean of otu for multivariate random
     call.
     methods - list of lists w/ len(num_otus), each list has a variable number
     of elements. the first element in each list is the
     scipy.stats.distributions function like lognorm or beta. this is the
     function that we draw values from for the actual otu. the remaining
     entries are the parameters for that function in order that the function
     requires them.
    """
    num_otus = len(mu_mat)
    # draw from multivariate normal distribution with specified parameters.
    # transpose so that it remains otuXsample matrix.
    Z = multivariate_normal(mean=mu_mat, cov=rho_mat, size=num_samples).T
    # using the inverse cdf of the normal distribution find where each sample
    # value for each otu falls in the normal cdf.
    U = norm.cdf(Z)
    # make the otu table using the methods and cdf values. ppf_args[0] is the
    # distribution function (eg. lognorm) whose ppf function we will use
    # to transform the cdf vals into the new distribution. ppf_args[1:] is the
    # params of the function like a, b, size, loc etc.
    otu_table = array([ppf_args[0].ppf(otu_cdf_vals, *ppf_args[1:],
                                       size=num_otus)
                       for ppf_args, otu_cdf_vals in zip(methods, U)])
    return where(otu_table > 0, otu_table, 0)
def param_table(beta, y_name, x_names, sigma=None):
    # basic frame
    frame = pd.DataFrame({
        'coeff': beta,
    }, index=x_names)
    frame = frame.rename_axis(y_name, axis=1)

    # handle sigma cases
    if sigma is None:
        return frame
    elif type(sigma) is tuple:
        sigr, sigc = sigma
        stderr = np.sqrt(np.hstack([maybe_diag(sigr), sigc]))
    else:
        stderr = np.sqrt(maybe_diag(sigma))

    # confidence interval
    low95 = beta - z95 * stderr
    high95 = beta + z95 * stderr

    # p-value
    zscore = beta / stderr
    pvalue = 2 * (1 - norm.cdf(np.abs(zscore)))

    # stderr stats
    frame = frame.assign(stderr=stderr, low95=low95, high95=high95,
                         pvalue=pvalue)

    return frame
def testsolveIWithDispersionMatchesMorrisE0Disp(self):
    """The solveIWithDispersionDimensional method reproduces the results
    of Morris et al 2015.
    """
    time = np.linspace(0, 7, int(7e4))  # 1000 pts per second
    time_step = time[1] - time[0]
    num_time_pts = len(time)
    dE = 0
    freq = 0
    Ru = 0
    Cdl = 0.
    Cdl1 = 0.
    Cdl2 = 0.
    Cdl3 = 0.
    EStart = -0.2
    ERev = 0.5
    temp = 293
    nu = 0.1
    area = 1
    coverage = 1e-11
    E_0BinsUnscaled = np.linspace(-17.5e-3, 17.5e-3, 15)

    # Define wE in terms of bin widths
    leftBinEnds = np.empty(15)
    leftBinEnds[1:] = np.linspace(-16.25e-3, 16.25e-3, 14)
    leftBinEnds[0] = -np.inf
    rightBinEnds = np.empty(15)
    rightBinEnds[:-1] = np.linspace(-16.25e-3, 16.25e-3, 14)
    rightBinEnds[-1] = np.inf
    wE = norm.cdf(rightBinEnds, loc=0, scale=5e-3) -\
         norm.cdf(leftBinEnds, loc=0, scale=5e-3)

    k_0Bins = {0.1: 1.0}

    expWidth = np.array([0.124, 0.126, 0.129, 0.133, 0.138, 0.144, 0.151,
                         0.159, 0.167, 0.176])

    self.assertEqual(np.sum(wE), 1)

    for l, ew in zip(range(1, 11), expWidth):
        E_0Vals = l * E_0BinsUnscaled
        self.assertTrue(np.isclose(E_0Vals[-1] - E_0Vals[0], l * 35.e-3))
        bins = [(E_0, 0.1, we) for E_0, we in zip(E_0Vals, wE)]
        I, amt = st.solve_reaction_disp_dim_bins(time_step, num_time_pts, dE,
                                                 freq, Ru, Cdl, Cdl1, Cdl2,
                                                 Cdl3, EStart, ERev, temp, nu,
                                                 area, coverage, bins)
        width = st.half_maximum_width(I, time, nu)
        self.assertLess(abs(width - ew), 7e-4)
        # Rounding error + 2*step size + solution error (estimated at 1*step size)
def test_scoretest(self):
    # Regression tests

    np.random.seed(6432)
    n = 200  # Must be divisible by 4
    exog = np.random.normal(size=(n, 4))
    endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
    endog += 3*np.random.normal(size=n)
    group = np.kron(np.arange(n/4), np.ones(4))

    # Test under the null.
    L = np.array([[1., -1, 0, 0]])
    R = np.array([0.,])
    family = Gaussian()
    va = Independence()
    mod1 = GEE(endog, exog, group, family=family,
               cov_struct=va, constraint=(L, R))
    rslt1 = mod1.fit()
    assert_almost_equal(mod1.score_test_results["statistic"],
                        1.08126334)
    assert_almost_equal(mod1.score_test_results["p-value"],
                        0.2984151086)

    # Test under the alternative.
    L = np.array([[1., -1, 0, 0]])
    R = np.array([1.0,])
    family = Gaussian()
    va = Independence()
    mod2 = GEE(endog, exog, group, family=family,
               cov_struct=va, constraint=(L, R))
    rslt2 = mod2.fit()
    assert_almost_equal(mod2.score_test_results["statistic"],
                        3.491110965)
    assert_almost_equal(mod2.score_test_results["p-value"],
                        0.0616991659)

    # Compare to Wald tests
    exog = np.random.normal(size=(n, 2))
    L = np.array([[1, -1]])
    R = np.array([0.])
    f = np.r_[1, -1]
    for i in range(10):
        endog = exog[:, 0] + (0.5 + i/10.)*exog[:, 1] +\
                np.random.normal(size=n)
        family = Gaussian()
        va = Independence()
        mod0 = GEE(endog, exog, group, family=family, cov_struct=va)
        rslt0 = mod0.fit()
        family = Gaussian()
        va = Independence()
        mod1 = GEE(endog, exog, group, family=family, cov_struct=va,
                   constraint=(L, R))
        rslt1 = mod1.fit()
        se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
        wald_z = np.dot(f, rslt0.params) / se
        wald_p = 2*norm.cdf(-np.abs(wald_z))
        score_p = mod1.score_test_results["p-value"]
        assert_array_less(np.abs(wald_p - score_p), 0.02)
def calculateProbability(time, avg, sd):
    if sd == pd.to_timedelta(0):
        print("Standard deviation is 0, not enough data points, returning p = 0")
        p = 0
    else:
        z = (time - avg) / sd
        p = norm.cdf(z)
    return p
def compare_medians_ms(group_1, group_2, axis=None):
    """
    Compares the medians from two independent groups along the given axis.

    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    Parameters
    ----------
    group_1 : array_like
        First dataset. Has to be of size >=7.
    group_2 : array_like
        Second dataset. Has to be of size >=7.
    axis : int, optional
        Axis along which the medians are estimated. If None, the arrays are
        flattened. If `axis` is not None, then `group_1` and `group_2`
        should have the same shape.

    Returns
    -------
    compare_medians_ms : {float, ndarray}
        If `axis` is None, then returns a float, otherwise returns a 1-D
        ndarray of floats with a length equal to the length of `group_1`
        along `axis`.

    Examples
    --------
    >>> from scipy import stats
    >>> a = [1, 2, 3, 4, 5, 6, 7]
    >>> b = [8, 9, 10, 11, 12, 13, 14]
    >>> stats.mstats.compare_medians_ms(a, b, axis=None)
    1.0693225866553746e-05

    The function is vectorized to compute along a given axis.

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> x = rng.random(size=(3, 7))
    >>> y = rng.random(size=(3, 8))
    >>> stats.mstats.compare_medians_ms(x, y, axis=1)
    array([0.36908985, 0.36092538, 0.2765313 ])

    References
    ----------
    .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
       for studentizing the sample median." Communications in
       Statistics-Simulation and Computation 13.6 (1984): 751-773.

    """
    (med_1, med_2) = (ma.median(group_1, axis=axis),
                      ma.median(group_2, axis=axis))
    (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
                      mstats.stde_median(group_2, axis=axis))
    W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
def get_candidate_window2(read, x, y, repx, repy, threshold):
    # using PHI = 1e6 to prescreen the genome
    PHI = 1e6
    GAMMA_HAT = 1.0
    tau = numpy.sqrt(y * (repx * x * (PHI + y) + repy * y * (PHI + x)) /
                     repx / repy / PHI / x**3)
    gamma = y / x
    z = (numpy.log(gamma) - numpy.log(GAMMA_HAT)) * gamma / tau
    pvalue = norm.cdf(-z)
    pre_idx_list = numpy.where(pvalue[10:-10] < threshold)[0] + 10
    return numpy.array(pre_idx_list)
def logrank_power(n, surv1, surv2, alpha=0.05):
    d = n * (2 - surv1 - surv2)
    if surv1 == 1 or surv2 == 1:
        return 0
    elif surv1 == 0 or surv2 == 0:
        return -1
    phi = log(surv1) / log(surv2) if surv1 < surv2 else log(surv2) / log(surv1)
    z_a = norm.ppf(1 - alpha)
    z_1_beta = sqrt(d * (1 - phi) * (1 - phi) / (1 + phi) / (1 + phi)) - z_a
    return norm.cdf(z_1_beta)
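# Illustrative usage (not from the original source): approximate power of a log-rank
# test with 100 subjects per arm and survival proportions of 0.70 vs 0.50 at the end
# of follow-up, using the one-sided alpha of 0.05 the function assumes. The imports
# below are placed at module level so the function can find log, sqrt and norm.
from math import log, sqrt
from scipy.stats.distributions import norm

power = logrank_power(100, 0.70, 0.50, alpha=0.05)
print("approximate power: %.3f" % power)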
def loss_function_simple(abg, visualize, fig_tag=''):
    error = .0
    if visualize:
        figure()
    for (phi_m, phi_idx) in zip(phis, xrange(N_phi)):
        Is = bins[phi_m]['Is']
        uniqueIs = bins[phi_m]['unique_Is']
        a, b, g = abg[0], abg[1], abg[2]
        movingThreshold = getMovingThreshold(a, g, phi_m, binnedTrain.theta)

        LHS_numerator = movingThreshold(uniqueIs[1:]) * sqrt(2.)
        LHS_denominator = b * sqrt(1 - exp(-2*uniqueIs[1:]))
        LHS = 1 - norm.cdf(LHS_numerator / LHS_denominator)

        RHS = zeros_like(LHS)
        N = len(Is)
        for rhs_idx in xrange(1, len(uniqueIs)):
            t = uniqueIs[rhs_idx]
            lIs = Is[Is < t]
            taus = t - lIs
            numerator = (movingThreshold(t) - movingThreshold(lIs) * exp(-taus)) * sqrt(2.)
            denominator = b * sqrt(1. - exp(-2*taus))
            RHS[rhs_idx-1] = sum(1. - norm.cdf(numerator/denominator)) / N

        weight = len(Is)
        lerror = dot((LHS - RHS)**2, diff(uniqueIs)) * weight
        error += lerror

        if visualize:
            subplot(ceil(len(phis)/2), 2, phi_idx+1); hold(True)
            ts = uniqueIs[1:]
            plot(ts, LHS, 'b')
            plot(ts, RHS, 'rx')
            # annotate('$\phi$ = %.2g'%(phi_m), ((min(ts), max(LHS)/2.)), )
            annotate('lerror = %.3g' % lerror, ((min(ts), max(LHS)/2.)), )

    if visualize:
        subplot(ceil(len(phis)/2), 2, 1)
        title(fig_tag)

    return error
def RHS(ts):
    if False == iterable(ts):
        ts = [ts]
    rhs = empty_like(ts)
    for t, t_indx in zip(ts, xrange(size(ts))):
        lIs = Is[Is < t]
        taus = t - lIs
        numerator = (movingThreshold(t) - movingThreshold(lIs) * exp(-taus)) * sqrt(2.)
        denominator = b * sqrt(1. - exp(-2*taus))
        rhs[t_indx] = sum(1. - norm.cdf(numerator/denominator)) / N
    return rhs
def get_candidate_window2(x, y, repx, repy, threshold):
    # using PHI = 1e6 to prescreen the genome
    PHI = 1e6
    GAMMA_HAT = 1.0
    tau = numpy.sqrt(y * (repx * x * (PHI + y) + repy * y * (PHI + x)) /
                     repx / repy / PHI / x**3)
    gamma = y / x
    z = (numpy.log(gamma) - numpy.log(GAMMA_HAT)) * gamma / tau
    pvalue = norm.cdf(-z)
    pre_idx_list = numpy.where(pvalue[10:-10] < threshold)[0] + 10
    return numpy.array(pre_idx_list)
def per_chr_nbtest(read_array, chr, swap, threshold, peaktype, difftest,
                   start1, end1, start2, end2, test_rep, control_rep):
    t1 = time.time()
    sig_peaks_list = []
    y_bar_array = numpy.mean(read_array[:, start1:end1], 1)
    x_bar_array = numpy.mean(read_array[:, start2:end2], 1)
    if swap:  # swap the chip and control reads.
        x_bar_array, y_bar_array = y_bar_array, x_bar_array
    cand_index = get_candidate_window2(x_bar_array, y_bar_array,
                                       control_rep, test_rep, threshold)
    debug("There are %d candidate windows for %s (PID:%d)",
          len(cand_index), chr, os.getpid())

    if not swap:
        disp_list = numpy.array([
            estimate_area_dispersion_factor(read_array, test_rep, control_rep,
                                            idx, peaktype, difftest)
            for idx in cand_index
        ])
    else:
        disp_list = numpy.array([
            estimate_area_dispersion_factor(read_array, control_rep, test_rep,
                                            idx, peaktype, difftest)
            for idx in cand_index
        ])
    #debug("finished estimating dispersion for %s", chr)
    # return []
    cand_x_bar_array = x_bar_array[cand_index]
    cand_y_bar_array = y_bar_array[cand_index]
    gamma_array = cand_y_bar_array / cand_x_bar_array
    tau_hat_array = numpy.sqrt(
        cand_y_bar_array *
        ((control_rep * cand_x_bar_array * (disp_list + cand_y_bar_array)) +
         (test_rep * cand_y_bar_array * (disp_list + cand_x_bar_array))) /
        (test_rep * control_rep * disp_list * (cand_x_bar_array**3)))
    gamma_hat = 1.0  # Null hypothesis
    z_score_array = ((numpy.log(gamma_array) - numpy.log(gamma_hat)) *
                     gamma_array / tau_hat_array)
    pval_array = norm.cdf(-z_score_array)
    test_index = numpy.where(pval_array < threshold)
    test_index = test_index[0]
    sig_index = cand_index[test_index]
    sig_pval = pval_array[test_index]
    sig_group1_count = cand_y_bar_array[test_index]
    sig_group2_count = cand_x_bar_array[test_index]
    #sig_disp = disp_list[test_index]
    for i, a in enumerate(test_index):
        sig_peaks_list.append(Peak(chr, sig_index[i], sig_group1_count[i],
                                   sig_group2_count[i], sig_pval[i], 0))
    t2 = time.time()
    debug("Analysis finished for %s, used %f sec CPU time", chr, t2 - t1)
    return sig_peaks_list
def tst_importance_sampl():
    """
    Reducing variance using importance sampling.
    """
    print("Probability that std normal will be greater than 2 is:" +
          str((1 - norm.cdf(2, 0, 1))))
    print("What we get from direct simulation:" +
          str(sum(np.random.normal(0, 1, size=10000) > 2) / 10000))
    summ = 0
    for x in np.random.normal(2, 1, size=10000):
        summ += (x > 2) * norm.pdf(x, 0, 1) / norm.pdf(x, 2, 1)
    print("With importance sampling:" + str(summ / 10000))
def RHS(ts):
    if False == iterable(ts):
        ts = array([ts])
    lIs = tile(Is, len(ts)).reshape((len(ts), len(Is))).transpose()
    lts = tile(ts, (len(Is), 1))

    mask = lIs < lts
    taus = (lts - lIs)  # *mask
    # NOTE BELOW WE use abs(taus) since for non-positive taus we will mask away anyway:
    numerator = (movingThreshold(lts) - movingThreshold(lIs) * exp(-abs(taus))) * sqrt(2.)
    denominator = b * sqrt(1. - exp(-2*abs(taus)))
    rhs = sum((1. - norm.cdf(numerator/denominator)) * mask, axis=0) / N_Is
    return rhs
def filterExptsByPseudoCountDistr(ddict):
    # remove experiments where the pseudocount is high
    # relative to the other pseudocounts
    pseudodict = {k: ddict[k]['PSEUDO'] for k in ddict}
    pskeys = list(pseudodict.keys())
    pslogvals = np.log10(list(pseudodict.values()))
    pslogmad = mad(pslogvals)
    pslogmedian = np.percentile(pslogvals, 50)
    pslvps_hi = 1 - norm.cdf((pslogvals - pslogmedian) / pslogmad)
    rejected_ds_hi = multipletests(pslvps_hi, alpha=0.05)[0]
    # return data in a dictionary
    filteredExpts = {pskeys[i]: rejected_ds_hi[i] for i in range(len(pskeys))}
    return filteredExpts
def dosim(hyp, cov_struct=None, mcrep=500):
    # Storage for the simulation results
    scales = [[], []]

    # P-values from the score test
    pv = []

    # Monte Carlo loop
    for k in range(mcrep):

        # Generate random "probability points" u that are uniformly
        # distributed, and correlated within clusters
        z = np.random.normal(size=n)
        u = np.random.normal(size=n // m)
        u = np.kron(u, np.ones(m))
        z = r * z + np.sqrt(1 - r**2) * u
        u = norm.cdf(z)

        # Generate the observed responses
        y = negbinom(u, mu=mu[hyp], scale=scale)

        # Fit the null model
        m0 = sm.GEE(y, x0, groups=grp, cov_struct=cov_struct,
                    family=sm.families.Poisson())
        r0 = m0.fit(scale='X2')
        scales[0].append(r0.scale)

        # Fit the alternative model
        m1 = sm.GEE(y, x, groups=grp, cov_struct=cov_struct,
                    family=sm.families.Poisson())
        r1 = m1.fit(scale='X2')
        scales[1].append(r1.scale)

        # Carry out the score test
        st = m1.compare_score_test(r0)
        pv.append(st["p-value"])

    pv = np.asarray(pv)
    rslt = [np.mean(pv), np.mean(pv < 0.1)]

    return rslt, scales
def estimate_params_for_normal(x, low_bound, mu_initial, sigma_initial):
    """
    Takes a vector x of truncated data with a known lower truncation bound
    and estimates the parameters of the fit of an untruncated normal
    distribution.

    code from Chris Fonnesbeck's Python data analysis tutorial on Sense
    https://sense.io/prometheus2305/data-analysis-in-python/files/Statistical%20Data%20Modeling.py
    """
    # normalize vector
    mu_initial = float(mu_initial)
    sigma_initial = float(sigma_initial)
    #x = np.random.normal(size=10000,loc=2000,scale= 2000)
    x = map(lambda y: (y - mu_initial) / sigma_initial, x)
    a = (low_bound - mu_initial) / sigma_initial  # normalize lower bound
    #_ = plt.hist(x, bins=100)
    #plt.show()
    #plt.close()

    # We can construct a log likelihood for this function using the conditional
    # form
    trunc_norm = lambda theta, a, x: -(np.log(norm.pdf(x, theta[0], theta[1])) -
                                       np.log(1 - norm.cdf(a, theta[0], theta[1]))).sum()

    # For this example, we will use another optimization algorithm, the
    # **Nelder-Mead simplex algorithm**. It has a couple of advantages:
    #
    # - it does not require derivatives
    # - it can optimize (minimize) a vector of parameters
    #
    # SciPy implements this algorithm in its `fmin` function:

    # we have normalized data, given that the lower truncation point a
    # is pretty far out in the tail - the standard normal parameters are
    # a first good guess, i.e. 0,1
    initial_guess = np.array([0, 1])
    sol = fmin(trunc_norm, initial_guess, args=(a, x))
    print sol
    mean_normalized, stddev_normalized = sol[0], sol[1]
    mean_est = (1 + mean_normalized) * mu_initial
    stddev_est = stddev_normalized * sigma_initial
    print mean_est, stddev_est
    return mean_est, stddev_est
def compare_medians_ms(group_1, group_2, axis=None):
    """Compares the medians from two independent groups along the given axis.
    Returns an array of p values.
    The comparison is performed using the McKean-Schrader estimate of the
    standard error of the medians.

    :Inputs:
        group_1 : sequence
            First dataset.
        group_2 : sequence
            Second dataset.
        axis : integer *[None]*
            Axis along which the medians are estimated. If None, the arrays
            are flattened.
    """
    (med_1, med_2) = (mmedian(group_1, axis=axis), mmedian(group_2, axis=axis))
    (std_1, std_2) = (stde_median(group_1, axis=axis),
                      stde_median(group_2, axis=axis))
    W = abs(med_1 - med_2) / sqrt(std_1**2 + std_2**2)
    return 1 - norm.cdf(W)
def param_table(beta, sigma, names):
    # standard errors
    stderr = np.sqrt(sigma.diagonal())

    # confidence interval
    low95 = beta - z95*stderr
    high95 = beta + z95*stderr

    # p-value
    zscore = beta/stderr
    pvalue = 2*(1-norm.cdf(np.abs(zscore)))

    # return all
    return pd.DataFrame({
        'coeff': beta,
        'stderr': stderr,
        'low95': low95,
        'high95': high95,
        'pvalue': pvalue
    }, index=names)
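# Illustrative usage (not from the original source): build a coefficient table from a
# toy OLS fit. z95 is the module-level constant the function relies on; it is defined
# here, for this sketch only, as the 97.5% normal quantile, and the covariance formula
# below is the usual s^2 * (X'X)^-1 approximation.
import numpy as np
import pandas as pd
from scipy.stats.distributions import norm

z95 = norm.ppf(0.975)
X = np.column_stack([np.ones(100), np.random.normal(size=100)])
y = X @ np.array([1.0, 2.0]) + np.random.normal(size=100)
beta, *_ = np.linalg.lstsq(X, y, rcond=None)
resid = y - X @ beta
sigma = np.linalg.inv(X.T @ X) * resid.var(ddof=2)
print(param_table(beta, sigma, ['const', 'x1']))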
def plotzfe():
    """ Test Simulation 1-2
    Plot theoretical vs. simulated results for zero forcing equalizer."""
    nsample = 10**5
    snrlst = range(0, 19)
    # Calculate the theoretical values
    pe = []
    coeff = zero_forcing_coeff(tap2, 41)
    for snr in snrlst:
        delta_square = 10**(-snr/10.)*sum(coeff**2)
        pe.append(1 - norm.cdf(sqrt((1-0.41)**2/delta_square)))
    plt.semilogy(snrlst, pe,
                 snrlst, equalizer(2, 41, snrlst, nsample, 'zfir'), "-.")
    plt.legend(("Theoretical curve", "41 taps simulation"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')
    plt.show()
def plotdfe2():
    nsample = 10**5
    snrlst = range(0, 19)
    nzf = 41
    tap = tap2
    pe = []
    for snr in snrlst:
        cj, _ = dfe_coeff(tap, nzf, 41, snr)
        f = [0]*(len(cj)-len(tap)) + list(tap)[::-1]
        jmin = 1 - sum(np.array(f)*cj)
        gamma = (1-jmin)/jmin
        pe.append(1 - norm.cdf(sqrt(gamma)))
    plt.semilogy(snrlst, pe,
                 snrlst, equalizer(2, 41, snrlst, nsample, 'dfe'), "-.")
    plt.legend(("Theoretical curve", "Simulated curve"), loc='lower left')
    plt.title("Theoretical vs. Simulated performances for Channel 1")
    plt.xlabel("SNR (dB)")
    plt.ylabel("SER (dB)")
    plt.grid(True, which='both')
    plt.show()
def kurtosistest(a, axis=-1):
    """Tests whether a dataset has normal kurtosis

    That is, test whether kurtosis=3(n-1)/(n+1). Valid only for n>20.

    Axis can equal None (ravel array first), an integer (the axis over
    which to operate), or a sequence (operate over multiple axes).

    NOTE: This function is mostly copied from scipy.stats.stats, but corrects
    for a major bug: the pvalue returned by SciPy is not valid when the
    kurtosis is negative! The return values also are slightly different: the
    kurtosis is actually returned while the z-score is not.

    Returns: kurtosis and 2-tail z-probability.
    """
    a, axis = _chk_asarray(a, axis)
    n = float(a.shape[axis])
    if n < 20:
        print "kurtosistest only valid for n>=20 ... continuing anyway, n=", n
    kurt = kurtosis(a, axis)
    E = 3.0 * (n - 1) / (n + 1)
    varkurt = 24.0 * n * (n - 2) * (n - 3) / ((n + 1) * (n + 1) * (n + 3) * (n + 5))
    x = (kurt - E) / sqrt(varkurt)
    sqrtbeta1 = 6.0 * (n * n - 5 * n + 2) / ((n + 7) * (n + 9)) * sqrt(
        (6.0 * (n + 3) * (n + 5)) / (n * (n - 2) * (n - 3)))
    A = 6.0 + 8.0 / sqrtbeta1 * (2.0 / sqrtbeta1 + sqrt(1 + 4.0 / (sqrtbeta1**2)))
    term1 = 1 - 2 / (9.0 * A)
    denom = 1 + x * sqrt(2 / (A - 4.0))
    denom = where(less(denom, 0), 99, denom)
    term2 = where(equal(denom, 0), term1, power((1 - 2.0 / A) / denom, 1 / 3.0))
    Z = (term1 - term2) / sqrt(2 / (9.0 * A))
    Z = where(equal(denom, 99), 0, Z)
    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2 * (1 - norm.cdf(abs(Z)))
    assert pvalue >= 0.0 and pvalue <= 1.0
    return kurt, pvalue
def autocorrelation(series, k=1, biased=True):
    """Returns autocorrelation of order 'k' and corresponding two-tailed pvalue.

    (Inspired by CLM pp.45-47)

    @param series: The series on which to compute autocorrelation
    @param k: The order to which compute autocorrelation
    @param biased: If False, rho_k will be corrected according to Fuller (1976)
    @return: rho_k, pvalue
    """
    T = len(series)
    mu = mean(series)
    sigma = var(series)

    # Centered observations
    obs = series - mu
    lagged = lag(obs, k)
    truncated = obs[:-k]
    assert len(lagged) == len(truncated)

    # Multiplied by 'T' for numerical stability
    gamma_k = T * add.reduce(truncated * lagged)  # Numerator
    gamma_0 = T * add.reduce(obs * obs)           # Denominator

    rho_k = (gamma_k / gamma_0)
    if rho_k > 1.0:
        rho_k = 1.0  # Correct for numerical errors

    # The standard normal random variable
    Z = sqrt(T) * rho_k

    # Bias correction?
    if not biased:
        rho_k += (1 - rho_k**2) * (T - k) / (T - 1)**2
        Z = rho_k * T / sqrt(T - k)

    # The two-tailed p-value is twice the prob that value of a std normal r.v.
    # turns out to be greater than the (absolute) value of Z
    pvalue = 2 * (1 - norm.cdf(abs(Z)))
    assert pvalue >= 0.0 and pvalue <= 1.0
    return rho_k, pvalue
def generate_logistic():

    # Number of clusters
    nclust = 100

    # Regression coefficients
    beta = np.array([1, -2, 1], dtype=np.float64)

    ## Covariate correlations
    r = 0.4

    ## Cluster effects of covariates
    rx = 0.5

    ## Within-cluster outcome dependence
    re = 0.3

    p = len(beta)

    OUT = open("gee_logistic_1.csv", "w")

    for i in range(nclust):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, p))
        x = rx * np.random.normal() + np.sqrt(1 - rx**2) * x
        x[:, 2] = r * x[:, 1] + np.sqrt(1 - r**2) * x[:, 2]
        pr = 1 / (1 + np.exp(-np.dot(x, beta)))

        z = re*np.random.normal() +\
            np.sqrt(1-re**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = 1 * (u < pr)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()
def variance_ratio(series, q, rw_hypothesis=1):
    """Returns 'VR(q)' and the corresponding pvalue.

    VR(q) here refers to the variance ratio suggested in Campbell, Lo and
    MacKinlay (1997), pp.49-57.

    @param series: The series on which to compute VR.
    @param q: Number of periods of the long-horizon return in VR
    @param rw_hypothesis: Which null hypothesis to test against. The value
        must be in [0, 1, 3]. Zero is a special value under which no pvalue
        is reported. One and three lead to the use of RW1 and RW3.
    @return: VR_q [, pvalue -- if rw_hypothesis!=0 ]
    """
    assert q > 1
    T = len(series)
    qf = float(q)

    VR_q = 1.0
    for k in range(1, q):  # Will sum till q-1 as desired
        # divide by the float qf so the weight is not truncated to zero under
        # Python 2 integer division (the original divided by the integer q)
        VR_q += 2.0 * (1.0 - (k / qf)) * autocorrelation(series, k, biased=True)[0]

    # Zero is a special value under which no pvalue is reported
    if rw_hypothesis == 0:
        return VR_q

    # TBReplaced by n*q in pp.52-55's version...
    nq = float(T - 1)

    Z = sqrt(nq) * (VR_q - 1.0)
    if rw_hypothesis == 1:
        Z /= sqrt(2.0 * (2*q - 1) * (q - 1) / (3.0*q))
        return VR_q, 2 * (1 - norm.cdf(abs(Z)))

    if rw_hypothesis == 3:
        raise NotImplementedError

    raise ValueError("'rw_hypothesis' must be in [0,1,3].")
def generate_nominal():

    ## Regression coefficients
    beta1 = np.r_[0.5, 0.5]
    beta2 = np.r_[-1, -0.5]
    p = len(beta1)

    rz = 0.5

    OUT = open("gee_nominal_1.csv", "w")

    for i in range(200):

        n = np.random.randint(3, 6)  # Cluster size

        x = np.random.normal(size=(n, p))
        x[:, 0] = 1
        for j in range(1, x.shape[1]):
            x[:, j] += np.random.normal()

        pr1 = np.exp(np.dot(x, beta1))[:, None]
        pr2 = np.exp(np.dot(x, beta2))[:, None]
        den = 1 + pr1 + pr2
        pr = np.hstack((pr1/den, pr2/den, 1/den))
        cpr = np.cumsum(pr, 1)

        z = rz*np.random.normal() +\
            np.sqrt(1-rz**2)*np.random.normal(size=n)
        u = norm.cdf(z)

        y = (u[:, None] > cpr).sum(1)

        for j in range(n):
            OUT.write("%d,%d," % (i, y[j]))
            OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n")

    OUT.close()