def test_lasso(s=5, n=100, p=50):
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1.)
    lam_frac = 1.
    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)
    epsilon = 1.
    lam = sigma * lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))
    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm(p, lagrange=lam)
    sampler1 = randomized.selective_sampler_MH(loss, random_Z, epsilon, randomization, penalty)
    loss_args = {'mean': np.zeros(n), 'sigma': sigma}
    null, alt = pval(sampler1, loss_args, X, y, nonzero)
    return null, alt
def main(rho=0.245, n=100, p=30): X, prec, nonzero = instance(n=n, p=p, alpha=0.99, rho=rho) lam_frac = 0.1 alpha = 0.8 randomization = laplace(loc=0, scale=1.) loss = randomized.neighbourhood_selection(X) epsilon = 1. lam = 2./np.sqrt(n) * np.linalg.norm(X) * norm.isf(alpha / (2 * p**2)) random_Z = randomization.rvs(p**2 - p) penalty = randomized.selective_l1norm(p**2-p, lagrange=lam) sampler1 = randomized.selective_sampler_MH(loss, random_Z, epsilon, randomization, penalty) loss_args = {"active":sampler1.penalty.active_set, "quadratic_coef":epsilon} null, alt = pval(sampler1, loss_args, None, X, nonzero) return null, alt
def test_logistic(s=5, n=200, p=20): X, y, beta, active= logistic_instance(n=n, p=p, s=s, rho=0) nonzero = np.where(beta)[0] lam_frac = 40.8 randomization = laplace(loc=0, scale=1.) loss = randomized.logistic_Xrandom(X, y) epsilon = 1. #lam = lam_frac * np.mean(np.fabs(np.dot(X.T, (np.random.binomial(1, 1./2, (n, 10000)) - 0.5))).max(0)) lam = 70. random_Z = randomization.rvs(p) penalty = randomized.selective_l1norm(p, lagrange=lam) sampler1 = randomized.selective_sampler_MH(loss, random_Z, epsilon, randomization, penalty) sampler1.loss.fit_E(sampler1.penalty.active_set) linear_part = np.identity(p) data = np.dot(X.T, y - 1./2) loss_args = {'mean':np.zeros(p)} null, alt = pval(sampler1, loss_args, linear_part, data, nonzero) return null, alt
def set_param_vec(self, params):
    assert len(params) == 2, "Laplace marginal distribution requires exactly 2 parameters: np.array([mu, b])"
    assert params[1] > 0., "scale b must be > 0"
    self.mu = params[0]
    self.b = params[1]
    self.param_vec[0] = self.mu
    self.param_vec[1] = self.b
    self.laplace_obj = laplace(self.mu, self.b)
    return True
def _test_grid_log(self, dtype, scipy_dtype, grid_spec, error_spec): with self.test_session(): grid = _make_grid(dtype, grid_spec) actual = sm.log_cdf_laplace(grid).eval() # Basic tests. # isfinite checks for NaN and Inf. self.assertAllTrue(np.isfinite(actual)) self.assertAllTrue((actual < 0)) _check_strictly_increasing(actual) # Versus scipy. scipy_dist = stats.laplace(loc=0., scale=1.) expected = scipy_dist.logcdf(grid.astype(scipy_dtype)) self.assertAllClose( expected.astype(np.float64), actual.astype(np.float64), rtol=error_spec.rtol, atol=error_spec.atol)
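# For context on what the assertions above guard against, here is a minimal illustrative
# sketch (assuming a standard Laplace, loc=0 and scale=1, and made-up grid values) of why a
# dedicated log-CDF routine is worth testing: taking np.log of the ordinary CDF underflows
# for very negative arguments, while the closed form x - log(2) stays finite, negative and
# strictly increasing, exactly the properties the test asserts.
import numpy as np
from scipy import stats

grid = np.array([-2.0, -50.0, -800.0])

# Naive route: log of the CDF underflows to -inf once cdf(x) drops below the
# smallest representable float.
naive = np.log(stats.laplace(loc=0., scale=1.).cdf(grid))

# Closed form for x < 0: cdf(x) = 0.5 * exp(x), so log cdf(x) = x - log(2).
stable = grid - np.log(2.0)

print(naive)   # last entry underflows to -inf
print(stable)  # all entries finite, strictly increasing, < 0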
ax[idx].vlines(locs, 0, w, color='C0') ax[idx].set_title('α = {}'.format(α)) plt.tight_layout() plt.show() # %% α = 10 H = stats.norm K = 5 x = np.linspace(-4, 4, 250) x_ = np.array([x] * K).T locs, w = stick_breaking_truncated(α, H, K) dist = stats.laplace(locs, 0.5) plt.plot(x, np.sum(dist.pdf(x_) * w, 1), 'C0', lw=2) plt.plot(x, dist.pdf(x_) * w, 'k--', alpha=0.7) plt.yticks([]) plt.show() # %% N = cs_exp.shape[0] K = 20 def stick_breaking(α, K): β = pm.Beta('β', 1., α, shape=K) w = β * pm.math.concatenate([[1.], tt.extra_ops.cumprod(1. - β)[:-1]]) return w
#!/usr/bin/env python # Run this code with # python laplace_hong.py from scipy import stats import numpy as np import matplotlib import matplotlib.pyplot as plt # Generate a Laplace distribution with mu = 0, and delta = 1. # Compute and print out the first few moments. dist = stats.laplace(0, 1.0) mean, var, skew, kurt = dist.stats(moments='mvsk') print "Laplace distribution with mu = 0 and delta = 1.0: " print "mean =", mean[()] print "variance =", var[()] print "skew =", skew[()] print "kurtosis =", kurt[()], "\n" # Four random draws # Calculate and print out the mean and variance # Save each draw in a file N_arr = np.array([10, 100, 1000, 10000]) r = [[],[],[],[]] for i in np.arange(0, len(N_arr)): N = N_arr[i] r[i] = dist.rvs(N) print "%d Random Samples from Laplace Distribution:" % N print "mean =", r[i].mean() print "variance =", r[i].var(), "\n"
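# As a cross-check on the printed moments above, the closed forms for Laplace(mu, b) are
# mean = mu, variance = 2*b**2, skewness = 0 and excess kurtosis = 3. A small sketch,
# assuming the same dist = stats.laplace(0, 1.0) as in the script:
import numpy as np
from scipy import stats

mu, b = 0.0, 1.0
dist = stats.laplace(mu, b)
mean, var, skew, kurt = dist.stats(moments='mvsk')

# scipy reports *excess* kurtosis, hence 3 rather than 6 here.
assert np.isclose(mean, mu)
assert np.isclose(var, 2 * b**2)
assert np.isclose(skew, 0.0)
assert np.isclose(kurt, 3.0)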
# coding:utf8 import numpy as np from scipy import stats from scipy.special import gamma import matplotlib.pyplot as plt from pathlib import Path import pickle np.random.seed(0) root = Path('./ass2/savedoc') if not root.is_dir(): root.mkdir() C = np.sqrt(np.pi / 2) / gamma(1.5) t2 = stats.t(2) laplace = stats.laplace() plt.switch_backend('agg') # part (b) x = np.linspace(-10, 10, 1000) plt.figure(figsize=(20, 10)) plt.title("t2 distribution with C vs laplace distribution") plt.plot(x, laplace.pdf(x), 'r-', label='laplace distribution') plt.plot(x, C * t2.pdf(x), 'g--', label='t2 distribution with C') plt.legend() plt.savefig(root / 'p2b.jpg') plt.close() # part (c) num = 10000 allnum = int(5e4)
def test_lasso(s=0, n=100, p=20, weights="neutral", randomization_dist="logistic", randomization_scale=1, Langevin_steps=10000, burning=2000, X_scaled=True, covariance_estimate="nonparametric", noise="uniform"): """ weights: exponential, gamma, normal, gumbel randomization_dist: logistic, laplace """ step_size = 1. / p X, y, true_beta, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0, scale=X_scaled, noise=noise) print 'true beta', true_beta lam_frac = 1. if randomization_dist == "laplace": randomization = laplace(loc=0, scale=1.) random_Z = randomization.rvs(p) if randomization_dist == "logistic": random_Z = np.random.logistic(loc=0, scale=1, size=p) if randomization_dist == "normal": random_Z = np.random.standard_normal(p) print 'randomization', random_Z * randomization_scale loss = lasso_randomX.lasso_randomX(X, y) epsilon = 1. / np.sqrt(n) #epsilon = 1. lam = sigma * lam_frac * np.mean( np.fabs( np.dot(X.T, np.random.standard_normal((n, 10000))) + randomization_scale * np.random.logistic(size=(p, 10000))).max(0)) lam_scaled = lam.copy() random_Z_scaled = random_Z.copy() epsilon_scaled = epsilon if (X_scaled == False): random_Z_scaled *= np.sqrt(n) lam_scaled *= np.sqrt(n) epsilon_scaled *= np.sqrt(n) penalty = randomized.selective_l1norm_lan(p, lagrange=lam_scaled) # initial solution problem = rr.simple_problem(loss, penalty) random_term = rr.identity_quadratic(epsilon_scaled, 0, -randomization_scale * random_Z_scaled, 0) solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500} initial_soln = problem.solve(random_term, **solve_args) print 'initial solution', initial_soln active = (initial_soln != 0) if np.sum(active) == 0: return [-1], [-1] inactive = ~active betaE = initial_soln[active] signs = np.sign(betaE) initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln)) if (X_scaled == False): initial_grad /= np.sqrt(n) print 'initial_gradient', initial_grad subgradient = random_Z - initial_grad - epsilon * initial_soln cube = np.divide(subgradient[inactive], lam) nactive = betaE.shape[0] ninactive = cube.shape[0] beta_unpenalized = np.linalg.lstsq(X[:, active], y)[0] print 'beta_OLS onto E', beta_unpenalized obs_residuals = y - np.dot(X[:, active], beta_unpenalized) # y-X_E\bar{\beta}^E N = np.dot(X[:, inactive].T, obs_residuals) # X_{-E}^T(y-X_E\bar{\beta}_E), null statistic full_null = np.zeros(p) full_null[nactive:] = N # parametric coveriance estimate if covariance_estimate == "parametric": XE_pinv = np.linalg.pinv(X[:, active]) mat = np.zeros((nactive + ninactive, n)) mat[:nactive, :] = XE_pinv mat[nactive:, :] = X[:, inactive].T.dot( np.identity(n) - X[:, active].dot(XE_pinv)) Sigma_full = mat.dot(mat.T) else: Sigma_full = bootstrap_covariance(X, y, active, beta_unpenalized) init_vec_state = np.zeros(n + nactive + ninactive) if weights == "exponential": init_vec_state[:n] = np.ones(n) else: init_vec_state[:n] = np.zeros(n) #init_vec_state[:n] = np.random.standard_normal(n) #init_vec_state[:n] = np.ones(n) init_vec_state[n:(n + nactive)] = betaE init_vec_state[(n + nactive):] = cube def full_projection(vec_state, signs=signs, nactive=nactive, ninactive=ninactive): alpha = vec_state[:n].copy() betaE = vec_state[n:(n + nactive)].copy() cube = vec_state[(n + nactive):].copy() projected_alpha = alpha.copy() projected_betaE = betaE.copy() projected_cube = np.zeros_like(cube) if weights == "exponential": projected_alpha = np.clip(alpha, 0, np.inf) if weights == "gamma": projected_alpha = np.clip(alpha, -2 + 1. 
/ n, np.inf) for i in range(nactive): if (projected_betaE[i] * signs[i] < 0): projected_betaE[i] = 0 projected_cube = np.clip(cube, -1, 1) return np.concatenate( (projected_alpha, projected_betaE, projected_cube), 0) Sigma = np.linalg.inv(np.dot(X[:, active].T, X[:, active])) null, alt = pval(init_vec_state, full_projection, X, obs_residuals, beta_unpenalized, full_null, signs, lam, epsilon, nonzero, active, Sigma, weights, randomization_dist, randomization_scale, Langevin_steps, step_size, burning, X_scaled) # Sigma_full[:nactive, :nactive]) return null, alt
def answer_hw():
    ##question a
    print '-'*40
    print "question a : rejsampler1d defined"
    print '-'*40

    ##question b
    print '-'*40
    print "question b: cauchy as reference for laplace"
    print '-'*40
    ref = stats.cauchy(0, 1)
    target = stats.laplace(0, 1).pdf
    numsamples = 1000
    samples, M, successrate = rejsampler1d(target, ref, numsamples)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    n, bins, patches = ax.hist(samples, 100, alpha=0.75, normed=True, label='samples')
    actual = stats.laplace.pdf(np.linspace(-6, 6, 100))
    ax.plot(np.linspace(-6, 6, 100), actual, 'r', label='Laplace(0,1) pdf')
    plt.legend()
    KStest = stats.kstest(samples, 'laplace', (0, 1))
    print "Kolmogorov-Smirnov test statistic : %f \np-value : %f " % (KStest)
    # reject the null (samples come from Laplace(0,1)) only when the p-value is small
    if KStest[1] >= 0.05:
        keyword = ''
    else:
        keyword = 'not'
    print "samples are %s from laplace(0,1) distribution" % (keyword)

    ##question c
    print '-'*40
    print "question c: t distribution with df =2 as reference for laplace"
    print '-'*40
    ref = stats.t(2)
    target = stats.laplace(0, 1).pdf
    numsamples = 1000
    samples, M, successrate_student = rejsampler1d(target, ref, numsamples)
    # a higher acceptance rate means a tighter envelope, i.e. a better reference
    if successrate_student > successrate:
        keyword = 'better'
    else:
        keyword = 'worse'
    print "acceptance rate with cauchy reference is %f" % (successrate)
    print "acceptance rate with student's t reference is %f \n" % (successrate_student)
    print "using student's t distribution for reference is %s than using a cauchy distribution" % (keyword)

    ##question d
    print '-'*40
    print "question d: novel continuous distribution"
    print '-'*40
    ref = stats.norm(0, 2)
    target = mytargetfunc
    numsamples = 5000
    samples, M, successrate = rejsampler1d(target, ref, numsamples)
    #plot figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    n, bins, patches = ax.hist(samples, 100, alpha=0.75, normed=True, label='samples')
    actual = target(np.linspace(-6, 6, 100))
    ax.plot(np.linspace(-6, 6, 100), actual, 'r', label='target pdf')
    plt.legend()
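# rejsampler1d itself is not shown in this excerpt. The sketch below is a hypothetical
# minimal 1-D rejection sampler matching how it is called above (signature
# target, ref, numsamples; returning samples, M, successrate), with the envelope
# constant M estimated on an assumed grid rather than derived analytically.
import numpy as np

def rejsampler1d_sketch(target, ref, numsamples, grid=np.linspace(-30, 30, 6001)):
    """Hypothetical rejection sampler: `target` is a pdf callable, `ref` a frozen
    scipy.stats distribution used as the proposal/envelope."""
    # Envelope constant M >= target(x) / ref.pdf(x), estimated on the grid.
    M = np.max(target(grid) / ref.pdf(grid))
    samples = []
    proposed = 0
    while len(samples) < numsamples:
        x = ref.rvs()
        proposed += 1
        # accept with probability target(x) / (M * ref.pdf(x))
        if np.random.rand() <= target(x) / (M * ref.pdf(x)):
            samples.append(x)
    return np.array(samples), M, float(len(samples)) / proposed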
return ax #------------------------------------------------------------ # Set up distributions: Npts = 5000 np.random.seed(0) x = np.linspace(-6, 6, 1000) # Gaussian distribution data_G = stats.norm(0, 1).rvs(Npts) pdf_G = stats.norm(0, 1).pdf(x) # Non-Gaussian distribution distributions = [stats.laplace(0, 0.4), stats.norm(-4.0, 0.2), stats.norm(4.0, 0.2)] weights = np.array([0.8, 0.1, 0.1]) weights /= weights.sum() data_NG = np.hstack(d.rvs(int(w * Npts)) for (d, w) in zip(distributions, weights)) pdf_NG = sum(w * d.pdf(x) for (d, w) in zip(distributions, weights)) #------------------------------------------------------------ # Plot results fig = plt.figure(figsize=(5, 2.5)) fig.subplots_adjust(hspace=0, left=0.07, right=0.95, wspace=0.05, bottom=0.15)
'name': "normal", 'a': -4.0, 'b': 4.0, 'stat': stats.norm(loc=0, scale=1), 'pf': lambda x: stats.norm(loc=0, scale=1).pdf(x) }, { 'name': "cauchy", 'a': -4.0, 'b': 4.0, 'stat': stats.cauchy(loc=0, scale=1), 'pf': lambda x: stats.cauchy(loc=0, scale=1).pdf(x) }, { 'name': "laplace", 'a': -4.0, 'b': 4.0, 'stat': stats.laplace(loc=0, scale=1 / math.sqrt(2)), 'pf': lambda x: stats.laplace(loc=0, scale=1 / math.sqrt(2)).pdf(x) }, { 'name': "uniform", 'a': -4.0, 'b': 4.0, 'stat': stats.uniform(-math.sqrt(3), 2 * math.sqrt(3)), 'pf': lambda x: stats.uniform(-math.sqrt(3), 2 * math.sqrt(3)).pdf(x) }, { 'name': "poisson", 'a': 6, 'b': 14, 'stat': stats.poisson(10), 'pf': lambda x: stats.poisson(10).pmf(np.ceil(x)) }]
return ax #------------------------------------------------------------ # Set up distributions: Npts = 5000 np.random.seed(0) x = np.linspace(-6, 6, 1000) # Gaussian distribution data_G = stats.norm(0, 1).rvs(Npts) pdf_G = stats.norm(0, 1).pdf(x) # Non-Gaussian distribution distributions = [stats.laplace(0, 0.4), stats.norm(-4.0, 0.2), stats.norm(4.0, 0.2)] weights = np.array([0.8, 0.1, 0.1]) weights /= weights.sum() data_NG = np.hstack(d.rvs(int(w * Npts)) for (d, w) in zip(distributions, weights)) pdf_NG = sum(w * d.pdf(x) for (d, w) in zip(distributions, weights)) #------------------------------------------------------------ # Plot results fig = plt.figure(figsize=(10, 5)) fig.subplots_adjust(hspace=0, left=0.05, right=0.95, wspace=0.05)
def test_kfstep(k=4, s=3, n=100, p=10, Langevin_steps=10000, burning=2000): X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, random_signs=True, s=s, sigma=1.,rho=0, signal=10)[:5] epsilon = 0. randomization = laplace(loc=0, scale=1.) j_seq = np.empty(k, dtype=int) s_seq = np.empty(k) left = np.ones(p, dtype=bool) obs = 0 initial_state = np.zeros(n + np.sum([i for i in range(p-k+1,p+1)])) initial_state[:n] = y.copy() mat = [np.array((n, ncol)) for ncol in range(p,p-k,-1)] curr = n keep = np.zeros(p, dtype=bool) for i in range(k): X_left = X[:,left] X_selected = X[:, ~left] if (np.sum(left)<p): P_perp = np.identity(n) - X_selected.dot(np.linalg.pinv(X_selected)) mat[i] = P_perp.dot(X_left) else: mat[i] = X mat_complete = np.zeros((n,p)) mat_complete[:, left] = mat[i] T = np.dot(mat[i].T, y) T_complete = np.dot(mat_complete.T, y) obs = np.max(np.abs(T)) keep = np.copy(~left) random_Z = randomization.rvs(T.shape[0]) T_random = T + random_Z initial_state[curr:(curr+p-i)] = T_random # initializing subgradients curr = curr + p-i j_seq[i] = np.argmax(np.abs(T_random)) s_seq[i] = np.sign(T_random[j_seq[i]]) #def find_index(v, idx1): # _sumF = 0 # _sumT = 0 # idx = idx1+1 # for i in range(v.shape[0]): # if (v[i] == False): # _sumF = _sumF + 1 # else: # _sumT = _sumT + 1 # if _sumT >= idx: break # return (_sumT + _sumF-1) T_complete[left] += random_Z left[np.argmax(np.abs(T_complete))] = False # conditioning linear_part = X[:, keep].T P = np.dot(linear_part.T, np.linalg.pinv(linear_part).T) I = np.identity(linear_part.shape[1]) R = I - P def full_projection(state, n=n, p=p, k=k): """ """ new_state = np.empty(state.shape, np.float) new_state[:n] = state[:n] curr = n for i in range(k): projection = projection_cone(p-i, j_seq[i], s_seq[i]) new_state[curr:(curr+p-i)] = projection(state[curr:(curr+p-i)]) curr = curr+p-i return new_state def full_gradient(state, n=n, p=p, k=k, X=X, mat=mat): data = state[:n] grad = np.empty(n + np.sum([i for i in range(p-k+1,p+1)])) grad[:n] = - data curr = n for i in range(k): subgrad = state[curr:(curr+p-i)] sign_vec = np.sign(-mat[i].T.dot(data) + subgrad) grad[curr:(curr + p - i)] = -sign_vec curr = curr+p-i grad[:n] += mat[i].dot(sign_vec) return grad sampler = projected_langevin(initial_state, full_gradient, full_projection, 1./p) samples = [] for i in range(Langevin_steps): if i>burning: old_state = sampler.state.copy() old_data = old_state[:n] sampler.next() new_state = sampler.state.copy() new_data = new_state[:n] new_data = np.dot(P, old_data) + np.dot(R, new_data) sampler.state[:n] = new_data samples.append(sampler.state.copy()) samples = np.array(samples) Z = samples[:,:n] pop = np.abs(mat[k-1].T.dot(Z.T)).max(0) fam = discrete_family(pop, np.ones_like(pop)) pval = fam.cdf(0, obs) pval = 2 * min(pval, 1 - pval) #stop print('pvalue:', pval) return pval
# Show the probability of a gap at least as big as 0, 0.5 and 1.0. from scipy.special import kolmogorov from scipy.stats import kstwobign kolmogorov([0, 0.5, 1.0]) # array([ 1. , 0.96394524, 0.26999967]) # Compare a sample of size 1000 drawn from a Laplace(0, 1) distribution against # the target distribution, a Normal(0, 1) distribution. from scipy.stats import norm, laplace n = 1000 np.random.seed(seed=233423) lap01 = laplace(0, 1) x = np.sort(lap01.rvs(n)) np.mean(x), np.std(x) # (-0.083073685397609842, 1.3676426568399822) # Construct the Empirical CDF and the K-S statistic Dn. target = norm(0,1) # Normal mean 0, stddev 1 cdfs = target.cdf(x) ecdfs = np.arange(n+1, dtype=float)/n gaps = np.column_stack([cdfs - ecdfs[:n], ecdfs[1:] - cdfs]) Dn = np.max(gaps) Kn = np.sqrt(n) * Dn print('Dn=%f, sqrt(n)*Dn=%f' % (Dn, Kn)) # Dn=0.058286, sqrt(n)*Dn=1.843153 print(chr(10).join(['For a sample of size n drawn from a N(0, 1) distribution:', ' the approximate Kolmogorov probability that sqrt(n)*Dn>=%f is %f' % (Kn, kolmogorov(Kn)), ' the approximate Kolmogorov probability that sqrt(n)*Dn<=%f is %f' % (Kn, kstwobign.cdf(Kn))]))
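# As a sanity check on the hand-rolled statistic above, scipy's kstest computes the same
# two-sided Dn directly. Sketch only; it reuses x from this snippet.
from scipy.stats import kstest

ks_result = kstest(x, 'norm')       # two-sided K-S test of x against N(0, 1)
print(ks_result.statistic)          # should agree with Dn up to floating-point error
print(ks_result.pvalue)             # finite-sample p-value for the same test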
return ax #------------------------------------------------------------ # Set up distributions: Npts = 5000 np.random.seed(0) x = np.linspace(-6, 6, 1000) # Gaussian distribution data_G = stats.norm(0, 1).rvs(Npts) pdf_G = stats.norm(0, 1).pdf(x) # Non-Gaussian distribution distributions = [ stats.laplace(0, 0.4), stats.norm(-4.0, 0.2), stats.norm(4.0, 0.2) ] weights = np.array([0.8, 0.1, 0.1]) weights /= weights.sum() data_NG = np.hstack( d.rvs(int(w * Npts)) for (d, w) in zip(distributions, weights)) pdf_NG = sum(w * d.pdf(x) for (d, w) in zip(distributions, weights)) #------------------------------------------------------------ # Plot results fig = plt.figure(figsize=(10, 5)) fig.subplots_adjust(hspace=0, left=0.05, right=0.95, wspace=0.05)
import math

# distribution
# In probability theory and statistics, the Laplace distribution is a continuous probability distribution
# named after Pierre-Simon Laplace.
# It is also sometimes called the double exponential distribution,
# because it can be thought of as two exponential distributions
# (with an additional location parameter) spliced together back-to-back,
# although the term is also sometimes used to refer to the Gumbel distribution.
# The difference between two independent identically distributed exponential random variables
# is governed by a Laplace distribution, as is a Brownian motion evaluated at an exponentially distributed random time.
# Increments of Laplace motion or a variance gamma process evaluated over the time scale also have a Laplace distribution.

ex = -1.0
scale = 2.0
distribution = stats.laplace(loc=ex, scale=scale)

# calculate the true dispersion/variance for the given Laplace distribution
# it is 2*scale^2: 2 * 4 = 8
dx = distribution.var()

# generate 1000 values from the distribution for the hist vs pdf plot
values = distribution.rvs(size=1000)

# x axis bounds
left = -10
right = 10

# hist and probability density function
plt.hist(values, 50, density=True)
x = np.linspace(left, right, num=100)
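# The "difference of two independent exponentials" remark above is easy to verify
# empirically. Illustrative sketch only; it reuses scale from this snippet, adds the
# imports the script assumes, and draws its own samples.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)

# Difference of two iid Exponential(scale) variables should follow Laplace(0, scale).
diff = rng.exponential(scale, size=100_000) - rng.exponential(scale, size=100_000)

print(diff.var(), 2 * scale**2)                                  # both close to 8 for scale = 2
print(stats.kstest(diff, stats.laplace(0, scale).cdf).pvalue)    # typically not small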
from scipy.stats import norminvgauss, laplace, poisson, cauchy, uniform import numpy as np import matplotlib.pyplot as plt import math as m sizes = [10, 50, 1000] rv_n = norminvgauss(1, 0) rv_l = laplace(scale=1 / m.sqrt(2), loc=0) rv_p = poisson(10) rv_c = cauchy() rv_u = uniform(loc=-m.sqrt(3), scale=2 * m.sqrt(3)) densities = [rv_n, rv_l, rv_p, rv_c, rv_u] names = ["Normal", "Laplace", "Poisson", "Cauchy", "Uniform"] for size in sizes: n = norminvgauss.rvs(1, 0, size=size) l = laplace.rvs(size=size, scale=1 / m.sqrt(2), loc=0) p = poisson.rvs(10, size=size) c = cauchy.rvs(size=size) u = uniform.rvs(size=size, loc=-m.sqrt(3), scale=2 * m.sqrt(3)) distributions = [n, l, p, c, u] build = list(zip(distributions, densities, names)) for histogram, density, name in build: fig, ax = plt.subplots(1, 1) ax.hist(histogram, density=True, histtype='stepfilled', alpha=0.6, color="green")
def test_lasso(s=5, n=500, p=20, randomization=laplace(0, 1)): """ Returns null and alternative values for the lasso. Model chosen by lasso (non-randomized), inference done as if we randomized. """ X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1., rho=0) print 'XTy', np.dot(X.T, y) lam_frac = 1. lam = sigma * lam_frac * np.mean( np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0)) #penalty = glm.gaussian(X, Y, coef=1. / sigma**2, quadratic=quadratic) #loss = #problem = rr.simple_problem(loss, penalty) #solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}) #initial_soln = problem.solve(**solve_args) clf = linear_model.Lasso( alpha=lam / (2 * float(n))) # should be alpha = lam/float(n) to be consistent clf.fit(X, y) soln = clf.coef_ active = (soln != 0) # boolean vector active_set = np.where(active)[ 0] # column numbers of covariates chosen by lasso # print 'active', active print 'active_set', active_set active_size = np.sum(active) print 'size of the active set', active_size inactive = ~active signs = np.sign(soln[active]) print 'true support', nonzero # LASSO region Ay < b pseudo_X_M = np.linalg.pinv(X[:, active]) pseudo_XT_M = np.linalg.pinv(X[:, active].T) P_M = np.dot(X[:, active], pseudo_X_M) #print 'active', X[:, active_set] #print np.dot(P_M, X[:, active_set]) A01 = np.dot(X[:, inactive].T, np.identity(n) - P_M) / lam A02 = -A01.copy() #print 'A01',A01 #print 'A02',A02 A0 = np.concatenate((A01, A02), axis=0) #print 'A0', A0 A1 = -np.dot(np.diag(signs), pseudo_X_M) A = np.concatenate((A0, A1), axis=0) #print signs #print pseudo_X_M #print A1 b01 = np.ones(p - active_size) - np.dot( np.dot(X[:, inactive].T, pseudo_XT_M), signs) b02 = np.ones(p - active_size) + np.dot( np.dot(X[:, inactive].T, pseudo_XT_M), signs) b0 = np.concatenate((b01, b02), axis=0) mat = np.linalg.inv(np.dot(X[:, active].T, X[:, active])) b1 = -lam * np.dot(np.dot(np.diag(signs), mat), signs) b = np.concatenate((b0, b1), axis=0) beta_bar = np.linalg.lstsq(X[:, active], y)[0] null, alt = [], [] for i, j in enumerate( active_set): # testing beta_i=0, corresponds to column X_j boot_samples, comparison = bootstrap(y, X, active, i, j) prob_selection = randomization_cdf(randomization, boot_samples, A, b) # print 'comparison', np.sum(comparison) # print np.asarray(comparison, dtype=int).shape num = np.inner(np.asarray(comparison, dtype=int), np.asarray(prob_selection)) #print 'num', num den = np.sum(np.asarray(prob_selection)) #print 'den', den p_value = num / den #p_value = 2 * min(p_value, 1-p_value) obs = beta_bar[i] print "observed: ", obs, "p value: ", p_value if j in nonzero: alt.append(p_value) else: null.append(p_value) return null, alt
import scipy.stats as sts
get_ipython().run_line_magic('matplotlib', 'inline')
import math

# # Defining the Laplace distribution
#
# More information is available [here](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D0%BF%D1%80%D0%B5%D0%B4%D0%B5%D0%BB%D0%B5%D0%BD%D0%B8%D0%B5_%D0%9B%D0%B0%D0%BF%D0%BB%D0%B0%D1%81%D0%B0)
#
# Mean: loc (here 5)
# Variance: 2*scale**2 (here 2)

# In[132]:
laplace_rv = sts.laplace(5)        # define the distribution (loc=5, scale=1)
sample = laplace_rv.rvs(1000)      # draw a sample of 1000 values

# In[53]:
print(sample)

# In[133]:
# compute the mean and the variance
xm = 1.   # minimum value
E = 5     # mean (expected value)
D = 2     # variance
print(E)
print(D)
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt

'''
The difference between sub-Gaussian and super-Gaussian is whether the excess kurtosis is
above or below zero. If it is above zero, the peak is sharper than a normal distribution
and the tails are correspondingly heavier: super-Gaussian. If it is below zero, the
distribution is flatter than a normal distribution: sub-Gaussian. In ICA it matters
whether p(z) is sub- or super-Gaussian, but which particular sub- or super-Gaussian
distribution is used does not matter.
'''

np.random.seed(0)

# generate data and samples
x = np.linspace(-4, 4, 500)
rv1 = ss.norm()
rv2 = ss.laplace(0, 1)
rv3 = ss.uniform(-2, 4)  # uniform takes (loc, scale): the support here is (-2, -2 + 4)
pdf1 = rv1.pdf(x)
pdf2 = rv2.pdf(x)
pdf3 = rv3.pdf(x)

N = 5000  # number of Monte Carlo samples
gaussian_x1, gaussian_x2 = rv1.rvs(N), rv1.rvs(N)
laplace_x1, laplace_x2 = rv2.rvs(N), rv2.rvs(N)
uniform_x1, uniform_x2 = rv3.rvs(N), rv3.rvs(N)

# plots
fig = plt.figure(figsize=(11, 9))
fig.canvas.set_window_title('subSuperGaussPlot')
ax = plt.subplot(221)
def __init__(self, mu, b=1): self.mu = mu self.b = b self.distribution = laplace(loc=mu, scale=b)
# Take a subsample of the function evaluations to use in the fit subsample_indicator = (rand(Ny) <= SUBSAMPLE_PROBABILITY) _hgx = hgx _hgy = hgy hgx = hgx[subsample_indicator] hgy = hgy[subsample_indicator] Ny = hgy.size # Noise samples (Mix of Laplacian and Normal) # ------------------------------ laplace_indicator = (rand(Ny) <= LAPLACE_PROBABILITY) hgsigma = SHOT_NOISE * sqrt(hgy) laplace_sigma = LAPLACIAN_SIGMA_SCALE * hgsigma samples_laplace = laplace().rvs(hgy.size) * laplace_sigma normal_sigma = hgsigma samples_normal = norm().rvs(hgy.size) * normal_sigma hgnoise = ( laplace_indicator * samples_laplace + (1-laplace_indicator) * samples_normal) hgy_noisy = hgy + hgnoise # Forward model # ----------------- # lets use sin and cos # -- N = NUMBER_TERMS A = zeros((N, hgx.size)) __A = zeros((N, _hgx.size))
""" # ============================================================================= import matplotlib.pyplot as plt import numpy as np from matplotlib.gridspec import GridSpec from scipy import stats np.random.seed(565656) dists = [stats.norm(0, 1), stats.uniform(-np.sqrt(3), np.sqrt(3)), stats.cauchy(), stats.expon(1), stats.laplace(np.sqrt(2))] labels = [r'$\mathcal{N}(0, 1)$', r'$\mathcal{U}(-\sqrt{3}, \sqrt{3})$', 'Cauchy', r'$\lambda e^{-\lambda x}, \lambda = 1$', r'$\frac{\lambda}{2} e^{-\lambda\|x\|}, \lambda = \sqrt{2}$'] N = 1000 fig = plt.figure(1, clear=True) gs = GridSpec(nrows=2, ncols=3) for i, (dist, label) in enumerate(zip(dists, labels)): ax = fig.add_subplot(gs[i])
def adaptive_integrate(f1, f2, key, value): """inputs: f1: function 1 of x, function string f2: function 2 of x, function string key: distribution type of random variable, string value: parameters of random distribution, tuple outputs: y: integral value """ if key.startswith('Uniform'): # stats.uniform defined in the range of [0, 1] # we have to convert it to [-1, 1] for the definition of Legendre basis # stats.uniform(location, scale) # or we can also do arbitrary type, will work on this later f_distr = stats.uniform(-1, 2) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, -1, 1) elif key.startswith('Gaussian'): # this is for hermite polynomial basis # we can do arbitrary type by not using standard normal distribution # will work on this later f_distr = stats.norm(0, 1) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, -npy.inf, npy.inf) elif key.startswith('Gamma'): # compare the stats.gamma with the one showed in UQLab tutorial (input) # stats.gamma accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # argument "1" is for the "standardized" format # or we can do arbitrary type later # value[0]: lambda, value[1]: k (a for stats.gamma) a = value[1] loc = 0 scale = 1. / value[0] # stats.gamma uses "beta" instead of "lambda" f_distr = stats.gamma(a, loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, 0, npy.inf) elif key.startswith('Beta'): # compare the stats.beta with the one showed in UQLab tutorial (input) # stats.beta accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # value[0]: alpha, value[1]: beta, no "loc" or "scale" needed # always in the range of [0, 1] alpha = value[0] beta = value[1] f_distr = stats.beta(alpha, beta) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, 0, 1) elif key.startswith('Exponential'): # value: lambda loc = 0 scale = 1. 
/ value f_distr = stats.expon(loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, 0, npy.inf) elif key.startswith('Lognormal'): # this part is very interesting # in UQLab they do Hermite for lognormal # and U the same as those from gaussian # then convert U to X using exp(U) # or they can specify arbitrary polynomial basis to be the same as here # we can do both, actually # value[0]: mu, value[1]:sigma s = value[1] loc = 0 scale = npy.exp(value[0]) f_distr = stats.lognorm(s, loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, 0, npy.inf) elif key.startswith('Gumbel'): # compare the stats.gumbel_r with the one showed in UQLab tutorial (input) # stats.gamma accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # value[0]: mu, value[1]: beta loc = value[0] scale = value[1] f_distr = stats.gumbel_r(loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, -npy.inf, npy.inf) elif key.startswith('Weibull'): # compare the stats.weibull_min with the one showed in UQLab tutorial (input) # stats.gamma accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # value[0]: lambda, value[1]: k k = value[1] loc = 0 scale = value[0] f_distr = stats.weibull_min(k, loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, 0, npy.inf) elif key.startswith('Triangular'): # compare the stats.triang with the one showed in UQLab tutorial (input) # stats.gamma accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # value: c, no "loc" and "scale" needed # always in the range of [0, 1] c = value f_distr = stats.triang(c) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, 0, 1) elif key.startswith('Logistic'): # compare the stats.logistic with the one showed in UQLab tutorial (input) # stats.gamma accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # value[0]: location, value[1]: scale loc = value[0] scale = value[1] f_distr = stats.logistic(loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, -npy.inf, npy.inf) elif key.startswith('Laplace'): # compare the stats.laplace with the one showed in UQLab tutorial (input) # stats.gamma accepts only one value, but UQLab accepts two # we can do the location and scale to make them the same # value[0]: location, value[1]: scale loc = value[0] scale = value[1] f_distr = stats.laplace(loc, scale) f0 = lambda x: f_distr.pdf(x) f = lambda x: f1(x) * f2(x) * f0(x) y = integrate.quad(f, -npy.inf, npy.inf) else: print 'other types of statistical distributsions are coming soon ...' return y[0]
def case1(index=CASE_1_ATTRIBUTE_INDEX,output=True,ret='accuracy'): accuracy_in_each_turn = list() precision_in_each_turn_spam = list() recall_in_each_turn_spam = list() precision_in_each_turn_ham = list() recall_in_each_turn_ham = list() m = np.loadtxt(open("resources/normalized_data.csv","rb"),delimiter=',') shuffled = np.random.permutation(m) valid.validate_cross_validation(NUMBER_OF_ROUNDS,TRAIN_TEST_RATIO) # equiprobable priors prior_spam = 0.5 prior_ham = 0.5 for i in xrange(NUMBER_OF_ROUNDS): # we're using cross-validation so each iteration we take a different # slice of the data to serve as test set train_set,test_set = prep.split_sets(shuffled,TRAIN_TEST_RATIO,i) #parameter estimation sample_mean_word_spam = nb.take_mean_spam(train_set,index,SPAM_ATTR_INDEX) sample_mean_word_ham = nb.take_mean_ham(train_set,index,SPAM_ATTR_INDEX) sample_variance_word_spam = nb.take_variance_spam(train_set,index,SPAM_ATTR_INDEX) sample_variance_word_ham = nb.take_variance_ham(train_set,index,SPAM_ATTR_INDEX) #sample standard deviations from sample variance sample_std_dev_spam = sample_variance_word_spam ** (1/2.0) sample_std_dev_ham = sample_variance_word_ham ** (1/2.0) hits = 0.0 misses = 0.0 #number of instances corretcly evaluated as spam correctly_is_spam = 0.0 #total number of spam instances is_spam = 0.0 #total number of instances evaluated as spam guessed_spam = 0.0 #number of instances correctly evaluated as ham correctly_is_ham = 0.0 #total number of ham instances is_ham = 0.0 #total number of instances evaluated as ham guessed_ham = 0.0 # now we test the hypothesis against the test set for row in test_set: # nao precisa dividir pelo termo de normalizacao pois so queremos saber qual e o maior! posterior_spam = prior_spam * stats.laplace(sample_mean_word_spam, sample_std_dev_spam).pdf(row[index]) posterior_ham = prior_ham * stats.laplace(sample_mean_word_ham, sample_std_dev_ham).pdf(row[index]) # whichever is greater - that will be our evaluation if posterior_spam > posterior_ham: guess = 1 else: guess = 0 if(row[SPAM_ATTR_INDEX] == guess): hits += 1 else: misses += 1 # we'll use these to calculate metrics if (row[SPAM_ATTR_INDEX] == 1 ): is_spam += 1 if guess == 1: guessed_spam += 1 correctly_is_spam += 1 else: guessed_ham += 1 else: is_ham += 1 if guess == 1: guessed_spam += 1 else: guessed_ham += 1 correctly_is_ham += 1 #accuracy = number of correctly evaluated instances/ # number of instances # # accuracy = hits/(hits+misses) #precision_spam = number of correctly evaluated instances as spam/ # number of spam instances # # # in order to avoid divisions by zero in case nothing was found if(is_spam == 0): precision_spam = 0 else: precision_spam = correctly_is_spam/is_spam #recall_spam = number of correctly evaluated instances as spam/ # number of evaluated instances como spam # # # in order to avoid divisions by zero in case nothing was found if(guessed_spam == 0): recall_spam = 0 else: recall_spam = correctly_is_spam/guessed_spam #precision_ham = number of correctly evaluated instances as ham/ # number of ham instances # # # in order to avoid divisions by zero in case nothing was found if(is_ham == 0): precision_ham = 0 else: precision_ham = correctly_is_ham/is_ham #recall_ham = number of correctly evaluated instances as ham/ # number of evaluated instances como ham # # # in order to avoid divisions by zero in case nothing was found if(guessed_ham == 0): recall_ham = 0 else: recall_ham = correctly_is_ham/guessed_ham accuracy_in_each_turn.append(accuracy) 
precision_in_each_turn_spam.append(precision_spam) recall_in_each_turn_spam.append(recall_spam) precision_in_each_turn_ham.append(precision_ham) recall_in_each_turn_ham.append(recall_ham) # calculation of means for each metric at the end mean_accuracy = np.mean(accuracy_in_each_turn) std_dev_accuracy = np.std(accuracy_in_each_turn) variance_accuracy = np.var(accuracy_in_each_turn) mean_precision_spam = np.mean(precision_in_each_turn_spam) std_dev_precision_spam = np.std(precision_in_each_turn_spam) variance_precision_spam = np.var(precision_in_each_turn_spam) mean_recall_spam = np.mean(recall_in_each_turn_spam) std_dev_recall_spam = np.std(recall_in_each_turn_spam) variance_recall_spam = np.var(recall_in_each_turn_spam) mean_precision_ham = np.mean(precision_in_each_turn_ham) std_dev_precision_ham = np.std(precision_in_each_turn_ham) variance_precision_ham = np.var(precision_in_each_turn_ham) mean_recall_ham = np.mean(recall_in_each_turn_ham) std_dev_recall_ham = np.std(recall_in_each_turn_ham) variance_recall_ham = np.var(recall_in_each_turn_ham) if output: print "\033[1;32m" print '=============================================' print 'CASE 1 - ONE ATTRIBUTE - USING LAPLACE MODEL' print '=============================================' print "\033[00m" print 'MEAN ACCURACY: '+str(round(mean_accuracy,5)) print 'STD. DEV. OF ACCURACY: '+str(round(std_dev_accuracy,5)) print 'VARIANCE OF ACCURACY: '+str(round(variance_accuracy,8)) print '' print 'MEAN PRECISION FOR SPAM: '+str(round(mean_precision_spam,5)) print 'STD. DEV. OF PRECISION FOR SPAM: '+str(round(std_dev_precision_spam,5)) print 'VARIANCE OF PRECISION FOR SPAM: '+str(round(variance_precision_spam,8)) print '' print 'MEAN RECALL FOR SPAM: '+str(round(mean_recall_spam,5)) print 'STD. DEV. OF RECALL FOR SPAM: '+str(round(std_dev_recall_spam,5)) print 'VARIANCE OF RECALL FOR SPAM: '+str(round(variance_recall_spam,8)) print '' print 'MEAN PRECISION FOR HAM: '+str(round(mean_precision_ham,5)) print 'STD. DEV. OF PRECISION FOR HAM: '+str(round(std_dev_precision_ham,5)) print 'VARIANCE OF PRECISION FOR HAM: '+str(round(variance_precision_ham,8)) print '' print 'MEAN RECALL FOR HAM: '+str(round(mean_recall_ham,5)) print 'STD. DEV. OF RECALL FOR HAM: '+str(round(std_dev_recall_ham,5)) print 'VARIANCE OF RECALL FOR HAM: '+str(round(variance_recall_ham,8)) # we'll only use these return values to compute rankings # for example in script which_attribute_case_1 if ret == 'utility': return mean_accuracy * mean_precision_ham elif ret =='accuracy': return mean_accuracy else: print 'UNKNOWN METRIC: '+ret sys.exit()
from astroML.plotting import setup_text_plots setup_text_plots(fontsize=8, usetex=True) #------------------------------------------------------------ # Define the distribution parameters to be plotted delta_values = [0.5, 1.0, 2.0] linestyles = ['-', '--', ':'] mu = 0 x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions fig, ax = plt.subplots(figsize=(5, 3.75)) for delta, ls in zip(delta_values, linestyles): dist = laplace(mu, delta) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta)) plt.xlim(-6, 6) plt.ylim(0, 1.0) plt.xlabel('$x$') plt.ylabel(r'$p(x|\mu,\Delta)$') plt.title('Laplace Distribution') plt.legend()
ax.set_xlim(-5, 5) ax.set_ylim(0, 0.7001) ax.set_ylabel('$p(x)$') ax.xaxis.set_major_formatter(plt.NullFormatter()) # trick to show multiple legends leg1 = ax.legend([l1], [l1.get_label()], loc=1) leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2) ax.add_artist(leg1) ax.set_title('Skew $\Sigma$ and Kurtosis $K$') # next show distributions with different kurtosis ax = fig.add_subplot(212) x = np.linspace(-5, 5, 1000) l1, = ax.plot(x, stats.laplace(0, 1).pdf(x), '--k', label=r'${\rm Laplace,}\ K=+3$') l2, = ax.plot(x, stats.norm(0, 1).pdf(x), '-k', label=r'${\rm Gaussian,}\ K=0$') l3, = ax.plot(x, stats.cosine(0, 1).pdf(x), '-.k', label=r'${\rm Cosine,}\ K=-0.59$') l4, = ax.plot(x, stats.uniform(-2, 4).pdf(x), ':k', label=r'${\rm Uniform,}\ K=-1.2$') ax.set_xlim(-5, 5) ax.set_ylim(0, 0.55) ax.set_xlabel('$x$') ax.set_ylabel('$p(x)$') # trick to show multiple legends leg1 = ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=2)
def run(dataset, measurements, eps=1.0, delta=0.0, bounded=True, engine='MD', options={}, iters=10000, seed=None, metric='L2', elim_order=None, frequency=1, workload=None): """ Run a mechanism that measures the given measurements and runs inference. This is a convenience method for running end-to-end experiments. """ domain = dataset.domain total = None state = np.random.RandomState(seed) if len(measurements) >= 1 and type(measurements[0][0]) is str: matrix = lambda proj: sparse.eye(domain.project(proj).size()) measurements = [(proj, matrix(proj)) for proj in measurements] l1 = 0 l2 = 0 for _, Q in measurements: l1 += np.abs(Q).sum(axis=0).max() try: l2 += Q.power(2).sum(axis=0).max() # for spares matrices except: l2 += np.square(Q).sum(axis=0).max() # for dense matrices if bounded: total = dataset.df.shape[0] l1 *= 2 l2 *= 2 if delta > 0: noise = norm(loc=0, scale=np.sqrt(l2 * 2 * np.log(2 / delta)) / eps) else: noise = laplace(loc=0, scale=l1 / eps) if workload is None: workload = measurements truth = [] for proj, W, in workload: x = dataset.project(proj).datavector() y = W.dot(x) truth.append((W, y, proj)) answers = [] for proj, Q in measurements: x = dataset.project(proj).datavector() z = noise.rvs(size=Q.shape[0], random_state=state) y = Q.dot(x) answers.append((Q, y + z, 1.0, proj)) estimator = FactoredInference(domain, metric=metric, iters=iters, warm_start=False, elim_order=elim_order) logger = Logger(estimator, true_answers=truth, frequency=frequency) model = estimator.estimate(answers, total, engine=engine, callback=logger, options=options) return model, logger, answers
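# For reference, the delta == 0 branch above is the standard Laplace mechanism: noise drawn
# from Laplace(0, Delta_1 / eps), where Delta_1 is the L1 sensitivity accumulated in l1,
# gives eps-differential privacy. Minimal illustrative sketch; the query, sensitivity and
# eps below are made up and not taken from the code above.
import numpy as np
from scipy.stats import laplace

eps = 1.0
sensitivity = 1.0            # L1 sensitivity of a single counting query
true_count = 42              # hypothetical exact query answer

# eps-DP release: add Laplace noise with scale = sensitivity / eps.
noise = laplace(loc=0, scale=sensitivity / eps)
noisy_count = true_count + noise.rvs(random_state=np.random.RandomState(0))
print(noisy_count)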
def case2(indexes=CASE_2_ATTRIBUTE_INDEXES,output=True): accuracy_in_each_turn = list() precision_in_each_turn_spam = list() recall_in_each_turn_spam = list() precision_in_each_turn_ham = list() recall_in_each_turn_ham = list() m = np.loadtxt(open("resources/normalized_data.csv","rb"),delimiter=',') shuffled = np.random.permutation(m) valid.validate_cross_validation(NUMBER_OF_ROUNDS,TRAIN_TEST_RATIO) # equiprobable priors prior_spam = 0.5 prior_ham = 0.5 for i in xrange(NUMBER_OF_ROUNDS): # we're using cross-validation so each iteration we take a different # slice of the data to serve as test set train_set,test_set = prep.split_sets(shuffled,TRAIN_TEST_RATIO,i) #parameter estimation #but now we take 10 attributes into consideration sample_means_word_spam = list() sample_means_word_ham = list() sample_variances_word_spam = list() sample_variances_word_ham = list() for attr_index in indexes: sample_means_word_spam.append(nb.take_mean_spam(train_set,attr_index,SPAM_ATTR_INDEX)) sample_means_word_ham.append(nb.take_mean_ham(train_set,attr_index,SPAM_ATTR_INDEX)) sample_variances_word_spam.append(nb.take_variance_spam(train_set,attr_index,SPAM_ATTR_INDEX)) sample_variances_word_ham.append(nb.take_variance_ham(train_set,attr_index,SPAM_ATTR_INDEX)) #sample standard deviations from sample variances sample_std_devs_spam = map(lambda x: x ** (1/2.0), sample_variances_word_spam) sample_std_devs_ham = map(lambda x: x ** (1/2.0), sample_variances_word_ham) hits = 0.0 misses = 0.0 #number of instances correctly evaluated as spam correctly_is_spam = 0.0 #total number of spam instances is_spam = 0.0 #total number of instances evaluated as spam guessed_spam = 0.0 #number of instances correctly evaluated as ham correctly_is_ham = 0.0 #total number of ham instances is_ham = 0.0 #total number of instances evaluated as ham guessed_ham = 0.0 # now we test the hypothesis against the test set for row in test_set: # ou seja, o produto de todas as prob. condicionais das palavras dada a classe # eu sei que ta meio confuso, mas se olhar com cuidado eh bonito fazer isso tudo numa linha soh! =) product_of_all_conditional_probs_spam = reduce(lambda acc,cur: acc * stats.laplace(sample_means_word_spam[cur], sample_std_devs_spam[cur]).pdf(row[indexes[cur]]) , xrange(10), 1) # nao precisa dividir pelo termo de normalizacao pois so queremos saber qual e o maior! 
posterior_spam = prior_spam * product_of_all_conditional_probs_spam product_of_all_conditional_probs_ham = reduce(lambda acc,cur: acc * stats.laplace(sample_means_word_ham[cur], sample_std_devs_ham[cur]).pdf(row[indexes[cur]]) , xrange(10), 1) posterior_ham = prior_ham * product_of_all_conditional_probs_ham # whichever is greater - that will be our prediction if posterior_spam > posterior_ham: guess = 1 else: guess = 0 if(row[SPAM_ATTR_INDEX] == guess): hits += 1 else: misses += 1 # we'll use these to calculate metrics if (row[SPAM_ATTR_INDEX] == 1 ): is_spam += 1 if guess == 1: guessed_spam += 1 correctly_is_spam += 1 else: guessed_ham += 1 else: is_ham += 1 if guess == 1: guessed_spam += 1 else: guessed_ham += 1 correctly_is_ham += 1 #accuracy = number of correctly evaluated instances/ # number of instances # # accuracy = hits/(hits+misses) #precision_spam = number of correctly evaluated instances as spam/ # number of spam instances # # # in order to avoid divisions by zero in case nothing was found if(is_spam == 0): precision_spam = 0 else: precision_spam = correctly_is_spam/is_spam #recall_spam = number of correctly evaluated instances as spam/ # number of evaluated instances como spam # # # in order to avoid divisions by zero in case nothing was found if(guessed_spam == 0): recall_spam = 0 else: recall_spam = correctly_is_spam/guessed_spam #precision_ham = number of correctly evaluated instances as ham/ # number of ham instances # # # in order to avoid divisions by zero in case nothing was found if(is_ham == 0): precision_ham = 0 else: precision_ham = correctly_is_ham/is_ham #recall_ham = number of correctly evaluated instances as ham/ # number of evaluated instances como ham # # # in order to avoid divisions by zero in case nothing was found if(guessed_ham == 0): recall_ham = 0 else: recall_ham = correctly_is_ham/guessed_ham accuracy_in_each_turn.append(accuracy) precision_in_each_turn_spam.append(precision_spam) recall_in_each_turn_spam.append(recall_spam) precision_in_each_turn_ham.append(precision_ham) recall_in_each_turn_ham.append(recall_ham) # calculation of means for each metric at the end mean_accuracy = np.mean(accuracy_in_each_turn) std_dev_accuracy = np.std(accuracy_in_each_turn) variance_accuracy = np.var(accuracy_in_each_turn) mean_precision_spam = np.mean(precision_in_each_turn_spam) std_dev_precision_spam = np.std(precision_in_each_turn_spam) variance_precision_spam = np.var(precision_in_each_turn_spam) mean_recall_spam = np.mean(recall_in_each_turn_spam) std_dev_recall_spam = np.std(recall_in_each_turn_spam) variance_recall_spam = np.var(recall_in_each_turn_spam) mean_precision_ham = np.mean(precision_in_each_turn_ham) std_dev_precision_ham = np.std(precision_in_each_turn_ham) variance_precision_ham = np.var(precision_in_each_turn_ham) mean_recall_ham = np.mean(recall_in_each_turn_ham) std_dev_recall_ham = np.std(recall_in_each_turn_ham) variance_recall_ham = np.var(recall_in_each_turn_ham) if output: print "\033[1;32m" print '=============================================' print 'CASE 2 - TEN ATTRIBUTES - USING LAPLACE MODEL' print '=============================================' print "\033[00m" print 'MEAN ACCURACY: '+str(round(mean_accuracy,5)) print 'STD. DEV. OF ACCURACY: '+str(round(std_dev_accuracy,5)) print 'VARIANCE OF ACCURACY: '+str(round(variance_accuracy,8)) print '' print 'MEAN PRECISION FOR SPAM: '+str(round(mean_precision_spam,5)) print 'STD. DEV. 
OF PRECISION FOR SPAM: '+str(round(std_dev_precision_spam,5)) print 'VARIANCE OF PRECISION FOR SPAM: '+str(round(variance_precision_spam,8)) print '' print 'MEAN RECALL FOR SPAM: '+str(round(mean_recall_spam,5)) print 'STD. DEV. OF RECALL FOR SPAM: '+str(round(std_dev_recall_spam,5)) print 'VARIANCE OF RECALL FOR SPAM: '+str(round(variance_recall_spam,8)) print '' print 'MEAN PRECISION FOR HAM: '+str(round(mean_precision_ham,5)) print 'STD. DEV. OF PRECISION FOR HAM: '+str(round(std_dev_precision_ham,5)) print 'VARIANCE OF PRECISION FOR HAM: '+str(round(variance_precision_ham,8)) print '' print 'MEAN RECALL FOR HAM: '+str(round(mean_recall_ham,5)) print 'STD. DEV. OF RECALL FOR HAM: '+str(round(std_dev_recall_ham,5)) print 'VARIANCE OF RECALL FOR HAM: '+str(round(variance_recall_ham,8))
import scipy.stats as st import numpy as np import matplotlib.pyplot as plt distributions = { 'normal': st.norm(loc=0, scale=1), 'laplace': st.laplace(loc=0, scale=1 / np.sqrt(2)), 'cauchy': st.cauchy(), 'uniform': st.uniform(loc=-np.sqrt(3), scale=2 * np.sqrt(3)), 'poisson': st.poisson(5) } def Zr(x): return (np.amin(x) + np.amax(x)) / 2 def Zq(x): return (np.quantile(x, 1 / 4) + np.quantile(x, 3 / 4)) / 2 def Ztr(x): n = x.size r = (int)(n / 4) sum1 = 0 for i in range(r, n - r): sum1 += x[i] return sum1 / (n - 2 * r) pos_characteristics = {
def _kstest(self, loc, scale, samples): # Uses the Kolmogorov-Smirnov test for goodness of fit. ks, _ = sp_stats.kstest(samples, sp_stats.laplace(loc, scale=scale).cdf) # Return True when the test passes. return ks < 0.02
import matplotlib.pyplot as plt import numpy as np from scipy.stats import laplace # Set true theta parameter theta = 5 # Define the random variable based on that f_Xi = laplace(loc=theta, scale=1) # Set sample size (n) and number of bootstrap iteration (B) n = 50 B = 100000 # Compute Cramér-Rao lower bound (CRLB) CRLB = 1 / n # Define theta_hat (which uses all bootstrap samples as an input and calculates B theta_hats) def theta_hats(X): return np.median(X, axis=0) # Calculate variance of theta_hat across bootstrap samples var_theta_hats = theta_hats(f_Xi.rvs(size=(n, B))).var() # Print the CRLB, variance of the theta_hats, and percentage deviance print('CRLB = ' + str(CRLB) + ', Var(theta_hat_MM) = ' + str(var_theta_hats) + r', % deviation: ' + str(np.abs(var_theta_hats - CRLB) / CRLB * 100))
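# One caveat on the comparison above: the sample median is the maximum-likelihood estimator
# of the Laplace location, and its variance attains the CRLB of 1/n only asymptotically, so
# some percentage deviation at n = 50 is expected. A quick illustrative check (reusing
# laplace, np and theta from the script; the replication count of 2000 is arbitrary) that
# n * Var(median) approaches 1 as n grows:
for n_check in (50, 500, 5000):
    draws = laplace(loc=theta, scale=1).rvs(size=(n_check, 2000))
    print(n_check, n_check * np.median(draws, axis=0).var())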
plt.figure(3) plt.plot(support[ix], rv.pdf(support[ix]), label='Actual') plt.plot(support[ix], dens_normal.pdf()[ix], label='Scott') plt.plot(support[ix], dens_cvls.pdf()[ix], label='CV_LS') plt.plot(support[ix], dens_cvml.pdf()[ix], label='CV_ML') plt.title("Nonparametric Estimation of the Density of Pareto " \ "Distributed Random Variable") plt.legend(('Actual', 'Scott', 'CV_LS', 'CV_ML')) # Laplace Distribution mu = 0 s = 1 nobs = 250 support = np.random.laplace(mu, s, size=nobs) rv = stats.laplace(mu, s) ix = np.argsort(support) dens_normal = KDEMultivariate(data=[support], var_type='c', bw='normal_reference') dens_cvls = KDEMultivariate(data=[support], var_type='c', bw='cv_ls') dens_cvml = KDEMultivariate(data=[support], var_type='c', bw='cv_ml') plt.figure(4) plt.plot(support[ix], rv.pdf(support[ix]), label='Actual') plt.plot(support[ix], dens_normal.pdf()[ix], label='Scott') plt.plot(support[ix], dens_cvls.pdf()[ix], label='CV_LS') plt.plot(support[ix], dens_cvml.pdf()[ix], label='CV_ML') plt.title("Nonparametric Estimation of the Density of Laplace " \ "Distributed Random Variable") plt.legend(('Actual', 'Scott', 'CV_LS', 'CV_ML'))
import scipy.stats as st import numpy as np norm = st.norm(loc=0, scale=1) N_laplace = 25 laplace = st.laplace(loc=0, scale=1 / np.sqrt(2)) x_laplace = laplace.rvs(N_laplace) file = open('laplace2.txt', 'w') m = x_laplace.mean() sigma = x_laplace.std() file.write('m = ' + str(m) + 'sigma = ' + str(sigma) + '\n') k = int(1.72 * np.cbrt(N_laplace)) if k % 2 == 1: k -= 1 left = -0.2 right = 0.2 step = (right - left) / (k - 2) delta = [left + i * step for i in range(0, k - 1)] n = np.zeros(k) for r in x_laplace: last = True for i in range(0, len(delta)): if r < delta[i]: n[i] += 1 last = False
# "Statistics, Data Mining, and Machine Learning in Astronomy" (2013) # For more information, see http://astroML.github.com import numpy as np from scipy.stats import laplace from matplotlib import pyplot as plt #------------------------------------------------------------ # Define the distribution parameters to be plotted delta_values = [0.5, 1.0, 2.0] linestyles = ['-', '--', ':'] mu = 0 x = np.linspace(-10, 10, 1000) #------------------------------------------------------------ # plot the distributions for delta, ls in zip(delta_values, linestyles): dist = laplace(mu, delta) plt.plot(x, dist.pdf(x), ls=ls, c='black', label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta), lw=2) plt.xlim(-7, 7) plt.ylim(0, 1.1) plt.xlabel('$x$', fontsize=14) plt.ylabel(r'$P(x|\mu,\Delta)$', fontsize=14) plt.title('Laplace Distribution') plt.legend() plt.show()
def case3(output=True):
    accuracy_in_each_turn = list()
    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()
    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv", "rb"), delimiter=',')
    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in xrange(NUMBER_OF_ROUNDS):
        # we're using cross-validation, so on each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        # but now we take ALL attributes into consideration
        sample_means_word_spam = list()
        sample_means_word_ham = list()
        sample_variances_word_spam = list()
        sample_variances_word_ham = list()

        # all but the last one (the class label)
        for attr_index in xrange(57):
            sample_means_word_spam.append(nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_spam.append(nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances
        sample_std_devs_spam = map(lambda x: x ** (1 / 2.0), sample_variances_word_spam)
        sample_std_devs_ham = map(lambda x: x ** (1 / 2.0), sample_variances_word_ham)

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0
        # total number of spam instances
        is_spam = 0.0
        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0
        # total number of ham instances
        is_ham = 0.0
        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:
            # i.e., the product of all conditional probabilities of the attributes given the class;
            # the Laplace likelihood uses the sample mean as loc and the sample std. dev. as scale.
            # it may look confusing, but it is neat to do it all in a single expression! =)
            product_of_all_conditional_probs_spam = reduce(
                lambda acc, cur: acc * stats.laplace(sample_means_word_spam[cur], sample_std_devs_spam[cur]).pdf(row[cur]),
                xrange(57), 1)

            # no need to divide by the normalization term since we only want to know which posterior is larger!
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam

            product_of_all_conditional_probs_ham = reduce(
                lambda acc, cur: acc * stats.laplace(sample_means_word_ham[cur], sample_std_devs_ham[cur]).pdf(row[cur]),
                xrange(57), 1)

            posterior_ham = prior_ham * product_of_all_conditional_probs_ham

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1
                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances /
        #            number of instances
        accuracy = hits / (hits + misses)

        # precision_spam = number of instances correctly evaluated as spam /
        #                  number of instances evaluated as spam
        #
        # in order to avoid divisions by zero in case nothing was found
        if guessed_spam == 0:
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam / guessed_spam

        # recall_spam = number of instances correctly evaluated as spam /
        #               number of spam instances
        #
        # in order to avoid divisions by zero in case nothing was found
        if is_spam == 0:
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam / is_spam

        # precision_ham = number of instances correctly evaluated as ham /
        #                 number of instances evaluated as ham
        #
        # in order to avoid divisions by zero in case nothing was found
        if guessed_ham == 0:
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham / guessed_ham

        # recall_ham = number of instances correctly evaluated as ham /
        #              number of ham instances
        #
        # in order to avoid divisions by zero in case nothing was found
        if is_ham == 0:
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham / is_ham

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end
    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)

    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)

    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)

    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)

    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 3 - ALL ATTRIBUTES - USING LAPLACE MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: ' + str(round(mean_accuracy, 5))
        print 'STD. DEV. OF ACCURACY: ' + str(round(std_dev_accuracy, 5))
        print 'VARIANCE OF ACCURACY: ' + str(round(variance_accuracy, 8))
        print ''
        print 'MEAN PRECISION FOR SPAM: ' + str(round(mean_precision_spam, 5))
        print 'STD. DEV. OF PRECISION FOR SPAM: ' + str(round(std_dev_precision_spam, 5))
        print 'VARIANCE OF PRECISION FOR SPAM: ' + str(round(variance_precision_spam, 8))
        print ''
        print 'MEAN RECALL FOR SPAM: ' + str(round(mean_recall_spam, 5))
        print 'STD. DEV. OF RECALL FOR SPAM: ' + str(round(std_dev_recall_spam, 5))
        print 'VARIANCE OF RECALL FOR SPAM: ' + str(round(variance_recall_spam, 8))
        print ''
        print 'MEAN PRECISION FOR HAM: ' + str(round(mean_precision_ham, 5))
        print 'STD. DEV. OF PRECISION FOR HAM: ' + str(round(std_dev_precision_ham, 5))
        print 'VARIANCE OF PRECISION FOR HAM: ' + str(round(variance_precision_ham, 8))
        print ''
        print 'MEAN RECALL FOR HAM: ' + str(round(mean_recall_ham, 5))
        print 'STD. DEV. OF RECALL FOR HAM: ' + str(round(std_dev_recall_ham, 5))
        print 'VARIANCE OF RECALL FOR HAM: ' + str(round(variance_recall_ham, 8))
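# The products above multiply 57 densities that are often much smaller than 1 and can underflow
# to 0.0, making both posteriors compare equal. A minimal, hypothetical sketch (not part of the
# original script) of the same Laplace naive Bayes scoring done in log-space with
# scipy.stats.laplace.logpdf; the helper name and vectorised parameters are illustrative:
import numpy as np
from scipy import stats

def laplace_log_posterior(row_attrs, means, std_devs, log_prior):
    # a sum of log-densities replaces the product of densities; the argmax is unchanged
    return log_prior + np.sum(stats.laplace(np.asarray(means), np.asarray(std_devs)).logpdf(row_attrs))

# usage sketch with the variables defined in case3 above:
# guess = int(laplace_log_posterior(row[:57], sample_means_word_spam, sample_std_devs_spam, np.log(prior_spam))
#             > laplace_log_posterior(row[:57], sample_means_word_ham, sample_std_devs_ham, np.log(prior_ham)))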
import numpy as np
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from scipy.stats import laplace


def CI_sampler_regressor(X_in, Y_in, Z_in, train_len=-1, nthread=4, max_depth=6,
                         colsample_bytree=0.8, n_estimators=200, noise='Normal', perc=0.3):
    np.random.seed(11)
    assert (type(X_in) == np.ndarray), "Not an array"
    assert (type(Y_in) == np.ndarray), "Not an array"
    assert (type(Z_in) == np.ndarray), "Not an array"

    nx, dx = X_in.shape
    ny, dy = Y_in.shape
    nz, dz = Z_in.shape

    assert (nx == ny), "Dimension Mismatch"
    assert (nz == ny), "Dimension Mismatch"
    assert (nx == nz), "Dimension Mismatch"

    samples = np.hstack([X_in, Y_in, Z_in]).astype(np.float32)

    if train_len == -1:
        train_len = 2 * len(X_in) // 3
    assert (train_len < nx), "Training length cannot be larger than total length"

    # split the pooled samples into two halves
    data1 = samples[0:nx // 2, :]
    data2 = samples[nx // 2::, :]

    multioutputregressor = MultiOutputRegressor(
        estimator=xgb.XGBRegressor(objective='reg:linear', max_depth=max_depth,
                                   colsample_bytree=1.0, n_estimators=n_estimators,
                                   nthread=nthread))

    Xset = range(0, dx)
    Yset = range(dx, dx + dy)
    Zset = range(dx + dy, dx + dy + dz)

    X1, Y1, Z1 = data1[:, Xset], data1[:, Yset], data1[:, Zset]
    X2, Y2, Z2 = data2[:, Xset], data2[:, Yset], data2[:, Zset]

    if noise == 'Normal':
        # regress Y on Z on the first half and model the residuals as Gaussian noise
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        cov = np.cov(np.transpose(Y1hat - Y1))
        print('Calculated Covariance: ')
        print(cov)
        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        try:
            m1, m2 = cov.shape
            Nprime = np.random.multivariate_normal(np.zeros(m1), cov, size=n2)
        except:
            m1 = 1
            Nprime = np.random.normal(scale=np.sqrt(cov), size=[n2, 1])
    elif noise == 'Laplace':
        # same idea, but with Laplace noise rescaled to the residual std. dev. of each output
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        E = Y1 - Y1hat
        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        p, q = E.shape
        s = np.std(E[:, 0])
        L = laplace()
        r = L.rvs(size=(n2, 1))
        s2 = np.std(r)
        r = (s / s2) * r
        Nprime = r
        for l in range(1, q):
            s = np.std(E[:, l])
            L = laplace()
            r = L.rvs(size=(n2, 1))
            s2 = np.std(r)
            r = (s / s2) * r
            # stack the noise column for output dimension l alongside the previous columns
            Nprime = np.hstack((Nprime, r))
    elif noise == 'Mixture':
        # Gaussian noise on a random fraction of rows, Laplace noise on the rest
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        cov = np.cov(np.transpose(Y1hat - Y1))
        print('Calculated Covariance: ')
        print(cov)
        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        try:
            m1, m2 = cov.shape
            NprimeG = np.random.multivariate_normal(np.zeros(m1), cov, size=n2)
        except:
            m1 = 1
            NprimeG = np.random.normal(scale=np.sqrt(cov), size=[n2, 1])
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        E = Y1 - Y1hat
        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        p, q = E.shape
        s = np.std(E[:, 0])
        L = laplace()
        r = L.rvs(size=(n2, 1))
        s2 = np.std(r)
        r = (s / s2) * r
        Nprime = r
        for l in range(1, q):
            s = np.std(E[:, l])
            L = laplace()
            r = L.rvs(size=(n2, 1))
            s2 = np.std(r)
            r = (s / s2) * r
            Nprime = np.hstack((Nprime, r))
        indices = np.random.choice(p, size=int(perc * p), replace=False)
        Nprime[indices, :] = NprimeG[indices, :]
    else:
        assert False, 'Not Implemented Error'

    # second half gets regression predictions plus sampled noise in place of its Y values
    yprime = Yprime + Nprime
    data2_new = np.hstack([X2, yprime, Z2])

    # label the original half 1 and the resampled half 0, shuffle, and split into train/test
    y1 = np.ones([len(data1), 1])
    y2 = np.zeros([len(data2_new), 1])
    at1 = np.hstack([data1, y1])
    at2 = np.hstack([data2_new, y2])
    all_train = np.vstack([at1, at2])
    shuffle = np.random.permutation(len(all_train))
    data_final = all_train[shuffle, :]
    l, m = data_final.shape
    Xdata = data_final[:, 0:m - 1]
    Ydata = data_final[:, m - 1]
    Xtrain = Xdata[0:train_len, :]
    Ytrain = Ydata[0:train_len]
    Xtest = Xdata[train_len::, :]
    Ytest = Ydata[train_len::]
    return Xtrain, Ytrain, Xtest, Ytest
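# Hypothetical usage sketch (not part of the original module): the sampler above turns a
# conditional-independence question into a binary classification problem, so a quick check is to
# train any classifier on the returned split and compare held-out accuracy with 0.5 (accuracy
# near 0.5 is consistent with X independent of Y given Z). The data and classifier settings
# below are illustrative assumptions.
import numpy as np
import xgboost as xgb

n = 1000
Z = np.random.randn(n, 2)
X = Z[:, :1] + 0.5 * np.random.randn(n, 1)
Y = Z[:, 1:] + 0.5 * np.random.randn(n, 1)   # Y depends on Z only, not on X

Xtr, ytr, Xte, yte = CI_sampler_regressor(X, Y, Z, train_len=2 * n // 3, noise='Laplace')
clf = xgb.XGBClassifier(n_estimators=100, max_depth=4)
clf.fit(Xtr, ytr)
print('held-out accuracy:', (clf.predict(Xte) == yte).mean())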
plt.axis([-20, 20, 0, 0.10])
plt.text(-18, 0.08, 'n=10000')

plt.subplot(2, 2, 4)
x = rv.rvs(size=100000)
n, bins, patches = plt.hist(x, 20, normed=1, facecolor='magenta', alpha=0.5)
plt.plot(x1, y1, 'r', lw=3)
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
plt.axis([-20, 20, 0, 0.10])
plt.text(-18, 0.08, 'n=100000')
plt.savefig('/home/tomer/my_books/python_in_hydrology/images/rand_theo.png')

# LAPLACE DISTRIBUTION
rv = st.laplace(loc=0, scale=15)
x1 = np.linspace(-100, 100, 1000)
y1 = rv.pdf(x1)  # compute and plot pdf

plt.clf()
fig = plt.figure()
fig.subplots_adjust(wspace=0.4)

plt.subplot(2, 2, 1)
x = rv.rvs(size=100)
n, bins, patches = plt.hist(x, 20, normed=1, facecolor='yellow', alpha=0.5)
plt.plot(x1, y1, 'r', lw=3, label='scale=15')
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
def plot_result(color, distrib, x_a=-4, x_b=4):
    x = 0
    y = 0
    x_dist = np.linspace(x_a, x_b, 3000)
    if distrib == "Standard normal":
        y = st.norm.cdf(x_dist, 0, 1)
        y_p = st.norm.pdf(x_dist, 0, 1)
        x = st.norm(loc=0., scale=1.)
    elif distrib == "Uniform":
        y_p = st.uniform.pdf(x_dist, -3**0.5, 2 * (3**0.5))
        y = st.uniform.cdf(x_dist, -3**0.5, 2 * (3**0.5))
        x = st.uniform(loc=-3**0.5, scale=2 * (3**0.5))
    elif distrib == "Cauchy":
        y = st.cauchy.cdf(x_dist, 0, 1)
        y_p = st.cauchy.pdf(x_dist, 0, 1)
        x = st.cauchy(loc=0, scale=1)
    elif distrib == "Laplace":
        y = st.laplace.cdf(x_dist, 0, (2**(-0.5)))
        y_p = st.laplace.pdf(x_dist, 0, (2**(-0.5)))
        x = st.laplace(loc=0, scale=(2**(-0.5)))
    elif distrib == "Poisson":
        y = st.poisson.cdf(x_dist, mu=2)
        y_p = np.exp(-2) * np.power(2, x_dist) / factorial(x_dist)
        x = st.poisson(mu=2)

    # empirical CDF versus the theoretical CDF, one figure per sample size
    for i in n_vec:
        sample_x = np.sort(x.rvs(i))
        cdf = ecdf(sample_x)
        plt.step(sample_x, cdf, color="darkslategrey", label="Empirical distribution function")
        plt.plot(x_dist, y, color=color)
        plt.legend()
        plt.xlim(x_a, x_b)
        plt.tight_layout()
        plt.show()

    # kernel density estimates with three bandwidths, one row of panels per sample size
    for i in n_vec:
        sample_x = np.sort(x.rvs(i))
        fig = plt.figure(num="ker_" + str(distrib) + "_" + str(i), figsize=(11, 4))
        ind = 1
        for h in [1, 3, 5]:
            ax = fig.add_subplot(1, 3, ind)
            sns.kdeplot(sample_x, color="darkslategrey", bw=h, ax=ax,
                        label=("Kernel density estimation" if h == 1 else ""))
            if h == 5:
                ax.legend(["Kernel density estimation"], loc="upper center", bbox_to_anchor=(0.5, -0.25))
            plt.plot(x_dist, y_p, color=color)
            plt.xlim(x_a, x_b)
            plt.title("h = " + str(h))
            ind = ind + 1
        plt.tight_layout()
        plt.show()
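# plot_result() assumes an ecdf() helper and an n_vec list of sample sizes defined elsewhere in
# the original script. A minimal sketch of what they might look like (the names follow the call
# sites above; the implementation itself is an assumption, not the original author's code):
import numpy as np

n_vec = [20, 60, 100]  # illustrative sample sizes

def ecdf(sorted_sample):
    # empirical CDF evaluated at each (sorted) sample point: F_n(x_(k)) = k / n
    n = len(sorted_sample)
    return np.arange(1, n + 1) / float(n)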
from scipy.stats import norm, cauchy, laplace, poisson, uniform
import numpy as np
import matplotlib.pyplot as plt


def quart(list, p):
    idx = int(np.ceil(len(list) * p))
    return list[idx]


sampleSizes = [20, 100]
distrs = [(norm(0, 1), "normal"),
          (cauchy(0, 1), "cauchy"),
          (laplace(0, 1 / np.sqrt(2)), "laplace"),
          (poisson(10), "poisson"),
          (uniform(-np.sqrt(3), 2 * np.sqrt(3)), "uniform")]

# one figure per distribution, with boxplots for both sample sizes
for distr in distrs:
    fig, axs = plt.subplots(2)
    for i in range(len(sampleSizes)):
        rvs = distr[0].rvs(sampleSizes[i])
        rvs.sort()
        axs[i].boxplot(rvs, vert=False)
        axs[i].set_ylabel(str(sampleSizes[i]), fontsize=8)
    fig.savefig('Boxplots/' + distr[1] + '.png')

table = open("TablesLab3_1.tex", 'w', encoding="utf-8")
table.writelines("\\begin{table}[h]\n"
                 "\centering\n"
                 "\\begin{tabular}{ |" + "c|" * 2 + " }\n"
                 "\hline\n"
                 "Sample & Outlier proportion \\\\\n"
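# A hypothetical companion sketch (not from the original lab) for the table being written above:
# the theoretical share of boxplot outliers under Tukey's 1.5*IQR rule can be computed directly
# from each distribution's quartiles, P(X < Q1 - 1.5*IQR) + P(X > Q3 + 1.5*IQR); for the discrete
# Poisson case the cdf-based bounds are only an approximation.
from scipy.stats import norm, cauchy, laplace, poisson, uniform
import numpy as np

for dist, name in [(norm(0, 1), "normal"),
                   (cauchy(0, 1), "cauchy"),
                   (laplace(0, 1 / np.sqrt(2)), "laplace"),
                   (poisson(10), "poisson"),
                   (uniform(-np.sqrt(3), 2 * np.sqrt(3)), "uniform")]:
    q1, q3 = dist.ppf(0.25), dist.ppf(0.75)
    iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    p_out = dist.cdf(lo) + (1 - dist.cdf(hi))
    print("%-8s theoretical outlier share: %.4f" % (name, p_out))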
ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.7001)
ax.set_ylabel("$p(x)$", fontsize=16)
ax.xaxis.set_major_formatter(plt.NullFormatter())

# trick to show multiple legends
leg1 = ax.legend([l1], [l1.get_label()], loc=1)
leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2)
ax.add_artist(leg1)
ax.set_title("Skew $\Sigma$ and Kurtosis $K$")

# next show distributions with different kurtosis
ax = fig.add_subplot(212)
x = np.linspace(-5, 5, 1000)
l1, = ax.plot(x, stats.laplace(0, 1).pdf(x), "--k",
              label=r"${\rm Laplace,}\ K=+3$")
l2, = ax.plot(x, stats.norm(0, 1).pdf(x), "-k",
              label=r"${\rm Gaussian,}\ K=0$")
l3, = ax.plot(x, stats.cosine(0, 1).pdf(x), "-.k",
              label=r"${\rm Cosine,}\ K=-0.59$")
l4, = ax.plot(x, stats.uniform(-2, 4).pdf(x), ":k",
              label=r"${\rm Uniform,}\ K=-1.2$")

ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.6001)
ax.set_xlabel("$x$", fontsize=16)
ax.set_ylabel("$p(x)$", fontsize=16)

# trick to show multiple legends
leg1 = ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=2)
leg2 = ax.legend((l3, l4), (l3.get_label(), l4.get_label()), loc=1)
ax.add_artist(leg1)

plt.show()
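# A quick, optional check (not in the original figure script) that the K values quoted in the
# legend labels match the excess kurtosis scipy reports for each frozen distribution
# (Laplace +3, Gaussian 0, Cosine about -0.59, Uniform -1.2):
from scipy import stats

for name, dist in [("Laplace", stats.laplace(0, 1)),
                   ("Gaussian", stats.norm(0, 1)),
                   ("Cosine", stats.cosine(0, 1)),
                   ("Uniform", stats.uniform(-2, 4))]:
    # stats(moments='k') returns the excess kurtosis (0 for a Gaussian)
    print(name, float(dist.stats(moments='k')))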
def all_dists():
    # dists params were taken from the scipy.stats official
    # documentation examples
    # Total - 89
    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat": stats.gilbrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
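# Hypothetical usage sketch (not part of the original module): every value in all_dists() is a
# frozen scipy.stats distribution, so they can be iterated uniformly, e.g. to draw samples and
# print quick summary statistics.
import numpy as np
from scipy import stats

for name, dist in sorted(all_dists().items()):
    sample = dist.rvs(size=200, random_state=0)
    print("%-16s sample mean = %10.3f, sample std = %10.3f" % (name, sample.mean(), sample.std()))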
def _kstest(self, loc, scale, samples):
    # Uses the Kolmogorov-Smirnov test for goodness of fit.
    ks, _ = stats.kstest(samples, stats.laplace(loc, scale=scale).cdf)
    # Return True when the test passes.
    return ks < 0.02
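# A minimal standalone illustration of the same goodness-of-fit check: draw Laplace samples and
# compare them against the frozen scipy.stats.laplace CDF. The 0.02 threshold and the sample
# size are illustrative assumptions, not values taken from the original test suite.
import numpy as np
from scipy import stats

loc, scale = 0.0, 1.0
samples = np.random.laplace(loc, scale, size=100000)
ks, _ = stats.kstest(samples, stats.laplace(loc, scale=scale).cdf)
print("KS statistic:", ks, "passes:", ks < 0.02)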
sstot = np.sum((y - ybar)**2)
r2 = ssreg / sstot
plt.plot(x_n, ffit, label='order {}, $R^2$= {:.2f}'.format(i, r2))

plt.legend(loc=2, fontsize=14)
plt.xlabel('$x$', fontsize=14)
plt.ylabel('$y$', fontsize=14, rotation=0)
plt.savefig('img602.png', dpi=300, figsize=[5.5, 5.5])
plt.figure()

# Laplace densities for several scales b, compared with a standard Gaussian
plt.figure(figsize=(8, 6))
x_values = np.linspace(-10, 10, 300)
for df in [1, 2, 5, 15]:
    distri = stats.laplace(scale=df)
    x_pdf = distri.pdf(x_values)
    plt.plot(x_values, x_pdf, label='$b$ = {}'.format(df))

x_pdf = stats.norm.pdf(x_values)
plt.plot(x_values, x_pdf, label='Gaussian')
plt.xlabel('x')
plt.ylabel('p(x)', rotation=0)
plt.legend(loc=0, fontsize=14)
plt.xlim(-7, 7)
plt.savefig('img603.png', dpi=300, figsize=[5.5, 5.5])
plt.figure()

x_1 = np.array([10., 8., 13., 9., 11., 14., 6., 4., 12., 7., 5.])
y_1 = np.array(
def prepare_widgets():
    print "initializing..."
    # start bokeh-server session
    global client
    client = Session(root_url='http://0.0.0.0:7010/', load_from_config=False)
    try:
        client.register(bs_login, bs_password)
    except:
        pass
    client.login(bs_login, bs_password)

    ### CREATE WIDGETS
    print "preparing widgets..."

    # hist1: hist with overlay
    import analysis.distfit as distfit
    import pandas as pd
    xname, xmin, xmax, xbins = "invariantMass", 0, 10, 50
    bin_separators = np.histogram([], bins=xbins, range=[xmin, xmax])[1]
    bin_centers = np.array([0.5 * (bin_separators[i] + bin_separators[i + 1])
                            for i in range(len(bin_separators) - 1)])
    bins = pd.DataFrame({"x": bin_centers})

    # exponential background restricted to the [xmin, xmax] window
    expo_gap = lambda x, slope: (x >= xmin) * (x <= xmax) * distfit.exponential(x - xmin, slope) / (1. - np.e ** (-(xmax - xmin) * slope))
    mix_model = distfit.DistributionsMixture(
        distributions={'sig': distfit.gauss, 'bck': expo_gap},
        weights_ranges={'sig': [1., 10.], 'bck': [1., 10.]},
        parameter_ranges={'mean': [xmin, xmax], 'sigma': [0., xmax - xmin], 'slope': [0, 15.]},
        column_ranges={'x': [xmin, xmax]},
        sampling_strategy='grid',
    )
    mix_model.compile(bins, 1000)  # takes several seconds

    hist1_base = WhiskeredHistWidget(xname, xmin, xmax, xbins, es,
                                     fig=figure(plot_width=600, plot_height=600,
                                                tools=['wheel_zoom', 'ywheel_zoom', 'pan', 'resize', 'reset']))
    hist1 = MLFitOverlayWidget(hist1_base, mix_model, n_pts=100)
    widgets.append(hist1)

    # hist2: just hist
    hist2 = ClassicHistWidget("muonHits", 0, 100, 30, es,
                              fig=figure(plot_width=600, plot_height=600,
                                         tools=['wheel_zoom', 'ywheel_zoom', 'pan', 'reset']))
    widgets.append(hist2)

    # hist3: heatmap
    hist3 = HeatmapWidget("avgMass", 0, 35, 50,
                          "muonHits", 0, 70, 50,
                          es, fig=figure(plot_width=600, plot_height=600),)
    widgets.append(hist3)

    # hist4: hist with reference
    hist4_base = ClassicHistWidget("dskkpi", 1920, 2020, 30, es,
                                   fig=figure(plot_width=600, plot_height=600,
                                              tools=['wheel_zoom', 'ywheel_zoom', 'pan', 'reset']))
    from scipy.stats import laplace
    pdf = laplace(1970, 7).pdf
    hist4 = ReferenceOverlay(hist4_base, pdf)
    widgets.append(hist4)
    ### end CREATE PLOTS

    print "publishing plots..."
    # create a dashboard on bokeh_server
    output_server(dashboard_name, client)
    plots = [hplot(widget.fig) for widget in widgets]

    global whole_dashboard
    whole_dashboard = vplot(hplot(*plots[:2]), hplot(*plots[2:]))
    plots.append(whole_dashboard)

    for plot in plots:
        client.show(plot)
    client.publish()

    print "creating static links..."
    # publish the thing
    from bokeh.embed import autoload_server
    scripts = [autoload_server(plot, client, public=True) for plot in plots]

    print "saving widget scripts..."
    # remove previous widgets
    for path_to_static in path_to_django_static, path_to_flask_static:
        path_to_widgets = os.path.join(path_to_static, dashboard_name)
        os.system("rm -rf " + path_to_widgets)
        os.mkdir(path_to_widgets)
        for i, source_script in enumerate(scripts):
            # convert script...
            script = assemble_script("widget" + str(i), source_script)
            with open("{}/widget{}.html".format(path_to_widgets, i), 'w') as fscript:
                fscript.write(script)
    print "dashboard {} ready.".format(dashboard_name),