Example #1
def test_lasso(s=5, n=100, p=50):
    
    X, y, _, nonzero, sigma = instance(n=n, p=p, random_signs=True, s=s, sigma=1.)
    lam_frac = 1.

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.gaussian_Xfixed(X, y)
    epsilon = 1.

    lam = sigma * lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm(p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(loss,
                                               random_Z,
                                               epsilon,
                                               randomization,
                                               penalty)

    loss_args = {'mean':np.zeros(n), 
                 'sigma':sigma}
    null, alt = pval(sampler1, 
                     loss_args,
                     X, y,
                     nonzero)
    
    return null, alt
Example #2
def main(rho=0.245, n=100, p=30):
    
    X, prec, nonzero = instance(n=n, p=p, alpha=0.99, rho=rho)
    lam_frac = 0.1
    alpha = 0.8

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.neighbourhood_selection(X) 
    epsilon = 1.

    lam = 2./np.sqrt(n) * np.linalg.norm(X) * norm.isf(alpha / (2 * p**2))

    random_Z = randomization.rvs(p**2 - p)
    penalty = randomized.selective_l1norm(p**2-p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(loss,
                                               random_Z,
                                               epsilon,
                                               randomization,
                                               penalty)

    loss_args = {"active":sampler1.penalty.active_set,
                 "quadratic_coef":epsilon}
    null, alt = pval(sampler1, 
                     loss_args,
                     None, X,
                     nonzero)
    
    return null, alt
Example #3
def test_logistic(s=5, n=200, p=20):
    
    X, y, beta, active = logistic_instance(n=n, p=p, s=s, rho=0)
    nonzero = np.where(beta)[0]
    lam_frac = 40.8

    randomization = laplace(loc=0, scale=1.)
    loss = randomized.logistic_Xrandom(X, y)
    epsilon = 1.

    #lam = lam_frac * np.mean(np.fabs(np.dot(X.T, (np.random.binomial(1, 1./2, (n, 10000)) - 0.5))).max(0))
    lam = 70.
    random_Z = randomization.rvs(p)
    penalty = randomized.selective_l1norm(p, lagrange=lam)

    sampler1 = randomized.selective_sampler_MH(loss,
                                               random_Z,
                                               epsilon,
                                               randomization,
                                               penalty)

    sampler1.loss.fit_E(sampler1.penalty.active_set)
    linear_part = np.identity(p)
    data = np.dot(X.T, y - 1./2)

    loss_args = {'mean':np.zeros(p)}

    null, alt = pval(sampler1,
                     loss_args,
                     linear_part,
                     data, 
                     nonzero)
    
    return null, alt
Example #4
 def set_param_vec(self, params):
     assert len(params) == 2, "Laplace Marginal Distribution requires exactly 2 parameters: np.array([mu, b])"
     self.mu = params[0]
     self.b = params[1]
     self.param_vec[0] = self.mu
     self.param_vec[1] = self.b
     assert self.b>0., "b cannot be <=0."
     self.laplace_obj = laplace(self.mu, self.b)
     return True
Example #5
  def _test_grid_log(self, dtype, scipy_dtype, grid_spec, error_spec):
    with self.test_session():
      grid = _make_grid(dtype, grid_spec)
      actual = sm.log_cdf_laplace(grid).eval()

      # Basic tests.
      # isfinite checks for NaN and Inf.
      self.assertAllTrue(np.isfinite(actual))
      self.assertAllTrue((actual < 0))
      _check_strictly_increasing(actual)

      # Versus scipy.
      scipy_dist = stats.laplace(loc=0., scale=1.)
      expected = scipy_dist.logcdf(grid.astype(scipy_dtype))
      self.assertAllClose(
          expected.astype(np.float64),
          actual.astype(np.float64),
          rtol=error_spec.rtol,
          atol=error_spec.atol)
Example #6
    ax[idx].vlines(locs, 0, w, color='C0')
    ax[idx].set_title('α = {}'.format(α))

plt.tight_layout()
plt.show()

# %%
α = 10
H = stats.norm
K = 5

x = np.linspace(-4, 4, 250)
x_ = np.array([x] * K).T
locs, w = stick_breaking_truncated(α, H, K)

dist = stats.laplace(locs, 0.5)
plt.plot(x, np.sum(dist.pdf(x_) * w, 1), 'C0', lw=2)
plt.plot(x, dist.pdf(x_) * w, 'k--', alpha=0.7)
plt.yticks([])
plt.show()

# %%
N = cs_exp.shape[0]
K = 20


def stick_breaking(α, K):
    β = pm.Beta('β', 1., α, shape=K)
    w = β * pm.math.concatenate([[1.], tt.extra_ops.cumprod(1. - β)[:-1]])
    return w
Example #7
#!/usr/bin/env python

# Run this code with
# python laplace_hong.py

from scipy import stats
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Generate a Laplace distribution with mu = 0, and delta = 1.
# Compute and print out the first few moments.
dist = stats.laplace(0, 1.0)
mean, var, skew, kurt = dist.stats(moments='mvsk')
print "Laplace distribution with mu = 0 and delta = 1.0: "
print "mean =", mean[()]
print "variance =", var[()]
print "skew =", skew[()]
print "kurtosis =", kurt[()], "\n"

# Four random draws of increasing size
# Calculate and print out the mean and variance of each draw
N_arr = np.array([10, 100, 1000, 10000])
r = [[], [], [], []]
for i in np.arange(0, len(N_arr)):
    N = N_arr[i]
    r[i] = dist.rvs(N)
    print("%d Random Samples from Laplace Distribution:" % N)
    print("mean =", r[i].mean())
    print("variance =", r[i].var(), "\n")
Example #8
# coding:utf8
import numpy as np
from scipy import stats
from scipy.special import gamma
import matplotlib.pyplot as plt
from pathlib import Path
import pickle

np.random.seed(0)
root = Path('./ass2/savedoc')
if not root.is_dir():
    root.mkdir()

C = np.sqrt(np.pi / 2) / gamma(1.5)
t2 = stats.t(2)
laplace = stats.laplace()
plt.switch_backend('agg')

# part (b)
x = np.linspace(-10, 10, 1000)
plt.figure(figsize=(20, 10))
plt.title("t2 distribution with C vs laplace distribution")
plt.plot(x, laplace.pdf(x), 'r-', label='laplace distribution')
plt.plot(x, C * t2.pdf(x), 'g--', label='t2 distribution with C')
plt.legend()
plt.savefig(root / 'p2b.jpg')
plt.close()

# part (c)
num = 10000
allnum = int(5e4)
Example #9
def test_lasso(s=0,
               n=100,
               p=20,
               weights="neutral",
               randomization_dist="logistic",
               randomization_scale=1,
               Langevin_steps=10000,
               burning=2000,
               X_scaled=True,
               covariance_estimate="nonparametric",
               noise="uniform"):
    """ weights: exponential, gamma, normal, gumbel
    randomization_dist: logistic, laplace """

    step_size = 1. / p

    X, y, true_beta, nonzero, sigma = instance(n=n,
                                               p=p,
                                               random_signs=True,
                                               s=s,
                                               sigma=1.,
                                               rho=0,
                                               scale=X_scaled,
                                               noise=noise)
    print('true beta', true_beta)
    lam_frac = 1.

    if randomization_dist == "laplace":
        randomization = laplace(loc=0, scale=1.)
        random_Z = randomization.rvs(p)
    if randomization_dist == "logistic":
        random_Z = np.random.logistic(loc=0, scale=1, size=p)
    if randomization_dist == "normal":
        random_Z = np.random.standard_normal(p)

    print('randomization', random_Z * randomization_scale)
    loss = lasso_randomX.lasso_randomX(X, y)

    epsilon = 1. / np.sqrt(n)
    #epsilon = 1.
    lam = sigma * lam_frac * np.mean(
        np.fabs(
            np.dot(X.T, np.random.standard_normal((n, 10000))) +
            randomization_scale * np.random.logistic(size=(p, 10000))).max(0))

    lam_scaled = lam.copy()
    random_Z_scaled = random_Z.copy()
    epsilon_scaled = epsilon

    if (X_scaled == False):
        random_Z_scaled *= np.sqrt(n)
        lam_scaled *= np.sqrt(n)
        epsilon_scaled *= np.sqrt(n)

    penalty = randomized.selective_l1norm_lan(p, lagrange=lam_scaled)

    # initial solution

    problem = rr.simple_problem(loss, penalty)

    random_term = rr.identity_quadratic(epsilon_scaled, 0,
                                        -randomization_scale * random_Z_scaled,
                                        0)
    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)
    print('initial solution', initial_soln)

    active = (initial_soln != 0)
    if np.sum(active) == 0:
        return [-1], [-1]
    inactive = ~active
    betaE = initial_soln[active]
    signs = np.sign(betaE)

    initial_grad = -np.dot(X.T, y - np.dot(X, initial_soln))
    if (X_scaled == False):
        initial_grad /= np.sqrt(n)
    print('initial_gradient', initial_grad)
    subgradient = random_Z - initial_grad - epsilon * initial_soln
    cube = np.divide(subgradient[inactive], lam)

    nactive = betaE.shape[0]
    ninactive = cube.shape[0]

    beta_unpenalized = np.linalg.lstsq(X[:, active], y)[0]
    print('beta_OLS onto E', beta_unpenalized)
    obs_residuals = y - np.dot(X[:, active],
                               beta_unpenalized)  # y-X_E\bar{\beta}^E
    N = np.dot(X[:, inactive].T,
               obs_residuals)  # X_{-E}^T(y-X_E\bar{\beta}_E), null statistic
    full_null = np.zeros(p)
    full_null[nactive:] = N

    # parametric covariance estimate
    if covariance_estimate == "parametric":
        XE_pinv = np.linalg.pinv(X[:, active])
        mat = np.zeros((nactive + ninactive, n))
        mat[:nactive, :] = XE_pinv
        mat[nactive:, :] = X[:, inactive].T.dot(
            np.identity(n) - X[:, active].dot(XE_pinv))
        Sigma_full = mat.dot(mat.T)
    else:
        Sigma_full = bootstrap_covariance(X, y, active, beta_unpenalized)

    init_vec_state = np.zeros(n + nactive + ninactive)
    if weights == "exponential":
        init_vec_state[:n] = np.ones(n)
    else:
        init_vec_state[:n] = np.zeros(n)

    #init_vec_state[:n] = np.random.standard_normal(n)
    #init_vec_state[:n] = np.ones(n)
    init_vec_state[n:(n + nactive)] = betaE
    init_vec_state[(n + nactive):] = cube

    def full_projection(vec_state,
                        signs=signs,
                        nactive=nactive,
                        ninactive=ninactive):

        alpha = vec_state[:n].copy()
        betaE = vec_state[n:(n + nactive)].copy()
        cube = vec_state[(n + nactive):].copy()

        projected_alpha = alpha.copy()
        projected_betaE = betaE.copy()
        projected_cube = np.zeros_like(cube)

        if weights == "exponential":
            projected_alpha = np.clip(alpha, 0, np.inf)

        if weights == "gamma":
            projected_alpha = np.clip(alpha, -2 + 1. / n, np.inf)
        for i in range(nactive):
            if (projected_betaE[i] * signs[i] < 0):
                projected_betaE[i] = 0

        projected_cube = np.clip(cube, -1, 1)

        return np.concatenate(
            (projected_alpha, projected_betaE, projected_cube), 0)

    Sigma = np.linalg.inv(np.dot(X[:, active].T, X[:, active]))
    null, alt = pval(init_vec_state, full_projection, X, obs_residuals,
                     beta_unpenalized, full_null, signs, lam, epsilon, nonzero,
                     active, Sigma, weights, randomization_dist,
                     randomization_scale, Langevin_steps, step_size, burning,
                     X_scaled)
    #  Sigma_full[:nactive, :nactive])

    return null, alt
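
# Illustrative invocation (a sketch; the argument values are arbitrary choices,
# not from the original):
# null_p, alt_p = test_lasso(s=5, randomization_dist='laplace', weights='neutral')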
Example #10
def answer_hw():
    ##question a
    print('-' * 40)
    print("question a : rejsampler1d defined")
    print('-' * 40)

    ##question b
    print('-' * 40)
    print("question b: cauchy as reference for laplace")
    print('-' * 40)

    ref = stats.cauchy(0, 1)
    target = stats.laplace(0, 1).pdf
    numsamples = 1000
    samples, M, successrate = rejsampler1d(target, ref, numsamples)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    n, bins, patches = ax.hist(samples, 100, alpha=0.75, density=True, label='Laplace distribution')
    actual = stats.laplace.pdf(np.linspace(-6, 6, 100))
    ax.plot(np.linspace(-6, 6, 100), actual, 'r', label='Laplace pdf')
    plt.legend()

    KStest = stats.kstest(samples, 'laplace', (0, 1))
    print("Kolmogorov-Smirnov test statistic : %f \np-value : %f " % KStest)
    if KStest[1] >= 0.05:
        keyword = 'not'
    else:
        keyword = ""
    print("samples are %s from laplace(0,1) distribution" % keyword)

    ##question c
    print('-' * 40)
    print("question c: t distribution with df = 2 as reference for laplace")
    print('-' * 40)

    ref = stats.t(2)
    target = stats.laplace(0, 1).pdf
    numsamples = 1000
    samples, M, successrate_student = rejsampler1d(target, ref, numsamples)
    if successrate_student < successrate:
        keyword = 'better'
    else:
        keyword = 'worse'
    print("acceptance rate is %f \n" % successrate)
    print("using student's t distribution for reference is %s than using a cauchy distribution" % keyword)

    ##question d
    print('-' * 40)
    print("question d: novel continuous distribution")
    print('-' * 40)
    ref = stats.norm(0, 2)
    target = mytargetfunc
    numsamples = 5000
    samples, M, successrate = rejsampler1d(target, ref, numsamples)
    # plot figure
    fig = plt.figure()
    ax = fig.add_subplot(111)
    n, bins, patches = ax.hist(samples, 100, alpha=0.75, density=True, label='target function')
    actual = target(np.linspace(-6, 6, 100))
    ax.plot(np.linspace(-6, 6, 100), actual, 'r', label='target pdf')
    plt.legend()
Example #11
    return ax


#------------------------------------------------------------
# Set up distributions:
Npts = 5000
np.random.seed(0)
x = np.linspace(-6, 6, 1000)

# Gaussian distribution
data_G = stats.norm(0, 1).rvs(Npts)
pdf_G = stats.norm(0, 1).pdf(x)

# Non-Gaussian distribution
distributions = [stats.laplace(0, 0.4),
                 stats.norm(-4.0, 0.2),
                 stats.norm(4.0, 0.2)]

weights = np.array([0.8, 0.1, 0.1])
weights /= weights.sum()

data_NG = np.hstack([d.rvs(int(w * Npts))
                     for (d, w) in zip(distributions, weights)])
pdf_NG = sum(w * d.pdf(x)
             for (d, w) in zip(distributions, weights))

#------------------------------------------------------------
# Plot results
fig = plt.figure(figsize=(5, 2.5))
fig.subplots_adjust(hspace=0, left=0.07, right=0.95, wspace=0.05, bottom=0.15)
Example #12
    'name': "normal",
    'a': -4.0,
    'b': 4.0,
    'stat': stats.norm(loc=0, scale=1),
    'pf': lambda x: stats.norm(loc=0, scale=1).pdf(x)
}, {
    'name': "cauchy",
    'a': -4.0,
    'b': 4.0,
    'stat': stats.cauchy(loc=0, scale=1),
    'pf': lambda x: stats.cauchy(loc=0, scale=1).pdf(x)
}, {
    'name': "laplace",
    'a': -4.0,
    'b': 4.0,
    'stat': stats.laplace(loc=0, scale=1 / math.sqrt(2)),
    'pf': lambda x: stats.laplace(loc=0, scale=1 / math.sqrt(2)).pdf(x)
}, {
    'name': "uniform",
    'a': -4.0,
    'b': 4.0,
    'stat': stats.uniform(-math.sqrt(3), 2 * math.sqrt(3)),
    'pf': lambda x: stats.uniform(-math.sqrt(3), 2 * math.sqrt(3)).pdf(x)
}, {
    'name': "poisson",
    'a': 6,
    'b': 14,
    'stat': stats.poisson(10),
    'pf': lambda x: stats.poisson(10).pmf(np.ceil(x))
}]
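
# Hypothetical usage of the configuration list above (a sketch; it assumes the
# list is bound to a name such as `distributions`):
# for d in distributions:
#     xs = np.linspace(d['a'], d['b'], 200)
#     plt.plot(xs, d['pf'](xs), label=d['name'])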
Example #13
    return ax


#------------------------------------------------------------
# Set up distributions:
Npts = 5000
np.random.seed(0)
x = np.linspace(-6, 6, 1000)

# Gaussian distribution
data_G = stats.norm(0, 1).rvs(Npts)
pdf_G = stats.norm(0, 1).pdf(x)

# Non-Gaussian distribution
distributions = [stats.laplace(0, 0.4),
                 stats.norm(-4.0, 0.2),
                 stats.norm(4.0, 0.2)]

weights = np.array([0.8, 0.1, 0.1])
weights /= weights.sum()

data_NG = np.hstack([d.rvs(int(w * Npts))
                     for (d, w) in zip(distributions, weights)])
pdf_NG = sum(w * d.pdf(x)
             for (d, w) in zip(distributions, weights))

#------------------------------------------------------------
# Plot results
fig = plt.figure(figsize=(10, 5))
fig.subplots_adjust(hspace=0, left=0.05, right=0.95, wspace=0.05)
Example #14
def test_kfstep(k=4, s=3, n=100, p=10, Langevin_steps=10000, burning=2000):

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, random_signs=True, s=s, sigma=1.,rho=0, signal=10)[:5]
    epsilon = 0.

    randomization = laplace(loc=0, scale=1.)

    j_seq = np.empty(k, dtype=int)
    s_seq = np.empty(k)

    left = np.ones(p, dtype=bool)
    obs = 0

    initial_state = np.zeros(n + np.sum([i for i in range(p-k+1,p+1)]))
    initial_state[:n] = y.copy()

    mat = [np.zeros((n, ncol)) for ncol in range(p, p - k, -1)]  # placeholders, overwritten in the loop below

    curr = n

    keep = np.zeros(p, dtype=bool)

    for i in range(k):
        X_left = X[:,left]
        X_selected = X[:, ~left]
        if (np.sum(left)<p):
            P_perp = np.identity(n) - X_selected.dot(np.linalg.pinv(X_selected))
            mat[i] = P_perp.dot(X_left)
        else:
            mat[i] = X

        mat_complete = np.zeros((n,p))
        mat_complete[:, left] = mat[i]

        T = np.dot(mat[i].T, y)
        T_complete = np.dot(mat_complete.T, y)

        obs = np.max(np.abs(T))
        keep = np.copy(~left)

        random_Z = randomization.rvs(T.shape[0])
        T_random = T + random_Z
        initial_state[curr:(curr+p-i)] = T_random # initializing subgradients
        curr = curr + p-i

        j_seq[i] = np.argmax(np.abs(T_random))
        s_seq[i] = np.sign(T_random[j_seq[i]])

        #def find_index(v, idx1):
        #    _sumF = 0
        #    _sumT = 0
        #    idx = idx1+1
        #    for i in range(v.shape[0]):
        #        if (v[i] == False):
        #            _sumF = _sumF + 1
        #        else:
        #           _sumT = _sumT + 1
        #        if _sumT >= idx: break
        #    return (_sumT + _sumF-1)

        T_complete[left] += random_Z
        left[np.argmax(np.abs(T_complete))] = False


    # conditioning
    linear_part = X[:, keep].T
    P = np.dot(linear_part.T, np.linalg.pinv(linear_part).T)
    I = np.identity(linear_part.shape[1])
    R = I - P


    def full_projection(state, n=n, p=p, k=k):
        """Project each block of subgradient coordinates onto its selection cone."""
        new_state = np.empty(state.shape, float)
        new_state[:n] = state[:n]
        curr = n
        for i in range(k):
            projection = projection_cone(p-i, j_seq[i], s_seq[i])
            new_state[curr:(curr+p-i)] = projection(state[curr:(curr+p-i)])
            curr = curr+p-i
        return new_state


    def full_gradient(state, n=n, p=p, k=k, X=X, mat=mat):
        data = state[:n]

        grad = np.empty(n + np.sum([i for i in range(p-k+1,p+1)]))
        grad[:n] = - data

        curr = n
        for i in range(k):
            subgrad = state[curr:(curr+p-i)]

            sign_vec = np.sign(-mat[i].T.dot(data) + subgrad)
            grad[curr:(curr + p - i)] = -sign_vec
            curr = curr+p-i
            grad[:n] += mat[i].dot(sign_vec)

        return grad



    sampler = projected_langevin(initial_state,
                                 full_gradient,
                                 full_projection,
                                 1./p)
    samples = []


    for i in range(Langevin_steps):
        if i>burning:
            old_state = sampler.state.copy()
            old_data = old_state[:n]
            next(sampler)
            new_state = sampler.state.copy()
            new_data = new_state[:n]
            new_data = np.dot(P, old_data) + np.dot(R, new_data)
            sampler.state[:n] = new_data
            samples.append(sampler.state.copy())


    samples = np.array(samples)
    Z = samples[:,:n]

    pop = np.abs(mat[k-1].T.dot(Z.T)).max(0)
    fam = discrete_family(pop, np.ones_like(pop))
    pval = fam.cdf(0, obs)
    pval = 2 * min(pval, 1 - pval)


    print('pvalue:', pval)
    return pval
Example #15
# Show the probability of a gap at least as big as 0, 0.5 and 1.0.

from scipy.special import kolmogorov
from scipy.stats import kstwobign
kolmogorov([0, 0.5, 1.0])
# array([ 1.        ,  0.96394524,  0.26999967])

# Compare a sample of size 1000 drawn from a Laplace(0, 1) distribution against
# the target distribution, a Normal(0, 1) distribution.

from scipy.stats import norm, laplace
n = 1000
np.random.seed(seed=233423)
lap01 = laplace(0, 1)
x = np.sort(lap01.rvs(n))
np.mean(x), np.std(x)
# (-0.083073685397609842, 1.3676426568399822)

# Construct the Empirical CDF and the K-S statistic Dn.

target = norm(0,1)  # Normal mean 0, stddev 1
cdfs = target.cdf(x)
ecdfs = np.arange(n+1, dtype=float)/n
gaps = np.column_stack([cdfs - ecdfs[:n], ecdfs[1:] - cdfs])
Dn = np.max(gaps)
Kn = np.sqrt(n) * Dn
print('Dn=%f, sqrt(n)*Dn=%f' % (Dn, Kn))
# Dn=0.058286, sqrt(n)*Dn=1.843153
print(chr(10).join(['For a sample of size n drawn from a N(0, 1) distribution:',
  ' the approximate Kolmogorov probability that sqrt(n)*Dn>=%f is %f' %  (Kn, kolmogorov(Kn)),
  ' the approximate Kolmogorov probability that sqrt(n)*Dn<=%f is %f' %  (Kn, kstwobign.cdf(Kn))]))
Example #16
    return ax


#------------------------------------------------------------
# Set up distributions:
Npts = 5000
np.random.seed(0)
x = np.linspace(-6, 6, 1000)

# Gaussian distribution
data_G = stats.norm(0, 1).rvs(Npts)
pdf_G = stats.norm(0, 1).pdf(x)

# Non-Gaussian distribution
distributions = [
    stats.laplace(0, 0.4),
    stats.norm(-4.0, 0.2),
    stats.norm(4.0, 0.2)
]

weights = np.array([0.8, 0.1, 0.1])
weights /= weights.sum()

data_NG = np.hstack(
    [d.rvs(int(w * Npts)) for (d, w) in zip(distributions, weights)])
pdf_NG = sum(w * d.pdf(x) for (d, w) in zip(distributions, weights))

#------------------------------------------------------------
# Plot results
fig = plt.figure(figsize=(10, 5))
fig.subplots_adjust(hspace=0, left=0.05, right=0.95, wspace=0.05)
Example #17
import math

# distribution
# In probability theory and statistics, the Laplace distribution is a continuous probability distribution
# named after Pierre-Simon Laplace.
# It is also sometimes called the double exponential distribution,
# because it can be thought of as two exponential distributions
# (with an additional location parameter) spliced together back-to-back,
# although the term is also sometimes used to refer to the Gumbel distribution.
# The difference between two independent identically distributed exponential random variables
# is governed by a Laplace distribution, as is a Brownian motion evaluated at an exponentially distributed random time.
# Increments of Laplace motion or a variance gamma process evaluated over the time scale also have a Laplace distribution.
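
# Empirical check of the "two exponentials spliced back-to-back" fact above
# (a sketch; numpy as np and scipy.stats as stats are assumed to be in scope,
# as elsewhere in this snippet): if E1, E2 are i.i.d. Exp(1), then
# E1 - E2 ~ Laplace(0, 1), so its variance should be close to 2*b^2 = 2.
_e1 = np.random.exponential(size=100000)
_e2 = np.random.exponential(size=100000)
print("empirical var:", (_e1 - _e2).var(), "theoretical var:", stats.laplace(0, 1).var())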

ex = -1.0
scale = 2.0
distribution = stats.laplace(loc=ex, scale=scale)

# calculate the true dispersion/variance of the given Laplace distribution;
# it is 2*scale^2 = 2 * 4 = 8
dx = distribution.var()

# generate 1000 values from distribution for the hist vs pdf plot
values = distribution.rvs(size=1000)

# x axis bounds
left = -10
right = 10

# hist and probability density function
plt.hist(values, 50, density=True)
x = np.linspace(left, right, num=100)
Example #18
from scipy.stats import norminvgauss, laplace, poisson, cauchy, uniform
import numpy as np
import matplotlib.pyplot as plt
import math as m

sizes = [10, 50, 1000]

rv_n = norminvgauss(1, 0)
rv_l = laplace(scale=1 / m.sqrt(2), loc=0)
rv_p = poisson(10)
rv_c = cauchy()
rv_u = uniform(loc=-m.sqrt(3), scale=2 * m.sqrt(3))

densities = [rv_n, rv_l, rv_p, rv_c, rv_u]
names = ["Normal", "Laplace", "Poisson", "Cauchy", "Uniform"]

for size in sizes:
    n = norminvgauss.rvs(1, 0, size=size)
    l = laplace.rvs(size=size, scale=1 / m.sqrt(2), loc=0)
    p = poisson.rvs(10, size=size)
    c = cauchy.rvs(size=size)
    u = uniform.rvs(size=size, loc=-m.sqrt(3), scale=2 * m.sqrt(3))
    distributions = [n, l, p, c, u]
    build = list(zip(distributions, densities, names))
    for histogram, density, name in build:
        fig, ax = plt.subplots(1, 1)
        ax.hist(histogram,
                density=True,
                histtype='stepfilled',
                alpha=0.6,
                color="green")
Example #19
def test_lasso(s=5, n=500, p=20, randomization=laplace(0, 1)):
    """ Returns null and alternative values for the lasso.

    Model chosen by lasso (non-randomized), inference done as if we randomized.
    """
    X, y, _, nonzero, sigma = instance(n=n,
                                       p=p,
                                       random_signs=True,
                                       s=s,
                                       sigma=1.,
                                       rho=0)
    print('XTy', np.dot(X.T, y))
    lam_frac = 1.

    lam = sigma * lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    #penalty = glm.gaussian(X, Y, coef=1. / sigma**2, quadratic=quadratic)
    #loss =
    #problem = rr.simple_problem(loss, penalty)
    #solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500})
    #initial_soln = problem.solve(**solve_args)
    clf = linear_model.Lasso(alpha=lam / (2 * float(n)))  # should be alpha = lam/float(n) to be consistent
    clf.fit(X, y)
    soln = clf.coef_
    active = (soln != 0)  # boolean vector
    active_set = np.where(active)[0]  # column numbers of covariates chosen by lasso
    # print 'active', active
    print('active_set', active_set)
    active_size = np.sum(active)
    print('size of the active set', active_size)

    inactive = ~active
    signs = np.sign(soln[active])

    print('true support', nonzero)
    # LASSO region Ay < b
    pseudo_X_M = np.linalg.pinv(X[:, active])
    pseudo_XT_M = np.linalg.pinv(X[:, active].T)

    P_M = np.dot(X[:, active], pseudo_X_M)
    #print 'active', X[:, active_set]
    #print np.dot(P_M, X[:, active_set])
    A01 = np.dot(X[:, inactive].T, np.identity(n) - P_M) / lam
    A02 = -A01.copy()
    #print 'A01',A01
    #print 'A02',A02
    A0 = np.concatenate((A01, A02), axis=0)
    #print 'A0', A0

    A1 = -np.dot(np.diag(signs), pseudo_X_M)
    A = np.concatenate((A0, A1), axis=0)
    #print signs
    #print pseudo_X_M
    #print A1
    b01 = np.ones(p - active_size) - np.dot(
        np.dot(X[:, inactive].T, pseudo_XT_M), signs)
    b02 = np.ones(p - active_size) + np.dot(
        np.dot(X[:, inactive].T, pseudo_XT_M), signs)
    b0 = np.concatenate((b01, b02), axis=0)
    mat = np.linalg.inv(np.dot(X[:, active].T, X[:, active]))
    b1 = -lam * np.dot(np.dot(np.diag(signs), mat), signs)
    b = np.concatenate((b0, b1), axis=0)

    beta_bar = np.linalg.lstsq(X[:, active], y)[0]

    null, alt = [], []

    for i, j in enumerate(
            active_set):  # testing beta_i=0, corresponds to column X_j
        boot_samples, comparison = bootstrap(y, X, active, i, j)
        prob_selection = randomization_cdf(randomization, boot_samples, A, b)
        # print 'comparison', np.sum(comparison)
        # print np.asarray(comparison, dtype=int).shape
        num = np.inner(np.asarray(comparison, dtype=int),
                       np.asarray(prob_selection))
        #print 'num', num
        den = np.sum(np.asarray(prob_selection))
        #print 'den', den
        p_value = num / den
        #p_value = 2 * min(p_value, 1-p_value)
        obs = beta_bar[i]
        print "observed: ", obs, "p value: ", p_value
        if j in nonzero:
            alt.append(p_value)
        else:
            null.append(p_value)
    return null, alt
Example #20
import scipy.stats as sts
get_ipython().run_line_magic('matplotlib', 'inline')
import math

# # Defining the Laplace distribution #

#
# Background information (in Russian) can be found [here](https://ru.wikipedia.org/wiki/%D0%A0%D0%B0%D1%81%D0%BF%D1%80%D0%B5%D0%B4%D0%B5%D0%BB%D0%B5%D0%BD%D0%B8%D0%B5_%D0%9B%D0%B0%D0%BF%D0%BB%D0%B0%D1%81%D0%B0)
#

# Mean: loc (here 5)
# Variance: 2 * scale**2 (here 2 * 1**2 = 2)

# In[132]:

laplace_rv = sts.laplace(5)  # define the frozen distribution (loc=5, scale=1)
sample = laplace_rv.rvs(1000)  # draw a sample of 1000 values

# In[53]:

print(sample)

# In[133]:

# compute the mean and variance
xm = 1.  # minimum value
E = 5  # mean (expected value)
D = 2  # variance
print(E)
print(D)
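
# Sanity check (a sketch): compare the sample mean/variance with the theoretical
# values for sts.laplace(5): mean = loc = 5, variance = 2 * scale**2 = 2.
print(sample.mean(), sample.var())
print(laplace_rv.mean(), laplace_rv.var())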
Example #21
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
'''
sub-Gaussian vs. super-Gaussian: the distinction is whether the excess kurtosis
is below or above zero. Positive excess kurtosis gives a sharper peak than the
normal distribution and correspondingly heavier tails (super-Gaussian);
negative gives a flatter shape (sub-Gaussian). In ICA it matters whether p(z)
is sub- or super-Gaussian, but which particular sub- or super-Gaussian
distribution it is does not.
'''

np.random.seed(0)

# generate data and samples
x = np.linspace(-4, 4, 500)
rv1 = ss.norm()
rv2 = ss.laplace(0, 1)
rv3 = ss.uniform(-2, 4)  # uniform's parameters look odd: this means the support is (-2, -2 + 4)
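
# A quick numerical check of the kurtosis claim above (a sketch): scipy reports
# Fisher excess kurtosis, so the normal reference value is 0; Laplace is
# super-Gaussian (+3) and the uniform is sub-Gaussian (-1.2).
for _name, _rv in [('normal', rv1), ('laplace', rv2), ('uniform', rv3)]:
    print(_name, 'excess kurtosis:', _rv.stats(moments='k'))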

pdf1 = rv1.pdf(x)
pdf2 = rv2.pdf(x)
pdf3 = rv3.pdf(x)

N = 5000  # Monte Carlo No.
gaussian_x1, gaussian_x2 = rv1.rvs(N), rv1.rvs(N)
laplace_x1, laplace_x2 = rv2.rvs(N), rv2.rvs(N)
uniform_x1, uniform_x2 = rv3.rvs(N), rv3.rvs(N)

# plots
fig = plt.figure(figsize=(11, 9))
fig.canvas.set_window_title('subSuperGaussPlot')

ax = plt.subplot(221)
Example #22
 def __init__(self, mu, b=1):
     self.mu = mu
     self.b = b
     self.distribution = laplace(loc=mu, scale=b)
Example #23
# Take a subsample of the function evaluations to use in the fit
subsample_indicator = (rand(Ny) <= SUBSAMPLE_PROBABILITY)
_hgx = hgx
_hgy = hgy
hgx = hgx[subsample_indicator]
hgy = hgy[subsample_indicator]
Ny = hgy.size



# Noise samples (Mix of Laplacian and Normal)
# ------------------------------
laplace_indicator = (rand(Ny) <= LAPLACE_PROBABILITY)
hgsigma = SHOT_NOISE * sqrt(hgy) 
laplace_sigma = LAPLACIAN_SIGMA_SCALE * hgsigma
samples_laplace = laplace().rvs(hgy.size) * laplace_sigma
normal_sigma = hgsigma
samples_normal = norm().rvs(hgy.size) * normal_sigma
hgnoise = (
    laplace_indicator * samples_laplace + (1-laplace_indicator) * samples_normal)
hgy_noisy = hgy + hgnoise


# Forward model
# -----------------

# lets use sin and cos
# --
N = NUMBER_TERMS
A = zeros((N, hgx.size))
__A = zeros((N, _hgx.size))
Example #24
"""
# =============================================================================

import matplotlib.pyplot as plt
import numpy as np

from matplotlib.gridspec import GridSpec
from scipy import stats

np.random.seed(565656)

dists = [stats.norm(0, 1),
         stats.uniform(-np.sqrt(3), 2 * np.sqrt(3)),
         stats.cauchy(),
         stats.expon(scale=1),
         stats.laplace(scale=1 / np.sqrt(2))]

labels = [r'$\mathcal{N}(0, 1)$',
          r'$\mathcal{U}(-\sqrt{3}, \sqrt{3})$',
          'Cauchy',
          r'$\lambda e^{-\lambda x}, \lambda = 1$',
          r'$\frac{\lambda}{2} e^{-\lambda\|x\|}, \lambda = \sqrt{2}$']

N = 1000

fig = plt.figure(1, clear=True)
gs = GridSpec(nrows=2, ncols=3)

for i, (dist, label) in enumerate(zip(dists, labels)):
    ax = fig.add_subplot(gs[i])
Example #25
def adaptive_integrate(f1, f2, key, value):
    """inputs:
       f1: function 1 of x, function string
       f2: function 2 of x, function string
       key: distribution type of random variable, string
       value: parameters of random distribution, tuple
       outputs:
       y: integral value
    """

    if key.startswith('Uniform'):
        # stats.uniform defined in the range of [0, 1]
        # we have to convert it to [-1, 1] for the definition of Legendre basis
        # stats.uniform(location, scale)
        # or we can also do arbitrary type, will work on this later
        f_distr = stats.uniform(-1, 2)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, -1, 1)

    elif key.startswith('Gaussian'):
        # this is for hermite polynomial basis
        # we can do arbitrary type by not using standard normal distribution
        # will work on this later
        f_distr = stats.norm(0, 1)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, -npy.inf, npy.inf)

    elif key.startswith('Gamma'):
        # compare the stats.gamma with the one shown in the UQLab tutorial (input)
        # stats.gamma accepts only one value, but UQLab accepts two
        # we can do the location and scale to make them the same
        # argument "1" is for the "standardized" format
        # or we can do arbitrary type later
        # value[0]: lambda, value[1]: k (a for stats.gamma)
        a = value[1]
        loc = 0
        scale = 1. / value[0]  # stats.gamma uses "beta" instead of "lambda"
        f_distr = stats.gamma(a, loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, 0, npy.inf)

    elif key.startswith('Beta'):
        # compare the stats.beta with the one shown in the UQLab tutorial (input)
        # stats.beta accepts only one value, but UQLab accepts two
        # we can do the location and scale to make them the same
        # value[0]: alpha, value[1]: beta, no "loc" or "scale" needed
        # always in the range of [0, 1]
        alpha = value[0]
        beta = value[1]
        f_distr = stats.beta(alpha, beta)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, 0, 1)

    elif key.startswith('Exponential'):
        # value: lambda
        loc = 0
        scale = 1. / value
        f_distr = stats.expon(loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, 0, npy.inf)

    elif key.startswith('Lognormal'):
        # this part is very interesting
        # in UQLab they do Hermite for lognormal
        # and U the same as those from gaussian
        # then convert U to X using exp(U)
        # or they can specify arbitrary polynomial basis to be the same as here
        # we can do both, actually

        # value[0]: mu, value[1]:sigma
        s = value[1]
        loc = 0
        scale = npy.exp(value[0])
        f_distr = stats.lognorm(s, loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, 0, npy.inf)

    elif key.startswith('Gumbel'):
        # compare the stats.gumbel_r with the one shown in the UQLab tutorial (input)
        # stats.gumbel_r is standardized, while UQLab takes two parameters
        # we can use the location and scale to make them the same
        # value[0]: mu, value[1]: beta
        loc = value[0]
        scale = value[1]
        f_distr = stats.gumbel_r(loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, -npy.inf, npy.inf)

    elif key.startswith('Weibull'):
        # compare the stats.weibull_min with the one shown in the UQLab tutorial (input)
        # stats.weibull_min takes one shape value, but UQLab accepts two parameters
        # we can use the location and scale to make them the same
        # value[0]: lambda, value[1]: k
        k = value[1]
        loc = 0
        scale = value[0]
        f_distr = stats.weibull_min(k, loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, 0, npy.inf)

    elif key.startswith('Triangular'):
        # compare the stats.triang with the one shown in the UQLab tutorial (input)
        # stats.triang takes one shape value, but UQLab accepts two parameters
        # we can use the location and scale to make them the same
        # value: c, no "loc" and "scale" needed
        # always in the range of [0, 1]
        c = value
        f_distr = stats.triang(c)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, 0, 1)

    elif key.startswith('Logistic'):
        # compare the stats.logistic with the one shown in the UQLab tutorial (input)
        # stats.logistic is standardized, while UQLab takes two parameters
        # we can use the location and scale to make them the same
        # value[0]: location, value[1]: scale
        loc = value[0]
        scale = value[1]
        f_distr = stats.logistic(loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, -npy.inf, npy.inf)

    elif key.startswith('Laplace'):
        # compare the stats.laplace with the one shown in the UQLab tutorial (input)
        # stats.laplace is standardized, while UQLab takes two parameters
        # we can use the location and scale to make them the same
        # value[0]: location, value[1]: scale
        loc = value[0]
        scale = value[1]
        f_distr = stats.laplace(loc, scale)
        f0 = lambda x: f_distr.pdf(x)
        f = lambda x: f1(x) * f2(x) * f0(x)
        y = integrate.quad(f, -npy.inf, npy.inf)

    else:
        raise NotImplementedError('other types of statistical distributions are coming soon ...')

    return y[0]
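
# A minimal usage sketch (hypothetical inputs, not from the original): for
# f1 = f2 = x under the Uniform(-1, 1) weight this returns E[X^2] = 1/3.
# print(adaptive_integrate(lambda x: x, lambda x: x, 'Uniform', (-1, 2)))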
Example #26
def case1(index=CASE_1_ATTRIBUTE_INDEX,output=True,ret='accuracy'):

    accuracy_in_each_turn = list()

    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()

    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv","rb"),delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS,TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5


    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set,test_set = prep.split_sets(shuffled,TRAIN_TEST_RATIO,i)

        #parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(train_set,index,SPAM_ATTR_INDEX)

        sample_mean_word_ham = nb.take_mean_ham(train_set,index,SPAM_ATTR_INDEX)

        sample_variance_word_spam = nb.take_variance_spam(train_set,index,SPAM_ATTR_INDEX)

        sample_variance_word_ham = nb.take_variance_ham(train_set,index,SPAM_ATTR_INDEX)

        #sample standard deviations from sample variance
        sample_std_dev_spam = sample_variance_word_spam ** (1/2.0)
        
        sample_std_dev_ham = sample_variance_word_ham ** (1/2.0) 

        hits = 0.0
        misses = 0.0

        #number of instances correctly evaluated as spam
        correctly_is_spam = 0.0

        #total number of spam instances
        is_spam = 0.0

        #total number of instances evaluated as spam
        guessed_spam = 0.0

        #number of instances correctly evaluated as ham
        correctly_is_ham = 0.0

        #total number of ham instances
        is_ham = 0.0

        #total number of instances evaluated as ham
        guessed_ham = 0.0

        

        # now we test the hypothesis against the test set
        for row in test_set:
            
            # no need to divide by the normalization term, since we only want to know which is larger!
            posterior_spam = prior_spam * stats.laplace(sample_mean_word_spam, sample_std_dev_spam).pdf(row[index])

            posterior_ham = prior_ham * stats.laplace(sample_mean_word_ham, sample_std_dev_ham).pdf(row[index])
    
            # whichever is greater - that will be our evaluation
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0


            if(row[SPAM_ATTR_INDEX] == guess):
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if (row[SPAM_ATTR_INDEX] == 1 ):
                is_spam += 1
                
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1
          

        #accuracy = number of correctly evaluated instances/
        #           number of instances
        #
        #
        accuracy = hits/(hits+misses)


        #precision_spam = number of correctly evaluated instances as spam/
        #            number of spam instances
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(is_spam == 0):
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam/is_spam

        #recall_spam = number of correctly evaluated instances as spam/
        #         number of instances evaluated as spam
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(guessed_spam == 0):
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam/guessed_spam

        #precision_ham = number of correctly evaluated instances as ham/
        #            number of ham instances
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(is_ham == 0):
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham/is_ham

        #recall_ham = number of correctly evaluated instances as ham/
        #         number of instances evaluated as ham
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(guessed_ham == 0):
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham/guessed_ham

        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end

    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)

    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)

    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)

    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)

    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 1 - ONE ATTRIBUTE - USING LAPLACE MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: '+str(round(mean_accuracy,5))
        print 'STD. DEV. OF ACCURACY: '+str(round(std_dev_accuracy,5))
        print 'VARIANCE OF ACCURACY: '+str(round(variance_accuracy,8))
        print ''
        print 'MEAN PRECISION FOR SPAM: '+str(round(mean_precision_spam,5))
        print 'STD. DEV. OF PRECISION FOR SPAM: '+str(round(std_dev_precision_spam,5))
        print 'VARIANCE OF PRECISION FOR SPAM: '+str(round(variance_precision_spam,8))
        print ''
        print 'MEAN RECALL FOR SPAM: '+str(round(mean_recall_spam,5))
        print 'STD. DEV. OF RECALL FOR SPAM: '+str(round(std_dev_recall_spam,5))
        print 'VARIANCE OF RECALL FOR SPAM: '+str(round(variance_recall_spam,8))
        print ''
        print 'MEAN PRECISION FOR HAM: '+str(round(mean_precision_ham,5))
        print 'STD. DEV. OF PRECISION FOR HAM: '+str(round(std_dev_precision_ham,5))
        print 'VARIANCE OF PRECISION FOR HAM: '+str(round(variance_precision_ham,8))
        print ''
        print 'MEAN RECALL FOR HAM: '+str(round(mean_recall_ham,5))
        print 'STD. DEV. OF RECALL FOR HAM: '+str(round(std_dev_recall_ham,5))
        print 'VARIANCE OF RECALL FOR HAM: '+str(round(variance_recall_ham,8))

    # we'll only use these return values to compute rankings
    # for example in script which_attribute_case_1    
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret =='accuracy':
        return mean_accuracy
    else:
        print('UNKNOWN METRIC: ' + ret)
        sys.exit()
Example #27
from astroML.plotting import setup_text_plots
setup_text_plots(fontsize=8, usetex=True)

#------------------------------------------------------------
# Define the distribution parameters to be plotted
delta_values = [0.5, 1.0, 2.0]
linestyles = ['-', '--', ':']
mu = 0
x = np.linspace(-10, 10, 1000)

#------------------------------------------------------------
# plot the distributions
fig, ax = plt.subplots(figsize=(5, 3.75))

for delta, ls in zip(delta_values, linestyles):
    dist = laplace(mu, delta)

    plt.plot(x,
             dist.pdf(x),
             ls=ls,
             c='black',
             label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta))

plt.xlim(-6, 6)
plt.ylim(0, 1.0)

plt.xlabel('$x$')
plt.ylabel(r'$p(x|\mu,\Delta)$')
plt.title('Laplace Distribution')

plt.legend()
Example #28
ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.7001)
ax.set_ylabel('$p(x)$')
ax.xaxis.set_major_formatter(plt.NullFormatter())

# trick to show multiple legends
leg1 = ax.legend([l1], [l1.get_label()], loc=1)
leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2)
ax.add_artist(leg1)
ax.set_title(r'Skew $\Sigma$ and Kurtosis $K$')

# next show distributions with different kurtosis
ax = fig.add_subplot(212)
x = np.linspace(-5, 5, 1000)
l1, = ax.plot(x, stats.laplace(0, 1).pdf(x), '--k',
              label=r'${\rm Laplace,}\ K=+3$')
l2, = ax.plot(x, stats.norm(0, 1).pdf(x), '-k',
              label=r'${\rm Gaussian,}\ K=0$')
l3, = ax.plot(x, stats.cosine(0, 1).pdf(x), '-.k',
              label=r'${\rm Cosine,}\ K=-0.59$')
l4, = ax.plot(x, stats.uniform(-2, 4).pdf(x), ':k',
              label=r'${\rm Uniform,}\ K=-1.2$')

ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.55)
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')

# trick to show multiple legends
leg1 = ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=2)
Example #29
def run(dataset,
        measurements,
        eps=1.0,
        delta=0.0,
        bounded=True,
        engine='MD',
        options={},
        iters=10000,
        seed=None,
        metric='L2',
        elim_order=None,
        frequency=1,
        workload=None):
    """
    Run a mechanism that measures the given measurements and runs inference.
    This is a convenience method for running end-to-end experiments.
    """

    domain = dataset.domain
    total = None

    state = np.random.RandomState(seed)

    if len(measurements) >= 1 and type(measurements[0][0]) is str:
        matrix = lambda proj: sparse.eye(domain.project(proj).size())
        measurements = [(proj, matrix(proj)) for proj in measurements]

    l1 = 0
    l2 = 0
    for _, Q in measurements:
        l1 += np.abs(Q).sum(axis=0).max()
        try:
            l2 += Q.power(2).sum(axis=0).max()  # for sparse matrices
        except AttributeError:
            l2 += np.square(Q).sum(axis=0).max()  # for dense matrices

    if bounded:
        total = dataset.df.shape[0]
        l1 *= 2
        l2 *= 2

    if delta > 0:
        noise = norm(loc=0, scale=np.sqrt(l2 * 2 * np.log(2 / delta)) / eps)
    else:
        noise = laplace(loc=0, scale=l1 / eps)

    if workload is None:
        workload = measurements

    truth = []
    for proj, W, in workload:
        x = dataset.project(proj).datavector()
        y = W.dot(x)
        truth.append((W, y, proj))

    answers = []
    for proj, Q in measurements:
        x = dataset.project(proj).datavector()
        z = noise.rvs(size=Q.shape[0], random_state=state)
        y = Q.dot(x)
        answers.append((Q, y + z, 1.0, proj))

    estimator = FactoredInference(domain,
                                  metric=metric,
                                  iters=iters,
                                  warm_start=False,
                                  elim_order=elim_order)
    logger = Logger(estimator, true_answers=truth, frequency=frequency)
    model = estimator.estimate(answers,
                               total,
                               engine=engine,
                               callback=logger,
                               options=options)

    return model, logger, answers
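
# Hypothetical usage sketch (the `dataset` object and the measured projections
# below are illustrative assumptions, not from the original):
# model, logger, answers = run(dataset, [('A',), ('A', 'B')], eps=1.0, seed=0)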
Example #30
def case2(indexes=CASE_2_ATTRIBUTE_INDEXES,output=True):

    accuracy_in_each_turn = list()

    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()

    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv","rb"),delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS,TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5


    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set,test_set = prep.split_sets(shuffled,TRAIN_TEST_RATIO,i)


        #parameter estimation
        #but now we take 10 attributes into consideration
        sample_means_word_spam = list()
        sample_means_word_ham = list()

        sample_variances_word_spam = list()
        sample_variances_word_ham = list()

        for attr_index in indexes:

            sample_means_word_spam.append(nb.take_mean_spam(train_set,attr_index,SPAM_ATTR_INDEX))
            sample_means_word_ham.append(nb.take_mean_ham(train_set,attr_index,SPAM_ATTR_INDEX))

            sample_variances_word_spam.append(nb.take_variance_spam(train_set,attr_index,SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(nb.take_variance_ham(train_set,attr_index,SPAM_ATTR_INDEX))


        #sample standard deviations from sample variances
        sample_std_devs_spam = [x ** 0.5 for x in sample_variances_word_spam]
        sample_std_devs_ham = [x ** 0.5 for x in sample_variances_word_ham]

        hits = 0.0
        misses = 0.0

        #number of instances correctly evaluated as spam
        correctly_is_spam = 0.0

        #total number of spam instances
        is_spam = 0.0

        #total number of instances evaluated as spam
        guessed_spam = 0.0


        #number of instances correctly evaluated as ham
        correctly_is_ham = 0.0

        #total number of ham instances
        is_ham = 0.0

        #total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # i.e., the product of all the conditional word probabilities given the class.
            # I know it looks a bit confusing, but look closely: it is neat to do it all in a single line! =)
            product_of_all_conditional_probs_spam = np.prod([stats.laplace(sample_means_word_spam[cur], sample_std_devs_spam[cur]).pdf(row[indexes[cur]]) for cur in range(10)])
            # no need to divide by the normalization term, since we only want to know which is larger!
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam


            product_of_all_conditional_probs_ham = np.prod([stats.laplace(sample_means_word_ham[cur], sample_std_devs_ham[cur]).pdf(row[indexes[cur]]) for cur in range(10)])
            posterior_ham = prior_ham * product_of_all_conditional_probs_ham
    
            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if(row[SPAM_ATTR_INDEX] == guess):
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if (row[SPAM_ATTR_INDEX] == 1 ):
                is_spam += 1
                
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        #accuracy = number of correctly evaluated instances/
        #           number of instances
        #
        #
        accuracy = hits/(hits+misses)


        #precision_spam = number of correctly evaluated instances as spam/
        #            number of spam instances
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(is_spam == 0):
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam/is_spam

        #recall_spam = number of correctly evaluated instances as spam/
        #         number of instances evaluated as spam
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(guessed_spam == 0):
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam/guessed_spam

        #precision_ham = number of correctly evaluated instances as ham/
        #            number of ham instances
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(is_ham == 0):
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham/is_ham

        #recall_ham = number of correctly evaluated instances as ham/
        #         number of instances evaluated as ham
        #
        #
        # in order to avoid divisions by zero in case nothing was found
        if(guessed_ham == 0):
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham/guessed_ham


        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)


    # calculation of means for each metric at the end

    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)

    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)

    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)

    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)

    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 2 - TEN ATTRIBUTES - USING LAPLACE MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: '+str(round(mean_accuracy,5))
        print 'STD. DEV. OF ACCURACY: '+str(round(std_dev_accuracy,5))
        print 'VARIANCE OF ACCURACY: '+str(round(variance_accuracy,8))
        print ''
        print 'MEAN PRECISION FOR SPAM: '+str(round(mean_precision_spam,5))
        print 'STD. DEV. OF PRECISION FOR SPAM: '+str(round(std_dev_precision_spam,5))
        print 'VARIANCE OF PRECISION FOR SPAM: '+str(round(variance_precision_spam,8))
        print ''
        print 'MEAN RECALL FOR SPAM: '+str(round(mean_recall_spam,5))
        print 'STD. DEV. OF RECALL FOR SPAM: '+str(round(std_dev_recall_spam,5))
        print 'VARIANCE OF RECALL FOR SPAM: '+str(round(variance_recall_spam,8))
        print ''
        print 'MEAN PRECISION FOR HAM: '+str(round(mean_precision_ham,5))
        print 'STD. DEV. OF PRECISION FOR HAM: '+str(round(std_dev_precision_ham,5))
        print 'VARIANCE OF PRECISION FOR HAM: '+str(round(variance_precision_ham,8))
        print ''
        print 'MEAN RECALL FOR HAM: '+str(round(mean_recall_ham,5))
        print 'STD. DEV. OF RECALL FOR HAM: '+str(round(std_dev_recall_ham,5))
        print 'VARIANCE OF RECALL FOR HAM: '+str(round(variance_recall_ham,8))
Example #31
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt

distributions = {
    'normal': st.norm(loc=0, scale=1),
    'laplace': st.laplace(loc=0, scale=1 / np.sqrt(2)),
    'cauchy': st.cauchy(),
    'uniform': st.uniform(loc=-np.sqrt(3), scale=2 * np.sqrt(3)),
    'poisson': st.poisson(5)
}


def Zr(x):
    return (np.amin(x) + np.amax(x)) / 2


def Zq(x):
    return (np.quantile(x, 1 / 4) + np.quantile(x, 3 / 4)) / 2


def Ztr(x):
    # truncated (trimmed) mean; assumes x is already sorted
    n = x.size
    r = n // 4
    sum1 = 0
    for i in range(r, n - r):
        sum1 += x[i]
    return sum1 / (n - 2 * r)


pos_characteristics = {
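pos_characteristics is truncated in this excerpt and is left as-is. A small usage sketch of the three location estimators defined above (sample drawn here purely for illustration; note that Ztr iterates positionally, so it expects a sorted sample):

rng = np.random.default_rng(0)
sample = np.sort(rng.laplace(0, 1 / np.sqrt(2), size=100))
print(Zr(sample), Zq(sample), Ztr(sample))  # midrange, midquartile, trimmed mean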
Example #32
0
 def _kstest(self, loc, scale, samples):
     # Uses the Kolmogorov-Smirnov test for goodness of fit.
     ks, _ = sp_stats.kstest(samples,
                             sp_stats.laplace(loc, scale=scale).cdf)
     # Return True when the test passes.
     return ks < 0.02
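A self-contained version of the same check with illustrative loc/scale values (the 0.02 threshold is copied from the snippet above):

import scipy.stats as sp_stats

samples = sp_stats.laplace(0.0, scale=1.0).rvs(size=10000, random_state=0)
ks, _ = sp_stats.kstest(samples, sp_stats.laplace(0.0, scale=1.0).cdf)
print(ks < 0.02)  # samples drawn from the reference distribution should pass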
Example #33
0
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import laplace

# Set true theta parameter
theta = 5

# Define the random variable based on that
f_Xi = laplace(loc=theta, scale=1)

# Set sample size (n) and number of bootstrap iterations (B)
n = 50
B = 100000

# Compute Cramér-Rao lower bound (CRLB)
CRLB = 1 / n


# Define theta_hat (which uses all bootstrap samples as an input and calculates B theta_hats)
def theta_hats(X):
    return np.median(X, axis=0)


# Calculate variance of theta_hat across bootstrap samples
var_theta_hats = theta_hats(f_Xi.rvs(size=(n, B))).var()

# Print the CRLB, variance of the theta_hats, and percentage deviance
print('CRLB = ' + str(CRLB) + ', Var(theta_hat_MM) = ' + str(var_theta_hats) +
      r', % deviation: ' + str(np.abs(var_theta_hats - CRLB) / CRLB * 100))
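A quick sanity check of why the printed deviation should be small: for Laplace(theta, b=1) the score is d/dtheta log f(x) = sign(x - theta), so the Fisher information per observation is 1 and CRLB = 1/n; the sample median has asymptotic variance 1/(4 n f(theta)^2) = 1/n since f(theta) = 1/2, i.e. the median is asymptotically efficient here. A two-line check reusing the names above:

# asymptotic variance of the median; equals CRLB = 1/n because f_Xi.pdf(theta) == 0.5
print(1 / (4 * n * f_Xi.pdf(theta) ** 2), CRLB)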
Example #34
0
plt.figure(3)
plt.plot(support[ix], rv.pdf(support[ix]), label='Actual')
plt.plot(support[ix], dens_normal.pdf()[ix], label='Scott')
plt.plot(support[ix], dens_cvls.pdf()[ix], label='CV_LS')
plt.plot(support[ix], dens_cvml.pdf()[ix], label='CV_ML')
plt.title("Nonparametric Estimation of the Density of Pareto " \
          "Distributed Random Variable")
plt.legend(('Actual', 'Scott', 'CV_LS', 'CV_ML'))

# Laplace Distribution
mu = 0
s = 1
nobs = 250

support = np.random.laplace(mu, s, size=nobs)
rv = stats.laplace(mu, s)
ix = np.argsort(support)

dens_normal = KDEMultivariate(data=[support], var_type='c', bw='normal_reference')
dens_cvls = KDEMultivariate(data=[support], var_type='c', bw='cv_ls')
dens_cvml = KDEMultivariate(data=[support], var_type='c', bw='cv_ml')

plt.figure(4)
plt.plot(support[ix], rv.pdf(support[ix]), label='Actual')
plt.plot(support[ix], dens_normal.pdf()[ix], label='Scott')
plt.plot(support[ix], dens_cvls.pdf()[ix], label='CV_LS')
plt.plot(support[ix], dens_cvml.pdf()[ix], label='CV_ML')
plt.title("Nonparametric Estimation of the Density of Laplace " \
          "Distributed Random Variable")
plt.legend(('Actual', 'Scott', 'CV_LS', 'CV_ML'))
Example #35
0
import scipy.stats as st
import numpy as np

norm = st.norm(loc=0, scale=1)

N_laplace = 25
laplace = st.laplace(loc=0, scale=1 / np.sqrt(2))
x_laplace = laplace.rvs(N_laplace)
file = open('laplace2.txt', 'w')

m = x_laplace.mean()
sigma = x_laplace.std()
file.write('m = ' + str(m) + ', sigma = ' + str(sigma) + '\n')
k = int(1.72 * np.cbrt(N_laplace))

if k % 2 == 1:
    k -= 1

left = -0.2
right = 0.2
step = (right - left) / (k - 2)

delta = [left + i * step for i in range(0, k - 1)]

n = np.zeros(k)
for r in x_laplace:
    last = True
    for i in range(0, len(delta)):
        if r < delta[i]:
            n[i] += 1
            last = False
            break  # count the sample only in the first interval it falls into
    if last:  # sample lies to the right of every edge: it belongs to the last bin
        n[k - 1] += 1
Example #36
0
#   "Statistics, Data Mining, and Machine Learning in Astronomy" (2013)
#   For more information, see http://astroML.github.com
import numpy as np
from scipy.stats import laplace
from matplotlib import pyplot as plt

#------------------------------------------------------------
# Define the distribution parameters to be plotted
delta_values = [0.5, 1.0, 2.0]
linestyles = ['-', '--', ':']
mu = 0
x = np.linspace(-10, 10, 1000)

#------------------------------------------------------------
# plot the distributions
for delta, ls in zip(delta_values, linestyles):
    dist = laplace(mu, delta)

    plt.plot(x, dist.pdf(x), ls=ls, c='black',
             label=r'$\mu=%i,\ \Delta=%.1f$' % (mu, delta), lw=2)

plt.xlim(-7, 7)
plt.ylim(0, 1.1)

plt.xlabel('$x$', fontsize=14)
plt.ylabel(r'$P(x|\mu,\Delta)$', fontsize=14)
plt.title('Laplace Distribution')

plt.legend()
plt.show()
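For reference, the curves above follow the double-exponential density p(x|\mu,\Delta) = \frac{1}{2\Delta} \exp(-|x-\mu|/\Delta), which is exactly what laplace(mu, delta).pdf evaluates with loc=mu and scale=delta.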
Example #37
0
ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.7001)
ax.set_ylabel('$p(x)$')
ax.xaxis.set_major_formatter(plt.NullFormatter())

# trick to show multiple legends
leg1 = ax.legend([l1], [l1.get_label()], loc=1)
leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2)
ax.add_artist(leg1)
ax.set_title('Skew $\Sigma$ and Kurtosis $K$')

# next show distributions with different kurtosis
ax = fig.add_subplot(212)
x = np.linspace(-5, 5, 1000)
l1, = ax.plot(x,
              stats.laplace(0, 1).pdf(x),
              '--k',
              label=r'${\rm Laplace,}\ K=+3$')
l2, = ax.plot(x,
              stats.norm(0, 1).pdf(x),
              '-k',
              label=r'${\rm Gaussian,}\ K=0$')
l3, = ax.plot(x,
              stats.cosine(0, 1).pdf(x),
              '-.k',
              label=r'${\rm Cosine,}\ K=-0.59$')
l4, = ax.plot(x,
              stats.uniform(-2, 4).pdf(x),
              ':k',
              label=r'${\rm Uniform,}\ K=-1.2$')
Example #38
0
def case3(output=True):

    accuracy_in_each_turn = list()

    precision_in_each_turn_spam = list()
    recall_in_each_turn_spam = list()

    precision_in_each_turn_ham = list()
    recall_in_each_turn_ham = list()

    m = np.loadtxt(open("resources/normalized_data.csv","rb"),delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS,TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5


    for i in xrange(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set,test_set = prep.split_sets(shuffled,TRAIN_TEST_RATIO,i)

        #parameter estimation
        #but now we take ALL attributes into consideration
        sample_means_word_spam = list()
        sample_means_word_ham = list()

        sample_variances_word_spam = list()
        sample_variances_word_ham = list()

        # all but the last one
        for attr_index in xrange(57):

            sample_means_word_spam.append(nb.take_mean_spam(train_set,attr_index,SPAM_ATTR_INDEX))
            sample_means_word_ham.append(nb.take_mean_ham(train_set,attr_index,SPAM_ATTR_INDEX))

            sample_variances_word_spam.append(nb.take_variance_spam(train_set,attr_index,SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(nb.take_variance_ham(train_set,attr_index,SPAM_ATTR_INDEX))


        #sample standard deviations from sample variances
        sample_std_devs_spam = map(lambda x: x ** (1/2.0), sample_variances_word_spam)
        sample_std_devs_ham = map(lambda x: x ** (1/2.0), sample_variances_word_ham)

        hits = 0.0
        misses = 0.0

        #number of instances correctly evaluated as spam
        correctly_is_spam = 0.0

        #total number of spam instances
        is_spam = 0.0

        #total number of instances evaluated as spam
        guessed_spam = 0.0


        #number of instances correctly evaluated as ham
        correctly_is_ham = 0.0

        #total number of ham instances
        is_ham = 0.0

        #total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # i.e., the product of all the word conditional probabilities given the class
            # I know it looks a bit confusing, but look closely: it's neat to do it all in a single line! =)
            # note: case 3 uses ALL 57 attributes, so we iterate over every attribute
            # index directly (the original line reused case 2's ten-attribute lookup by mistake)
            product_of_all_conditional_probs_spam = reduce(lambda acc,cur: acc * stats.laplace(sample_means_word_spam[cur], sample_std_devs_spam[cur]).pdf(row[cur]), xrange(57), 1)
            # no need to divide by the normalization term since we only want to know which one is larger!
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam


            product_of_all_conditional_probs_ham = reduce(lambda acc,cur: acc * stats.laplace(sample_means_word_ham[cur], sample_std_devs_ham[cur]).pdf(row[cur]), xrange(57), 1)
            posterior_ham = prior_ham * product_of_all_conditional_probs_ham
    
            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if(row[SPAM_ATTR_INDEX] == guess):
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if (row[SPAM_ATTR_INDEX] == 1 ):
                is_spam += 1
                
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1

                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        #accuracy = number of correctly evaluated instances/
        #           number of instances
        #
        #
        accuracy = hits/(hits+misses)


        #precision_spam = number of instances correctly evaluated as spam /
        #                 total number of spam instances
        # (note: under the conventional definitions this ratio is the recall
        #  for spam; the original variable naming is kept)
        #
        # guard against division by zero in case there are no spam instances
        if(is_spam == 0):
            precision_spam = 0
        else:
            precision_spam = correctly_is_spam/is_spam

        #recall_spam = number of instances correctly evaluated as spam /
        #              total number of instances evaluated as spam
        # (conventionally, this ratio is the precision for spam)
        #
        # guard against division by zero in case nothing was classified as spam
        if(guessed_spam == 0):
            recall_spam = 0
        else:
            recall_spam = correctly_is_spam/guessed_spam

        #precision_ham = number of instances correctly evaluated as ham /
        #                total number of ham instances
        # (conventionally, this ratio is the recall for ham)
        #
        # guard against division by zero in case there are no ham instances
        if(is_ham == 0):
            precision_ham = 0
        else:
            precision_ham = correctly_is_ham/is_ham

        #recall_ham = number of instances correctly evaluated as ham /
        #             total number of instances evaluated as ham
        # (conventionally, this ratio is the precision for ham)
        #
        # guard against division by zero in case nothing was classified as ham
        if(guessed_ham == 0):
            recall_ham = 0
        else:
            recall_ham = correctly_is_ham/guessed_ham


        accuracy_in_each_turn.append(accuracy)

        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)

        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # calculation of means for each metric at the end

    mean_accuracy = np.mean(accuracy_in_each_turn)
    std_dev_accuracy = np.std(accuracy_in_each_turn)
    variance_accuracy = np.var(accuracy_in_each_turn)

    mean_precision_spam = np.mean(precision_in_each_turn_spam)
    std_dev_precision_spam = np.std(precision_in_each_turn_spam)
    variance_precision_spam = np.var(precision_in_each_turn_spam)

    mean_recall_spam = np.mean(recall_in_each_turn_spam)
    std_dev_recall_spam = np.std(recall_in_each_turn_spam)
    variance_recall_spam = np.var(recall_in_each_turn_spam)

    mean_precision_ham = np.mean(precision_in_each_turn_ham)
    std_dev_precision_ham = np.std(precision_in_each_turn_ham)
    variance_precision_ham = np.var(precision_in_each_turn_ham)

    mean_recall_ham = np.mean(recall_in_each_turn_ham)
    std_dev_recall_ham = np.std(recall_in_each_turn_ham)
    variance_recall_ham = np.var(recall_in_each_turn_ham)    

    if output:
        print "\033[1;32m"
        print '============================================='
        print 'CASE 3 - ALL ATTRIBUTES - USING LAPLACE MODEL'
        print '============================================='
        print "\033[00m"
        print 'MEAN ACCURACY: '+str(round(mean_accuracy,5))
        print 'STD. DEV. OF ACCURACY: '+str(round(std_dev_accuracy,5))
        print 'VARIANCE OF ACCURACY: '+str(round(variance_accuracy,8))
        print ''
        print 'MEAN PRECISION FOR SPAM: '+str(round(mean_precision_spam,5))
        print 'STD. DEV. OF PRECISION FOR SPAM: '+str(round(std_dev_precision_spam,5))
        print 'VARIANCE OF PRECISION FOR SPAM: '+str(round(variance_precision_spam,8))
        print ''
        print 'MEAN RECALL FOR SPAM: '+str(round(mean_recall_spam,5))
        print 'STD. DEV. OF RECALL FOR SPAM: '+str(round(std_dev_recall_spam,5))
        print 'VARIANCE OF RECALL FOR SPAM: '+str(round(variance_recall_spam,8))
        print ''
        print 'MEAN PRECISION FOR HAM: '+str(round(mean_precision_ham,5))
        print 'STD. DEV. OF PRECISION FOR HAM: '+str(round(std_dev_precision_ham,5))
        print 'VARIANCE OF PRECISION FOR HAM: '+str(round(variance_precision_ham,8))
        print ''
        print 'MEAN RECALL FOR HAM: '+str(round(mean_recall_ham,5))
        print 'STD. DEV. OF RECALL FOR HAM: '+str(round(std_dev_recall_ham,5))
        print 'VARIANCE OF RECALL FOR HAM: '+str(round(variance_recall_ham,8))   
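A note on numerical stability: multiplying 57 Laplace densities as done above can underflow to zero for both classes. A hedged sketch of the same decision rule in log space (hypothetical helper reusing the per-class means/std devs built in case3; not part of the original script):

def log_posterior_guess(row, prior_spam, prior_ham,
                        means_spam, stds_spam, means_ham, stds_ham):
    # sum of log-densities instead of a product of densities: same argmax, no underflow
    log_post_spam = np.log(prior_spam) + sum(
        stats.laplace(means_spam[i], stds_spam[i]).logpdf(row[i]) for i in xrange(57))
    log_post_ham = np.log(prior_ham) + sum(
        stats.laplace(means_ham[i], stds_ham[i]).logpdf(row[i]) for i in xrange(57))
    return 1 if log_post_spam > log_post_ham else 0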
Example #39
0
# imports assumed by this snippet (not shown in the original excerpt)
import numpy as np
import xgboost as xgb
from scipy.stats import laplace
from sklearn.multioutput import MultiOutputRegressor


def CI_sampler_regressor(X_in,
                         Y_in,
                         Z_in,
                         train_len=-1,
                         nthread=4,
                         max_depth=6,
                         colsample_bytree=0.8,
                         n_estimators=200,
                         noise='Normal',
                         perc=0.3):
    np.random.seed(11)
    assert (type(X_in) == np.ndarray), "Not an array"
    assert (type(Y_in) == np.ndarray), "Not an array"
    assert (type(Z_in) == np.ndarray), "Not an array"

    nx, dx = X_in.shape
    ny, dy = Y_in.shape
    nz, dz = Z_in.shape

    assert (nx == ny), "Dimension Mismatch"
    assert (nz == ny), "Dimension Mismatch"
    assert (nx == nz), "Dimension Mismatch"

    samples = np.hstack([X_in, Y_in, Z_in]).astype(np.float32)

    if train_len == -1:
        train_len = 2 * len(X_in) // 3  # integer division: train_len is used as a slice index

    assert (train_len <
            nx), "Training length cannot be larger than total length"

    data1 = samples[0:nx // 2, :]  # integer division: slice indices must be ints in Python 3
    data2 = samples[nx // 2::, :]

    multioutputregressor = MultiOutputRegressor(
        estimator=xgb.XGBRegressor(objective='reg:linear',
                                   max_depth=max_depth,
                                   colsample_bytree=1.0,
                                   n_estimators=n_estimators,
                                   nthread=nthread))

    Xset = range(0, dx)
    Yset = range(dx, dx + dy)
    Zset = range(dx + dy, dx + dy + dz)

    X1, Y1, Z1 = data1[:, Xset], data1[:, Yset], data1[:, Zset]
    X2, Y2, Z2 = data2[:, Xset], data2[:, Yset], data2[:, Zset]

    if noise == 'Normal':
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        cov = np.cov(np.transpose(Y1hat - Y1))

        print('Calculated Covariance: ')
        print(cov)

        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        try:
            m1, m2 = cov.shape
            Nprime = np.random.multivariate_normal(np.zeros(m1), cov, size=n2)
        except:
            m1 = 1
            Nprime = np.random.normal(scale=np.sqrt(cov), size=[n2, 1])

    elif noise == 'Laplace':
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        E = Y1 - Y1hat
        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        p, q = E.shape
        s = np.std(E[:, 0])
        L = laplace()
        r = L.rvs(size=(n2, 1))
        s2 = np.std(r)
        r = (s / s2) * r
        Nprime = r

        for l in range(1, q):
            s = np.std(E[:, l])
            L = laplace()
            r = L.rvs(size=(n2, 1))
            s2 = np.std(r)
            r = (s / s2) * r
            Nprime = np.hstack((Nprime, r))  # stack column-wise so Nprime matches Yprime's (n2, q) shape
    elif noise == 'Mixture':
        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        cov = np.cov(np.transpose(Y1hat - Y1))

        print('Calculated Covariance: ')
        print(cov)

        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        try:
            m1, m2 = cov.shape
            NprimeG = np.random.multivariate_normal(np.zeros(m1), cov, size=n2)  # Gaussian component (name matched to the except branch)
        except:
            m1 = 1
            NprimeG = np.random.normal(scale=np.sqrt(cov), size=[n2, 1])

        MOR = multioutputregressor.fit(Z1, Y1)
        Y1hat = MOR.predict(Z1)
        E = Y1 - Y1hat
        Yprime = MOR.predict(Z2)
        n2, n22 = data2.shape
        p, q = E.shape
        s = np.std(E[:, 0])
        L = laplace()
        r = L.rvs(size=(n2, 1))
        s2 = np.std(r)
        r = (s / s2) * r
        Nprime = r

        for l in range(1, q):
            s = np.std(E[:, l])
            L = laplace()
            r = L.rvs(size=(n2, 1))
            s2 = np.std(r)
            r = (s / s2) * r
            Nprime = np.hstack((Nprime, r))  # column-wise, as in the Laplace branch
        # once all columns are assembled, swap a random fraction of rows to Gaussian noise
        indices = np.random.choice(n2, size=int(perc * n2), replace=False)
        Nprime[indices, :] = NprimeG[indices, :]

    else:
        assert False, 'Not Implemented Error'

    yprime = Yprime + Nprime

    data2_new = np.hstack([X2, yprime, Z2])

    y1 = np.ones([len(data1), 1])
    y2 = np.zeros([len(data2_new), 1])

    at1 = np.hstack([data1, y1])
    at2 = np.hstack([data2_new, y2])

    all_train = np.vstack([at1, at2])

    shuffle = np.random.permutation(len(all_train))
    data_final = all_train[shuffle, :]
    l, m = data_final.shape
    Xdata = data_final[:, 0:m - 1]
    Ydata = data_final[:, m - 1]

    Xtrain = Xdata[0:train_len, :]
    Ytrain = Ydata[0:train_len]

    Xtest = Xdata[train_len::, :]
    Ytest = Ydata[train_len::]

    return Xtrain, Ytrain, Xtest, Ytest
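A hypothetical call, with array shapes chosen to satisfy the asserts at the top of the function (requires numpy, xgboost and scikit-learn; illustrative only):

X = np.random.randn(300, 1)
Y = np.random.randn(300, 2)
Z = np.random.randn(300, 3)
Xtrain, Ytrain, Xtest, Ytest = CI_sampler_regressor(X, Y, Z, noise='Laplace')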
Example #40
0
plt.axis([-20, 20, 0, 0.10])
plt.text(-18,0.08,'n=10000')

plt.subplot(2,2,4)
x = rv.rvs(size=100000)
n, bins, patches = plt.hist(x, 20, normed=1, facecolor='magenta', alpha=0.5)
plt.plot(x1, y1, 'r', lw=3)
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
plt.axis([-20, 20, 0, 0.10])
plt.text(-18,0.08,'n=100000')

plt.savefig('/home/tomer/my_books/python_in_hydrology/images/rand_theo.png')

# LAPLACE DISTRIBUION
rv = st.laplace(loc=0, scale=15)

x1 = np.linspace(-100, 100, 1000)
y1 = rv.pdf(x1)

# compute and plot pdf
plt.clf()
fig = plt.figure()
fig.subplots_adjust(wspace=0.4)

plt.subplot(2,2,1)
x = rv.rvs(size=100)
n, bins, patches = plt.hist(x, 20, normed=1, facecolor='yellow', alpha=0.5)
plt.plot(x1, y1, 'r', lw=3, label='scale=5')
plt.xlabel('X', fontsize=15)
plt.ylabel('PDF', fontsize=15)
Example #41
0
def plot_result(color, distrib, x_a=-4, x_b=4):
    x = 0
    y = 0
    x_dist = np.linspace(x_a, x_b, 3000)
    if distrib == "Standard normal":
        y = st.norm.cdf(x_dist, 0, 1)
        y_p = st.norm.pdf(x_dist, 0, 1)
        x = st.norm(loc=0., scale=1.)
    elif distrib == "Uniform":
        y_p = st.uniform.pdf(x_dist, -3**0.5, 2 * (3**0.5))
        y = st.uniform.cdf(x_dist, -3**0.5, 2 * (3**0.5))
        x = st.uniform(loc=-3**0.5, scale=2 * (3**0.5))
    elif distrib == "Cauchy":
        y = st.cauchy.cdf(x_dist, 0, 1)
        y_p = st.cauchy.pdf(x_dist, 0, 1)
        x = st.cauchy(loc=0, scale=1)
    elif distrib == "Laplace":
        y = st.laplace.cdf(x_dist, 0, (2**(-0.5)))
        y_p = st.laplace.pdf(x_dist, 0, (2**(-0.5)))
        x = st.laplace(loc=0, scale=(2**(-0.5)))
    elif distrib == "Poisson":
        y = st.poisson.cdf(x_dist, mu=2)
        y_p = np.exp(-2) * np.power(2, x_dist) / factorial(x_dist)
        x = st.poisson(mu=2)

    for i in n_vec:
        sample_x = np.sort(x.rvs(i))

        cdf = ecdf(sample_x)

        plt.step(sample_x,
                 cdf,
                 color="darkslategrey",
                 label="Empirical distribution function")
        plt.plot(x_dist, y, color=color)

        plt.legend()
        plt.xlim(x_a, x_b)
        plt.tight_layout()
        plt.show()

    for i in n_vec:
        sample_x = np.sort(x.rvs(i))

        fig = plt.figure(num="ker_" + str(distrib) + "_" + str(i),
                         figsize=(11, 4))
        ind = 1
        for h in [1, 3, 5]:
            ax = fig.add_subplot(1, 3, ind)
            sns.kdeplot(sample_x,
                        color="darkslategrey",
                        bw=h,
                        ax=ax,
                        label=("Kernel density estimation" if h == 1 else ""))
            if h == 5:
                ax.legend(["Kernel density estimation"],
                          loc="upper center",
                          bbox_to_anchor=(0.5, -0.25))

            plt.plot(x_dist, y_p, color=color)

            plt.xlim(x_a, x_b)
            plt.title("h = " + str(h))
            ind = ind + 1

        plt.tight_layout()
        plt.show()
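plot_result relies on an ecdf helper that is not shown in this excerpt; a minimal sketch of what it presumably computes (empirical CDF values at each sorted observation):

def ecdf(sorted_sample):
    # step heights of the empirical distribution function
    n = len(sorted_sample)
    return np.arange(1, n + 1) / n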
Example #42
0
from scipy.stats import norm, cauchy, laplace, poisson, uniform
import numpy as np
import matplotlib.pyplot as plt


def quart(data, p):
    # index of the p-th sample quantile; data must be sorted beforehand
    idx = int(np.ceil(len(data) * p))
    return data[idx]


sampleSizes = [20, 100]
distrs = [(norm(0, 1), "normal"), (cauchy(0, 1), "cauchy"),
          (laplace(0, 1 / np.sqrt(2)), "laplace"), (poisson(10), "poisson"),
          (uniform(-np.sqrt(3), 2 * np.sqrt(3)), "uniform")]

for distr in distrs:
    fig, axs = plt.subplots(2)
    for i in range(len(sampleSizes)):
        rvs = distr[0].rvs(sampleSizes[i])
        rvs.sort()
        axs[i].boxplot(rvs, vert=False)
        axs[i].set_ylabel(str(sampleSizes[i]), fontsize=8)

    fig.savefig('Boxplots/' + distr[1] + '.png')

table = open("TablesLab3_1.tex", 'w', encoding="utf-8")
table.writelines("\\begin{table}[h]\n"
                 "\centering\n"
                 "\\begin{tabular}{ |" + "c|" * 2 + " }\n"
                 "\hline\n"
                 "Выборка & Доля выбросов \\\\\n"
Example #43
0
ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.7001)
ax.set_ylabel("$p(x)$", fontsize=16)
ax.xaxis.set_major_formatter(plt.NullFormatter())

# trick to show multiple legends
leg1 = ax.legend([l1], [l1.get_label()], loc=1)
leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2)
ax.add_artist(leg1)
ax.set_title("Skew $\Sigma$ and Kurtosis $K$")

# next show distributions with different kurtosis
ax = fig.add_subplot(212)
x = np.linspace(-5, 5, 1000)
l1, = ax.plot(x, stats.laplace(0, 1).pdf(x), "--k", label=r"${\rm Laplace,}\ K=+3$")
l2, = ax.plot(x, stats.norm(0, 1).pdf(x), "-k", label=r"${\rm Gaussian,}\ K=0$")
l3, = ax.plot(x, stats.cosine(0, 1).pdf(x), "-.k", label=r"${\rm Cosine,}\ K=-0.59$")
l4, = ax.plot(x, stats.uniform(-2, 4).pdf(x), ":k", label=r"${\rm Uniform,}\ K=-1.2$")

ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.6001)
ax.set_xlabel("$x$", fontsize=16)
ax.set_ylabel("$p(x)$", fontsize=16)

# trick to show multiple legends
leg1 = ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=2)
leg2 = ax.legend((l3, l4), (l3.get_label(), l4.get_label()), loc=1)
ax.add_artist(leg1)

plt.show()
Example #44
0
def all_dists():
    # dists params were taken from the official scipy.stats
    # documentation examples
    # Total - 89
    return {
        "alpha":
        stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit":
        stats.anglit(loc=0.0, scale=1.0),
        "arcsine":
        stats.arcsine(loc=0.0, scale=1.0),
        "beta":
        stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime":
        stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford":
        stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr":
        stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy":
        stats.cauchy(loc=0.0, scale=1.0),
        "chi":
        stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2":
        stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine":
        stats.cosine(loc=0.0, scale=1.0),
        "dgamma":
        stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull":
        stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang":
        stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon":
        stats.expon(loc=0.0, scale=1.0),
        "exponnorm":
        stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib":
        stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow":
        stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f":
        stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife":
        stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk":
        stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy":
        stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm":
        stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic":
        stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto":
        stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm":
        stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon":
        stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme":
        stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper":
        stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        "gamma":
        stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma":
        stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic":
        stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat":
        stats.gilbrat(loc=0.0, scale=1.0),
        "gompertz":
        stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r":
        stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l":
        stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy":
        stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic":
        stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm":
        stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm":
        stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant":
        stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma":
        stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss":
        stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull":
        stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb":
        stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu":
        stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone":
        stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign":
        stats.kstwobign(loc=0.0, scale=1.0),
        "laplace":
        stats.laplace(loc=0.0, scale=1.0),
        "levy":
        stats.levy(loc=0.0, scale=1.0),
        "levy_l":
        stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable":
        stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        "logistic":
        stats.logistic(loc=0.0, scale=1.0),
        "loggamma":
        stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace":
        stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm":
        stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax":
        stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell":
        stats.maxwell(loc=0.0, scale=1.0),
        "mielke":
        stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami":
        stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2":
        stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf":
        stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct":
        stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm":
        stats.norm(loc=0.0, scale=1.0),
        "pareto":
        stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3":
        stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw":
        stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm":
        stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm":
        stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist":
        stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal":
        stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh":
        stats.rayleigh(loc=0.0, scale=1.0),
        "rice":
        stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss":
        stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular":
        stats.semicircular(loc=0.0, scale=1.0),
        "t":
        stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang":
        stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon":
        stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm":
        stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda":
        stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform":
        stats.uniform(loc=0.0, scale=1.0),
        "vonmises":
        stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line":
        stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald":
        stats.wald(loc=0.0, scale=1.0),
        "weibull_min":
        stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max":
        stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy":
        stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
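A possible way to exercise the table, drawing from every frozen distribution (illustrative; assumes the scipy.stats import used above):

for name, dist in sorted(all_dists().items()):
    draws = dist.rvs(size=1000, random_state=0)
    print(name, draws.mean(), draws.std())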
Example #45
0
 def _kstest(self, loc, scale, samples):
   # Uses the Kolmogorov-Smirnov test for goodness of fit.
   ks, _ = stats.kstest(samples, stats.laplace(loc, scale=scale).cdf)
   # Return True when the test passes.
   return ks < 0.02
Example #46
0
    sstot = np.sum((y - ybar)**2)
    r2 = ssreg / sstot

    plt.plot(x_n, ffit, label='order {}, $R^2$= {:.2f}'.format(i, r2))

plt.legend(loc=2, fontsize=14)
plt.xlabel('$x$', fontsize=14)
plt.ylabel('$y$', fontsize=14, rotation=0)
plt.savefig('img602.png', dpi=300, figsize=[5.5, 5.5])

plt.figure()

plt.figure(figsize=(8, 6))
x_values = np.linspace(-10, 10, 300)
for df in [1, 2, 5, 15]:
    distri = stats.laplace(scale=df)
    x_pdf = distri.pdf(x_values)
    plt.plot(x_values, x_pdf, label='$b$ = {}'.format(df))

x_pdf = stats.norm.pdf(x_values)
plt.plot(x_values, x_pdf, label='Gaussian')
plt.xlabel('x')
plt.ylabel('p(x)', rotation=0)
plt.legend(loc=0, fontsize=14)
plt.xlim(-7, 7)
plt.savefig('img603.png', dpi=300, figsize=[5.5, 5.5])

plt.figure()

x_1 = np.array([10., 8., 13., 9., 11., 14., 6., 4., 12., 7., 5.])
y_1 = np.array(
Example #47
0
ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.7001)
ax.set_ylabel('$p(x)$')
ax.xaxis.set_major_formatter(plt.NullFormatter())

# trick to show multiple legends
leg1 = ax.legend([l1], [l1.get_label()], loc=1)
leg2 = ax.legend([l2, l3], (l2.get_label(), l3.get_label()), loc=2)
ax.add_artist(leg1)
ax.set_title('Skew $\Sigma$ and Kurtosis $K$')

# next show distributions with different kurtosis
ax = fig.add_subplot(212)
x = np.linspace(-5, 5, 1000)
l1, = ax.plot(x, stats.laplace(0, 1).pdf(x), '--k',
              label=r'${\rm Laplace,}\ K=+3$')
l2, = ax.plot(x, stats.norm(0, 1).pdf(x), '-k',
              label=r'${\rm Gaussian,}\ K=0$')
l3, = ax.plot(x, stats.cosine(0, 1).pdf(x), '-.k',
              label=r'${\rm Cosine,}\ K=-0.59$')
l4, = ax.plot(x, stats.uniform(-2, 4).pdf(x), ':k',
              label=r'${\rm Uniform,}\ K=-1.2$')

ax.set_xlim(-5, 5)
ax.set_ylim(0, 0.55)
ax.set_xlabel('$x$')
ax.set_ylabel('$p(x)$')

# trick to show multiple legends
leg1 = ax.legend((l1, l2), (l1.get_label(), l2.get_label()), loc=2)
Example #48
0
def prepare_widgets():
    print "initializing..."
    # start bokeh-server session
    global client
    client = Session(root_url='http://0.0.0.0:7010/', load_from_config=False)
    try:
        client.register(bs_login,bs_password)
    except:pass

    client.login(bs_login,bs_password)

    ###CREATE WIDGETS
    print "preaparing widgets..."
    #hist1: hist with overlay
    import analysis.distfit as distfit
    import pandas as pd
    xname,xmin,xmax,xbins = "invariantMass",0,10,50


    bin_separators = np.histogram([],bins=xbins, range=[xmin,xmax])[1]
    bin_centers = np.array([0.5*(bin_separators[i]+bin_separators[i+1]) for i in range(len(bin_separators)-1)])
    bins = pd.DataFrame({"x":bin_centers})

    expo_gap = lambda x,slope: (x>=xmin)*(x<=xmax)*distfit.exponential(x-xmin,slope)/(1.-np.e**(-(xmax-xmin)*slope))
    mix_model = distfit.DistributionsMixture(
        distributions={'sig': distfit.gauss, 'bck': expo_gap},
        weights_ranges={'sig': [1.,10.], 'bck': [1.,10.]},
        parameter_ranges={'mean': [xmin ,xmax], 'sigma': [0., xmax-xmin], 'slope': [0, 15.]},
        column_ranges={'x': [xmin, xmax]},
        sampling_strategy='grid',
    )


    mix_model.compile(bins,1000) #takes several seconds



    hist1_base = WhiskeredHistWidget(xname,xmin,xmax,xbins,es,
                       fig = figure(plot_width=600, plot_height=600,tools=['wheel_zoom','ywheel_zoom','pan','resize','reset']))
    hist1 = MLFitOverlayWidget(hist1_base,mix_model,n_pts=100)
    widgets.append(hist1)

    #hist2: just hist
    hist2 = ClassicHistWidget("muonHits",0,100,30,es,
                       fig = figure(plot_width=600, plot_height=600,tools=['wheel_zoom','ywheel_zoom','pan','reset']))
    widgets.append(hist2)


    #hist3: heatmap
    hist3 = HeatmapWidget("avgMass",0,35,50,
                          "muonHits",0,70,50,
                          es,fig = figure(plot_width=600, plot_height=600),)
    widgets.append(hist3)

    #hist4: hist with reference
    hist4_base = ClassicHistWidget("dskkpi",1920,2020,30,es,
                       fig = figure(plot_width=600, plot_height=600,tools=['wheel_zoom','ywheel_zoom','pan','reset']))
    
    from scipy.stats import laplace
    pdf = laplace(1970,7).pdf
    hist4 = ReferenceOverlay(hist4_base,pdf)
    widgets.append(hist4)    
    
    ###end CREATE PLOTS

    print "publishing plots..."
    #create a dashboard on bokeh_server
    output_server(dashboard_name,client)

    plots = [ hplot(widget.fig) for widget in widgets ]

    global whole_dashboard
    whole_dashboard = vplot(hplot(*plots[:2]),hplot(*plots[2:]))
    plots.append(whole_dashboard)    

    for plot in plots:
        client.show(plot)


    client.publish()
 
    print "creating static links..."
    #publish the thing
    from bokeh.embed import autoload_server
    scripts = [autoload_server(plot,client,public=True) for plot in plots]


    
    print "saving widget scripts..."
    
    #remove previous widgets
    for path_to_static in path_to_django_static,path_to_flask_static:
        path_to_widgets = os.path.join(path_to_static,dashboard_name)

        os.system("rm -rf " + path_to_widgets)
        os.mkdir(path_to_widgets)

        for i, source_script in enumerate(scripts):
            
            #convert script...
            script = assemble_script("widget"+str(i),source_script)
            
            with open("{}/widget{}.html".format(path_to_widgets,i),'w') as fscript:
                fscript.write(script)
    

    print "dashboard {} ready.".format(dashboard_name),