Example #1
def sample_size_chi2(y, x, pars):
    alpha = pars['alpha']
    m, n = x.shape
    x = np.squeeze(np.asarray(x))
    y = np.squeeze(np.asarray(y))
    nCl = len(np.unique(np.asarray(y)))
        
    s_size = m
    N = round(pow(m, 1/3))
    nBinsArray = [nCl]
    nBinsArray.extend([N]*n)
    p, edges = np.histogramdd((y, x), bins=nBinsArray)
    idx = list(range(m))
    
    chi_min = chi2.ppf(alpha/2, 1.5*N)
    chi_max = chi2.ppf(1-alpha/2, 1.5*N)
    while s_size > N:
        chi2d = []
        for nIter in range(0, 50):
            random.shuffle(idx)
            idx1 = idx[:s_size]
            random.shuffle(idx)
            idx2 = idx[:s_size]
            chi2d.append(chi2div(y, x, idx1, idx2, edges))
        chi2d = sum(chi2d) / len(chi2d)
        if chi2d < chi_min or chi2d > chi_max:
            s_size += 1
            break
        s_size -= 1

    return s_size
Example #2
def calc_lambduh(num_flds, side_len):
    lower_bound = .5*chi2.ppf(ALPHA/2.0,2*sum(num_flds))/len(num_flds)
    upper_bound = .5*chi2.ppf(1-ALPHA/2.0,2*sum(num_flds)+2)/len(num_flds)

    lambduh = 1.0*np.average(num_flds)/(side_len**2)

    lambduh_bound = [lower_bound/side_len**2, upper_bound/side_len**2]
    area = side_len**2
    return lambduh, lambduh_bound, area
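
A minimal usage sketch for calc_lambduh, with made-up quadrat counts; the module-level ALPHA significance level and the data values are assumptions, not taken from the original project.

# Usage sketch (hypothetical counts): estimate a Poisson intensity per unit
# area from quadrat counts, with an exact chi-square confidence interval.
import numpy as np
from scipy.stats import chi2

ALPHA = 0.05                                # assumed module-level significance level
num_flds = np.array([3, 5, 2, 4, 6])        # counts observed in each square quadrat
lam, lam_bound, area = calc_lambduh(num_flds, side_len=10.0)
print(lam, lam_bound, area)                 # rate per unit area, its bounds, quadrat area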
Example #3
    def ci_var(self, lower_bound=None, upper_bound=None, sig=.05):
        """
        Returns the confidence interval for the variance.

        Parameters
        ----------
        lower_bound : float
            The minimum value the lower confidence interval can
            take. The p-value from test_var(lower_bound) must be lower
            than 1 - significance level. Default is .99 confidence
            limit assuming normality

        upper_bound : float
            The maximum value the upper confidence interval
            can take. The p-value from test_var(upper_bound) must be lower
            than 1 - significance level.  Default is .99 confidence
            limit assuming normality

        sig : float
            The significance level. Default is .05

        Returns
        -------
        Interval : tuple
            Confidence interval for the variance

        Examples
        --------
        >>> import numpy as np
        >>> import statsmodels.api as sm
        >>> random_numbers = np.random.standard_normal(100)
        >>> el_analysis = sm.emplike.DescStat(random_numbers)
        >>> el_analysis.ci_var()
        (0.7539322567470305, 1.229998852496268)
        >>> el_analysis.ci_var(.5, 2)
        (0.7539322567469926, 1.2299988524962664)

        Notes
        -----
        If the function returns the error "f(a) and f(b) must have
        different signs", consider lowering lower_bound and raising
        upper_bound.
        """
        endog = self.endog
        if upper_bound is None:
            upper_bound = ((self.nobs - 1) * endog.var()) / \
              (chi2.ppf(.0001, self.nobs - 1))
        if lower_bound is None:
            lower_bound = ((self.nobs - 1) * endog.var()) / \
              (chi2.ppf(.9999, self.nobs - 1))
        self.r0 = chi2.ppf(1 - sig, 1)
        llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var())
        ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound)
        return llim, ulim
Example #4
def poisson_interval(k, alpha=0.05):
    """
    Uses the chi-squared distribution to compute an exact Poisson confidence
    interval for an observed count k. scipy.stats is imported inside the function.
    """
    from scipy.stats import chi2

    a = alpha
    low, high = (chi2.ppf(a / 2, 2 * k) / 2, chi2.ppf(1 - a / 2, 2 * k + 2) / 2)
    if k == 0:
        low = 0.0
    return low, high
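
A short usage sketch for poisson_interval; the observed count is purely illustrative.

# Usage sketch: approximate 95% bounds on a Poisson rate given k observed events.
low, high = poisson_interval(k=7, alpha=0.05)
print(low, high)       # roughly (2.8, 14.4); k = 0 is special-cased to a lower bound of 0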
Example #5
def chi2_distribution():
    fig, ax = plt.subplots(1, 1)
    #display the probability density function
    df = 10
    x=np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x,df))
    
    #simulate the chi2 distribution
    y = []
    n=10
    for i in range(1000):
        chi2r=0.0
        r = norm.rvs(size=n)
        for j in range(n):
            chi2r=chi2r+r[j]**2
        y.append(chi2r)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()
    
    fig, ax = plt.subplots(1, 1)
    #display the probability density function
    df = 10
    x=np.linspace(-4, 4, 100)
    ax.plot(x, t.pdf(x,df))
    
    #simulate the t-distribution
    y = []
    for i in range(1000):
        rx = norm.rvs()
        ry = chi2.rvs(df)
        rt = rx/np.sqrt(ry/df)
        y.append(rt)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()
    
    fig, ax = plt.subplots(1, 1)
    #display the probability density function
    dfn, dfm = 10, 5
    x = np.linspace(f.ppf(0.01, dfn, dfm), f.ppf(0.99, dfn, dfm), 100)
    ax.plot(x, f.pdf(x, dfn, dfm))
    
    #simulate the F-distribution
    y = []
    for i in range(1000):
        rx = chi2.rvs(dfn)
        ry = chi2.rvs(dfm)
        rf = (rx/dfn)/(ry/dfm)
        y.append(rf)

    ax.hist(y, density=True, alpha=0.2)
    plt.show()
Example #6
def get_lambda(p, definition = 'median'):
    '''
    Evaluates the lambda (genomic inflation factor) value
    :param p: distribution of p-values
    :param definition: definition of lambda
    :return: lambda estimate
    '''
    if definition == 'median':
        pm = np.median(p)
        Chi = chi2.ppf(1. - pm, 1)
        return Chi / chi2.ppf(0.5,1)
    else:
        raise Exception("Only 'median' definition of lambda is implemented at this moment.")
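
A usage sketch for get_lambda, assuming numpy and scipy.stats.chi2 are imported at module level as the snippet expects; the p-values are simulated, so lambda should land close to 1.

# Usage sketch (hypothetical data): well-calibrated p-values give lambda ~ 1.
import numpy as np
from scipy.stats import chi2

p_values = np.random.uniform(size=10000)
print(get_lambda(p_values))        # expected to be close to 1.0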
Example #7
def plot_profile_1d(data, likes, minval, maxval, xlabel='Parameter value'):
    bins, PL = profile_1d(data, likes, minval, maxval)

    pl.figure()
    pl.plot(bins[1:] - 0.5*(bins[1] - bins[0]), PL, 'k-', linewidth=2.0)
    pl.ylim([0,10])

    pl.axhline(0.5*chi2.ppf(0.683, 1), color='b', linestyle='--')
    pl.axhline(0.5*chi2.ppf(0.995, 1), color='b', linestyle='--')

    pl.text(2.7, 0.5*chi2.ppf(0.683, 1), r'$1\sigma$')
    pl.text(2.7, 0.5*chi2.ppf(0.995, 1), r'$2\sigma$')

    pl.xlabel(xlabel)
    pl.ylabel(r'$\Delta \log \mathcal{L}$')
Example #8
    def predicate(cls, tasks, user_id, cost):
        if len(tasks) < 3:
            return None, None, None, None, None

        # use only same user tasks?
        same_user_tasks = filter_user_id(tasks, user_id)
        if len(same_user_tasks) > 3:
            tasks = same_user_tasks

        # use only same cost tasks?
        same_cost_tasks = filter_cost(tasks, cost)
        if len(same_cost_tasks) > 3:
            tasks = same_cost_tasks

        # use only last N tasks
        tasks = tasks[-8:]

        sample = np.array([x['actualWorkTime'] / x['cost'] for x in tasks])
        n = sample.size
        mu = np.mean(sample)
        s2 = np.var(sample, ddof=1)

        t45 = sci_t.ppf(0.95, n - 1)
        mlow, mhigh = mu + np.array([-t45, t45]) * (np.sqrt(s2) / np.sqrt(n))

        chi45a = sci_chi2.ppf(0.95, n - 1)
        shigh = np.sqrt((n - 1) * s2 / chi45a)

        low, high = mlow - shigh, mhigh + shigh

        return (mlow + mhigh) / 2 * cost, mlow * cost, mhigh * cost, low * cost, high * cost
Example #9
def chi2_2sample_crit(alpha, df):
    """
    @param alpha significance level (upper-tail probability)
    @param df degrees of freedom
    """
    crit = chi2.ppf(1.0-alpha, df)
    return crit
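
A usage sketch for chi2_2sample_crit; the observed statistic below is hypothetical.

# Usage sketch: reject the null hypothesis when the observed statistic exceeds
# the upper-tail critical value (alpha is the tail probability).
from scipy.stats import chi2

observed_stat = 21.7                        # hypothetical test statistic
crit = chi2_2sample_crit(alpha=0.05, df=9)  # roughly 16.9
print(observed_stat > crit)                 # True -> reject at the 5% level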
Example #10
    def _quantile_notTruncated(self, q, tol=1.e-6):
        """
        Compute the quantile for the non-truncated distribution

        Parameters
        ----------
        q : float
            quantile you want to compute. Between 0 and 1

        tol : float
            precision for the output

        Returns
        -------
        x : float
            x such that P(X < x) = q

        """
        scale = self._scale
        k = self._k
        dps = self._dps
        
        z_approx = scale * chi2.ppf(q, k)
        
        epsilon = scale * 0.001
        lb = z_approx - epsilon
        ub = z_approx + epsilon

        f = lambda z: self._cdf_notTruncated(-np.inf, z, dps)

        z = find_root(f, q, lb, ub, tol)

        return z 
Example #11
def plot_cov_ellipse(cov, pos, volume=.5, ax=None, fc='none', ec=[0,0,0], a=1, lw=2):
    
    """ Helper Method: draw ellipse of gaussian mixture"""
    
    import numpy as np
    from scipy.stats import chi2
    import matplotlib.pyplot as plt
    from matplotlib.patches import Ellipse

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:,order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))

    kwrg = {'facecolor':fc, 'edgecolor':ec, 'alpha':a, 'linewidth':lw}

    # Width and height are "full" widths, not radius
    width, height = 2 * np.sqrt(chi2.ppf(volume,2)) * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwrg)

    ax.add_artist(ellip)
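
A usage sketch for plot_cov_ellipse with simulated, correlated Gaussian data; the covariance values are made up for illustration.

# Usage sketch (hypothetical data): scatter plus the 95% covariance ellipse.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
samples = rng.multivariate_normal([0, 0], [[2.0, 1.2], [1.2, 1.0]], size=500)
cov = np.cov(samples, rowvar=False)

plt.scatter(samples[:, 0], samples[:, 1], s=5)
plot_cov_ellipse(cov, pos=samples.mean(axis=0), volume=0.95, ec=[1, 0, 0])
plt.show()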
Example #12
    def GlobalTest(self, T, n):
        # critical value at the 5th percentile of chi2 with n degrees of freedom
        Tcrit = chi2.ppf(0.05, n)
        return T > Tcrit
Example #13
def generate_emissions(theta,data):
  def _eigsorted(cov):
    vals,vecs = np.linalg.eigh(cov)
    order = vals.argsort()[::-1]
    return vals[order],vecs[:,order]
  xmin = np.inf; xmax = -np.inf
  ymin = np.inf; ymax = -np.inf
  fig = plt.figure()
  ax = fig.add_subplot(111)
  for i in range(len(data)):
    ax.plot(data[i][:,0],data[i][:,1],'k.')
    xmin = np.min([xmin,np.min(data[i][:,0])])
    xmax = np.max([xmax,np.max(data[i][:,0])])
    ymin = np.min([ymin,np.min(data[i][:,1])])
    ymax = np.max([ymax,np.max(data[i][:,1])])

  vol = [0.25,0.5,0.75,0.95, 0.99]

  ell = []
  for i in range(len(theta)):
    pos = theta[i].mean
    vals,vecs = _eigsorted(theta[i].var)
    th = np.degrees(np.arctan2(*vecs[:,0][::-1]))
    for v in vol:
      width,height = 2.0*np.sqrt(chi2.ppf(v,2))*np.sqrt(vals)
      ell.append(Ellipse(xy=pos,width=width,height=height,angle=th))
  for i,e in enumerate(ell):
    ax.add_artist(e)
    e.set_facecolor(my_color_map(i))
    e.set_alpha(0.5)

  ax.set_xlim(xmin, xmax)
  ax.set_ylim(ymin, ymax)
  plt.show()
Example #14
def ci_cmle_is(X, v, theta_grid, alpha_level, T = 100, verbose = False):
    cmle_is = np.empty_like(theta_grid)
    r = X.sum(1)
    c = X.sum(0)
    for l, theta_l in enumerate(theta_grid):
        logit_P_l = theta_l * v
        w_l = np.exp(logit_P_l)

        z = cond_a_sample(r, c, w_l, T)
        logf = np.empty(T)
        for t in range(T):
            logQ, logP = z[t][1], z[t][2]
            logf[t] = logP - logQ
        logkappa = -np.log(T) + logsumexp(logf)

        if verbose:
            logcvsq = -np.log(T - 1) - 2 * logkappa + \
              logsumexp(2 * logabsdiffexp(logf, logkappa))
            print('est. cv^2 = %.2f (T = %d)' % (np.exp(logcvsq), T))

        cmle_is[l] = np.sum(np.log(w_l[X])) - logkappa

    crit = -0.5 * chi2.ppf(1 - alpha_level, 1)
    ci = invert_test(theta_grid, cmle_is - cmle_is.max(), crit)
    if params['plot']:
        plot_statistics(ax_cmle_is, theta_grid, cmle_is - cmle_is.max(), crit)
        cmle_is_coverage_data['cis'].append(ci)
        cmle_is_coverage_data['theta_grid'] = theta_grid
        cmle_is_coverage_data['crit'] = crit
    return ci
Example #15
 def __init__(self,image):
     self.Mlist = ('threshold', 'MRF', 'Mathematical Morphology', 'gray MM')
     self.method = self.Mlist[0]
     
     self.img = image
     
     self.nvar = 3
     print('edit')
     self.param1 = chi2.ppf(0.995, self.nvar)
     img_init = self.img > self.param1
     
     self.param2 = 2
     
     self.fig, ax = plt.subplots(num=20, figsize=(30, 20))
     self.im1 = ax.imshow(img_init, cmap = plt.cm.Greys_r)
     plt.subplots_adjust(left=0.3)
     
     axcolor = 'lightgoldenrodyellow'
     rax = plt.axes([0.05, 0.7, 0.15, 0.15], axisbg=axcolor)
     self.radio = RadioButtons(rax, self.Mlist)
     
     axS1 = self.fig.add_axes([0.25, 0.1, 0.65, 0.03], axisbg=axcolor)
     self.slider1 = Slider(axS1, 'Threshold', 1, 5000, valinit=self.param1)
         
     axS2 = self.fig.add_axes([0.25, 0.05, 0.65, 0.03], axisbg=axcolor)
     self.slider2 = Slider(axS2, 'Smoothness', 1, 11, valinit=self.param2)
Example #16
def plot_cov_ellipse(cov, pos, volume=.5, ax=None, fc='none', ec=[1, 0, 0], a=1, lw=2):
    """
    Plots an ellipse enclosing *volume* based on the specified covariance
    matrix (*cov*) and location (*pos*). Additional keyword arguments are passed on to the
    ellipse patch artist.

    Parameters
    ----------
        cov : The 2x2 covariance matrix to base the ellipse on
        pos : The location of the center of the ellipse. Expects a 2-element
            sequence of [x0, y0].
        volume : The volume inside the ellipse; defaults to 0.5
        ax : The axis that the ellipse will be plotted on. Defaults to the
            current axis.
    """

    def eigsorted(cov):
        vals, vecs = np.linalg.eigh(cov)
        order = vals.argsort()[::-1]
        return vals[order], vecs[:, order]

    if ax is None:
        ax = plt.gca()

    vals, vecs = eigsorted(cov)
    theta = np.degrees(np.arctan2(*vecs[:, 0][::-1]))

    kwrg = {'facecolor': fc, 'edgecolor': ec, 'alpha': a, 'linewidth': lw}

    # Width and height are "full" widths, not radius
    width, height = 2 * np.sqrt(chi2.ppf(volume,2)) * np.sqrt(vals)
    ellip = Ellipse(xy=pos, width=width, height=height, angle=theta, **kwrg)

    ax.add_artist(ellip)
Example #17
File: sk.py  Project: ai-se/x-effort
def minChi(parts, all):
  pi=math.pi
  def meanBlom(tests):
    return sum([test.blom for test in tests])/len(tests)
  def SSE(tests):
    return sum([test.blom**2 for test in tests])
  for part in parts:
    computeBlom(part, all)
  k = parts[0].n
  cut, left, right = None, None, None
  bestError = 0
  if len(parts) == 1:
    return cut, left, right
  totalError = meanBlom(parts)
  for i in range(1,len(parts)):
      lParts, rParts = parts[:i], parts[i:]
      error = k*(len(lParts)*((meanBlom(lParts) - totalError)**2) + len(rParts)*((meanBlom(rParts) - totalError)**2))
      if (error > bestError):
        bestError, cut, left, right = error, i, lParts, rParts
  v = k/(pi-2)
  lamda = (pi/(2*(pi-2)))*bestError/(SSE(parts)/v)
  chi = chi2.ppf(0.99, v)
  if lamda > chi:
    return cut, left, right
  return None, None, None
Example #18
def cov_ellipses(x, y, cov_mat=None, cov_tri=None, q=None, nsig=None, **kwargs):
    """Draw covariance error ellipses.

    Parameters
    ----------
    x, y : array (n,)
        Center of covariance ellipses.
    cov_mat : array (n, 2, 2), optional
        Covariance matrix.
    cov_tri : list of array (n,), optional
        Covariance matrix in flat form of (xvar, yvar, xycov).
    q : scalar or array
        Wanted (quantile) probability enclosed in error ellipse.
    nsig : scalar or array
        Probability in unit of standard error. Eg. `nsig = 1` means `q = 0.683`.
    kwargs :
        `ellipses` properties.
        Eg. c, vmin, vmax, alpha, edgecolor(ec), facecolor(fc), 
        linewidth(lw), linestyle(ls), norm, cmap, transform, etc.

    Reference
    ---------
    [1]: http://www.visiondummy.com/2014/04/draw-error-ellipse-representing-covariance-matrix
    [2]: http://stackoverflow.com/questions/12301071/multidimensional-confidence-intervals
    """
    from scipy.stats import norm, chi2

    if cov_mat is not None:
        cov_mat = np.asarray(cov_mat)
    elif cov_tri is not None:
        assert len(cov_tri) == 3
        cov_mat = np.array([[cov_tri[0], cov_tri[2]],
                            [cov_tri[2], cov_tri[1]]])
        cov_mat = cov_mat.transpose(list(range(2, cov_mat.ndim)) + list(range(2)))
        # Roll the first two dimensions (2, 2) to end.
    else:
        raise ValueError('One of `cov_mat` and `cov_tri` should be specified.')

    x, y = np.asarray(x), np.asarray(y)
    if not (cov_mat.shape[:-2] == x.shape == y.shape):
        raise ValueError('The shape of x, y and covariance are incompatible.')
    if not (cov_mat.shape[-2:] == (2, 2)):
        raise ValueError('Invalid covariance matrix shape.')

    if q is not None:
        q = np.asarray(q)
    elif nsig is not None:
        q = 2 * norm.cdf(nsig) - 1
    else:
        raise ValueError('One of `q` and `nsig` should be specified.')
    rho = chi2.ppf(q, 2)
    rho = rho.reshape(rho.shape + (1,) * x.ndim)  # raise dimensions

    val, vec = np.linalg.eigh(cov_mat)
    w = 2 * np.sqrt(val[..., 0] * rho)
    h = 2 * np.sqrt(val[..., 1] * rho)
    rot = np.degrees(np.arctan2(vec[..., 1, 0], vec[..., 0, 0]))

    return ellipses(x, y, w, h, rot=rot, **kwargs)
    """cov_cross
Example #19
def chi2_distribution():
    fig,ax = plt.subplots(1, 1)
    df = 10
    x=np.linspace(chi2.ppf(0.01, df),chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x,df))

    y = []
    n=10
    for i in range(1000):
        chi2r = 0.0
        r = norm.rvs(size = 10)
        for j in range(10):
            chi2r = chi2r + r[j]**2
        y.append(chi2r)
    ax.hist(y, density=True, alpha=0.2)
    plt.show()
Example #20
    def __init__(self, dataRange=None, delta=0.1, tau=0.1, vmin=None, spmin=None, \
                uniform=False, fullcovs=True, regularize=0):
        # configuration params
        self.dimension = dataRange.size
        self.vmin = vmin if vmin is not None else 2 * self.dimension
        self.spmin = spmin if spmin is not None else self.dimension + 1
        self.delta = delta
        self.tau = tau
        self.SIGMA = (self.delta * dataRange)**2
        self.maxDist = chi2.ppf(1 - self.tau, self.dimension)
        self.uniform = uniform
        self.fullcovs = fullcovs 
        self.regVal = regularize
        # components params
        self.priors = []
        self.means = []
        self.covs = []
        self.sps = []
        self.vs = []
        self.nc = 0
        # components outputs
        self.loglikes = []
        self.posts = []

        # Mahalanobis distance
        self.mahalaD = []
        
        # model likelihood
        self.dataLikelihood = 0
Example #21
    def getErrorEllipse(self, par1, par2, confLevel=0.6827):
        """
        Returns a, b, tan(2 theta) of confLevel error ellipse 
        in par1-par2-plane with:
        
        a: large half axis
        b: small half axis
        tan(2 theta): tilt angle, has to be divided by the aspect
                      ratio of the actual plot before taking arctan
        
        Formulae taken from arXiv:0906.4123
        """
        
        sigma1, sigma2 = self.getSigma(par1), self.getSigma(par2)
        cov = self.getCovariance(par1, par2)
        
        #for this we need sigma1 > sigma2, otherwise just swap parameters
        if sigma1 > sigma2:
          a_sq = (sigma1**2 + sigma2**2)/2. + np.sqrt((sigma1**2 - sigma2**2)**2/4. + cov**2)
          b_sq = (sigma1**2 + sigma2**2)/2. - np.sqrt((sigma1**2 - sigma2**2)**2/4. + cov**2)
        else:
          a_sq = (sigma2**2 + sigma1**2)/2. - np.sqrt((sigma2**2 - sigma1**2)**2/4. + cov**2)
          b_sq = (sigma2**2 + sigma1**2)/2. + np.sqrt((sigma2**2 - sigma1**2)**2/4. + cov**2)

        #Note: this has weird dimensions (actual size of the plot)!
        tan_2_th = 2.*cov / (sigma1**2 - sigma2**2)
        
        # we are dealing with a 2D error ellipse here
        scaling = np.sqrt(chi2.ppf(confLevel, 2))
        
        return scaling*np.sqrt(a_sq), scaling*np.sqrt(b_sq), tan_2_th
Example #22
def sampling_distribution():
    fig, ax = plt.subplots(1, 1)
    #display the probability density function
    df = 10
    x=np.linspace(chi2.ppf(0.01, df), chi2.ppf(0.99, df), 100)
    ax.plot(x, chi2.pdf(x, df))

    #simulate the sampling distribution
    y = []
    for i in range(1000):
        r = norm.rvs(loc=5, scale=2, size=df+1)
        rchi2 = df*np.var(r, ddof=1)/4    # (n-1)*s^2/sigma^2 with sigma^2 = 4
        y.append(rchi2)

    ax.hist(y, density=True, alpha=0.2)
    plt.savefig('sampling_distribution.png')
Example #23
    def stats(self,SigmaR,B_Col):
        #Chi Squared test
        print('Our Null Hypothesis states that the variance of our population = sample variance')

        #Variance of our radius from measured points
        observed = SigmaR

        #Expected variance (3mm per x,y,z observation)
        expected = .003**2+.003**2+.003**2

        #Calculation of degrees of freedom
        dof = B_Col-1

        #Calculation of test statistics
        teststatx = B_Col*((observed - expected)**2/expected)
        teststatx1 = dof*(observed/expected)

        #User is prompted to input desired significance level
        significance = float(input('Please specify the significance level: '))

        print(teststatx)
        print(teststatx1)

        #Using built in scipy.stats.chi2 function instead of looking up values on a table
        mean, var, skew, kurt = chi2.stats(dof, moments='mvsk')
        Chi = chi2.ppf((1-significance),dof)

        #If our sampled variance is greater than the population variance at the chosen significance level then we reject the null hypothesis at that significance level
        if teststatx > Chi:
            print('We reject the null hypothesis at the', significance, 'significance level')

        else:
            print('We fail to reject the null hypothesis at the', significance, 'significance level')

        print(teststatx, dof)
Example #24
def getMahalanobisRobust(dat, critical_alpha = 0.01, good_rows = np.zeros(0)):

    '''Calculate the Mahalanobis distance from the sample vector.'''
    
    
    if good_rows.size == 0:
        good_rows = np.any(~np.isnan(dat), axis=1);
    
    #import pdb
    #pdb.set_trace()

    try:

        robust_cov = MinCovDet().fit(dat[good_rows])
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        #this step will fail if the covariance matrix is singular. This happens if the data is not
        #a unimodal symmetric distribution, for example when there are too many small noisy particles.
        #Therefore I take the safe option and return zeros for the Mahalanobis distance in this case.
        mahalanobis_dist = np.zeros(dat.shape[0])

    #critical distance for the Mahalanobis distance using the chi-square distribution
    #https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    #http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = chi2.ppf(1-critical_alpha, dat.shape[1])
    outliers = mahalanobis_dist>maha_lim
    
    return mahalanobis_dist, outliers, maha_lim
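
A usage sketch for getMahalanobisRobust with synthetic data; it assumes numpy, scipy.stats.chi2 and scikit-learn's MinCovDet are importable, as the snippet itself does.

# Usage sketch (hypothetical data): flag rows whose robust Mahalanobis distance
# exceeds the chi-square cutoff.
import numpy as np

rng = np.random.default_rng(1)
dat = rng.normal(size=(200, 3))
dat[:5] += 10.0                             # inject a few obvious outliers

dist, outliers, lim = getMahalanobisRobust(dat, critical_alpha=0.01)
print(outliers.sum(), "rows flagged beyond the cutoff", lim)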
Example #25
def param_intervals_1d(data, likes, paramstr='Parameter value'):  
    #Note that 'likes' is actually -2*lnL  
    cuts = data[np.where(0.5*(likes - np.min(likes)) < 0.5*chi2.ppf(0.683, 1))]
    plus = np.max(cuts) - data[np.argmin(likes)]
    minus = data[np.argmin(likes)] - np.min(cuts)
     
    #Print best fit...
    print "   " + paramstr + "    = (", '{:.3f}'.format(data[np.argmin(likes)]), " + ", '{:.3f}'.format(plus), " - ", '{:.3f}'.format(minus), ")"
Example #26
def cutting_point(k, alpha=0.5):
    """
    calculate cutting points for k categories of Gamma(alpha, beta=alpha)
    output: an array of k-1 cutting values, excluding 0 and inf
    """
    a = np.array(range(1, k), dtype=float) / k
    res = chi2.ppf(a, 2*alpha) / (2*alpha)
    return res
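
A usage sketch for cutting_point; the values of k and alpha are illustrative.

# Usage sketch: the k-1 interior boundaries that split a Gamma(alpha, rate=alpha)
# distribution into k equally probable categories.
from scipy.stats import chi2

print(cutting_point(k=4, alpha=0.5))        # three boundaries for four categories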
Example #27
def ci_cmle_a(X, v, theta_grid, alpha_level):
    cmle_a = np.empty_like(theta_grid)
    for l, theta_l in enumerate(theta_grid):
        logit_P_l = theta_l * v
        cmle_a[l] = -cond_a_nll(X, np.exp(logit_P_l))

    return invert_test(theta_grid, cmle_a - cmle_a.max(),
                       -0.5 * chi2.ppf(1 - alpha_level, 1))
Example #28
def calc_ncp(alpha, beta, df):
    x = chi2.ppf(1 - alpha, df)

    def to_minimize(ncp):
        return math.fabs(beta - ncx2.cdf(x, df, ncp))

    res = minimize_scalar(to_minimize, method="golden")
    return res.x
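
A usage sketch for calc_ncp; it assumes the imports the snippet relies on (math, scipy.stats.chi2/ncx2, scipy.optimize.minimize_scalar), and the alpha/beta values are illustrative.

# Usage sketch: non-centrality parameter at which a 1-df chi-square test with
# alpha = 0.05 leaves beta = 0.2 below the critical value (about 80% power).
import math
from scipy.stats import chi2, ncx2
from scipy.optimize import minimize_scalar

print(calc_ncp(alpha=0.05, beta=0.2, df=1))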
Example #29
def scale_errors(cov_axes, confidence_level=0.95):
    """
    Returns major axes of error ellipse or
    hyperbola, rescaled using chi2 test statistic
    """
    dof = len(cov_axes)
    x2t = chi2.ppf(confidence_level,dof)
    return N.sqrt(x2t*cov_axes)
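
A usage sketch for scale_errors; the snippet assumes numpy imported as N and chi2 from scipy.stats, and the axis variances below are made up.

# Usage sketch: rescale per-axis variances of an error ellipse to the 95% region.
import numpy as N
from scipy.stats import chi2

cov_axes = N.array([0.4, 0.1])              # hypothetical variances along the ellipse axes
print(scale_errors(cov_axes, confidence_level=0.95))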
Example #30
        for rep in range(replications):
            start = time.time()
            marginals_rand = np.random.multinomial(N, true_marginals.flatten(), size=1) / N
            marginals_rand = np.reshape(marginals_rand, [2, 2])
            while not (marginals_rand[1, 1] * marginals_rand[0, 1]):
                marginals_rand = np.random.multinomial(N, true_marginals.flatten(), size=1) / N
                marginals_rand = np.reshape(marginals_rand, [2, 2])
            marginals_rand = np.reshape(marginals_rand, [2, 2])

            data, data_label, data_sensitive = upload_data(ds=ds, n_samples=N, marginals=marginals_rand)
            data_tuple = [data, data_sensitive, data_label]

            dist = calculate_distance_eqopp(data_tuple, np.array([0, 1]), 0.5)
            s_hat.append(dist)
            asympt = limiting_dist_EQOPP(data_tuple, np.array([0, 1]), 0.5)
            threshold = asympt * chi2.ppf(.9, 1)
            cnt = cnt + (dist>threshold)
            average_0 = average_0 + asympt / replications
            average_1 = average_1 + dist / replications
      
        print(cnt)
        print(average_0,average_1)
        '''
        plt.figure()
        counts__, bins__, _ = plt.hist(s_hat,
                             density=True,
                             bins = np.linspace(0,3,30),
                             range = [0,3],
                             alpha=0.2,
                             edgecolor='black',
                             linewidth=1.3,
Example #31
    y_strap = Bootstrap[i]['y']
    # Set up X = (1, X), add a dimension to y, and prepare our theta
    X_strap = np.c_[np.ones((X_strap.shape[0], 1)), X_strap]
    y_strap = y_strap[:, np.newaxis]
    theta_strap = np.zeros((X_strap.shape[1], 1))
    resultat_general.append(algorithme(X_strap, y_strap, theta_strap))

std_theta = pd.DataFrame(resultat_general).std()

print('the variance is:', std_theta * std_theta)
#%%
# Wald test
indice_bon = []
for i in range(1, len(theta)):
    T = resultats[i] * resultats[i] / (std_theta[i] * std_theta[i])
    if T >= chi2.ppf(0.95, 1):
        indice_bon.append(X_col_names[i - 1])
#%%
X = data[indice_bon].astype(int)
y = data['y'].astype(int)
#%%
# Set up X = (1, X), add a dimension to y, and prepare theta
X = np.c_[np.ones((X.shape[0], 1)), X]
y = y[:, np.newaxis]
theta = np.zeros((X.shape[1], 1))
# Split the data into 2 samples
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
#%%
Example #32
    def ci_mean(self,
                sig=.05,
                method='gamma',
                epsilon=10**-8,
                gamma_low=-10**10,
                gamma_high=10**10):
        """
        Returns the confidence interval for the mean.

        Parameters
        ----------
        sig : float
            significance level. Default is .05

        method : str
            Root finding method,  Can be 'nested-brent' or
            'gamma'.  Default is 'gamma'

            'gamma' Tries to solve for the gamma parameter in the
            Lagrange (see Owen pg 22) and then determine the weights.

            'nested-brent' uses Brent's method to find the confidence
            intervals but must maximize the likelihood ratio on every
            iteration.

            gamma is generally much faster.  If the optimization does not
            converge, try expanding the gamma_high and gamma_low
            variables.

        gamma_low : float
            Lower bound for gamma when finding lower limit.
            If function returns f(a) and f(b) must have different signs,
            consider lowering gamma_low.

        gamma_high : float
            Upper bound for gamma when finding upper limit.
            If function returns f(a) and f(b) must have different signs,
            consider raising gamma_high.

        epsilon : float
            When using 'nested-brent', amount to decrease (increase)
            from the maximum (minimum) of the data when
            starting the search.  This is to protect against the
            likelihood ratio being zero at the maximum (minimum)
            value of the data.  If data is very small in absolute value
            (<10 ``**`` -6) consider shrinking epsilon

            When using 'gamma', amount to decrease (increase) the
            minimum (maximum) by to start the search for gamma.
            If the function returns the error "f(a) and f(b) must have
            different signs", consider lowering epsilon.

        Returns
        -------
        Interval : tuple
            Confidence interval for the mean
        """
        endog = self.endog
        sig = 1 - sig
        if method == 'nested-brent':
            self.r0 = chi2.ppf(sig, 1)
            middle = np.mean(endog)
            epsilon_u = (max(endog) - np.mean(endog)) * epsilon
            epsilon_l = (np.mean(endog) - min(endog)) * epsilon
            ulim = optimize.brentq(self._ci_limits_mu, middle,
                                   max(endog) - epsilon_u)
            llim = optimize.brentq(self._ci_limits_mu, middle,
                                   min(endog) + epsilon_l)
            return llim, ulim

        if method == 'gamma':
            self.r0 = chi2.ppf(sig, 1)
            gamma_star_l = optimize.brentq(self._find_gamma, gamma_low,
                                           min(endog) - epsilon)
            gamma_star_u = optimize.brentq(self._find_gamma, \
                         max(endog) + epsilon, gamma_high)
            weights_low = ((endog - gamma_star_l) ** -1) / \
                np.sum((endog - gamma_star_l) ** -1)
            weights_high = ((endog - gamma_star_u) ** -1) / \
                np.sum((endog - gamma_star_u) ** -1)
            mu_low = np.sum(weights_low * endog)
            mu_high = np.sum(weights_high * endog)
            return mu_low, mu_high
Example #33
def cluster_datasets(truncated_datasets,
                     residues,
                     out_dir,
                     f,
                     phi,
                     ):
    print("\tClustering {} datasets".format(len(truncated_datasets)))

    reference_dataset = get_reference_dataset(truncated_datasets)

    alignments = get_alignments(residues,
                                )

    aligned_maps, distances = align_maps(reference_dataset,
                                         truncated_datasets,
                                         alignments,
                                         f,
                                         phi,
                                         )
    print("\tRange of distances is {} {}".format(min(distances),
                                                 max(distances),
                                                 ))

    data = np.vstack([aligned_map.flatten() for aligned_map in aligned_maps])
    models = {}
    try:
        for i in range(1, 10):
            print(i)
            model = GaussianMixture(n_components=i, covariance_type="diag", verbose=2)
            model.fit(data)
            models[i] = model

    except Exception as e:
        print(e)
        print("Model became undefined!")

    bics = {model_num: model.bic(data) for model_num, model in models.items()}
    print(bics)
    model = min(list(bics.items()), key=lambda x: x[1])
    print("Best model is {}".format(model))
    model = models[model[0]]
    classes = model.predict(data)

    # outlier_distance = sample_outlier_distance(model)
    outliers = {}
    clusters = []
    print(model.means_.shape)
    print(classes)
    for i in range(model.means_.shape[0]):
        print("\tProcessing component: {}".format(i))
        means = model.means_[i, :].flatten()
        precs = np.diag(model.precisions_[i, :].flatten())

        outlier_distance = np.sqrt(chi2.ppf(0.95, means.size))
        print("Outlier distance: {}".format(outlier_distance))
        cluster_maps = {dtag: aligned_maps[j]
                        for j, dtag
                        in enumerate(list(residues.keys()))
                        if classes[j] == i
                        }


        cluster_outliers = map_list(lambda x: classify_distance(x, outlier_distance, means, precs),
                            cluster_maps.values(),
                            )
        for j, dtag in enumerate(list(cluster_maps.keys())):
            if cluster_outliers[j] == 1:
                outliers[dtag] = 1
            else:
                outliers[dtag] = 0

        inliers = {dtag: aligned_maps[i] for i, dtag in enumerate(list(cluster_maps.keys())) if outliers[dtag] == 0}
        clusters.append(inliers)


    # outliers = []
    # for xmap in aligned_maps:
    #     distance = gaussian_distance(xmap, model)
    #     # distance = probability_distance(xmap.reshape(1,-1), model)
    #     print(distance)
    #     if distance < outlier_distance:
    #         outliers.append(1)
    #     else:
    #         outliers.append(0)

    individual_outliers = [{dtag: aligned_maps[i]} for i, dtag in enumerate(list(residues.keys())) if outliers[dtag] == 1]

    return clusters + individual_outliers
Example #34
    def _likelihood_ratio_confint(self, alpha: float) -> List[float]:
        """Compute the likelihood ratio confidence interval for the MLE of the previous run.

        Args:
            alpha: Specifies the (1 - alpha) confidence level (0 < alpha < 1).

        Returns:
            The likelihood ratio confidence interval.
        """
        # Compute the two intervals in which we look for values above
        # the likelihood ratio: the two bubbles next to the QAE estimate
        M = self._M  # pylint: disable=invalid-name
        qae = self._ret['value']

        y = int(np.round(M * np.arcsin(np.sqrt(qae)) / np.pi))
        if y == 0:
            right_of_qae = np.sin(np.pi * (y + 1) / M)**2
            bubbles = [qae, right_of_qae]

        elif y == int(M / 2):  # remember, M = 2^m is a power of 2
            left_of_qae = np.sin(np.pi * (y - 1) / M)**2
            bubbles = [left_of_qae, qae]

        else:
            left_of_qae = np.sin(np.pi * (y - 1) / M)**2
            right_of_qae = np.sin(np.pi * (y + 1) / M)**2
            bubbles = [left_of_qae, qae, right_of_qae]

        # likelihood function
        a_i = np.asarray(self._ret['values'])
        p_i = np.asarray(self._ret['probabilities'])
        m = self._m
        shots = self._ret['shots']

        def loglikelihood(a):
            return np.sum(shots * p_i * np.log(pdf_a(a_i, a, m)))

        # The threshold above which the likelihoods are in the
        # confidence interval
        loglik_mle = loglikelihood(self._ret['ml_value'])
        thres = loglik_mle - chi2.ppf(1 - alpha, df=1) / 2

        def cut(x):
            return loglikelihood(x) - thres

        # Store the boundaries of the confidence interval
        # It's valid to start off with the zero-width confidence interval, since the maximum
        # of the likelihood function is guaranteed to be over the threshold, and if alpha = 0
        # that's the valid interval
        lower = upper = self._ret['ml_value']

        # Check the two intervals/bubbles: check if they surpass the
        # threshold and if yes add the part that does to the CI
        for a, b in zip(bubbles[:-1], bubbles[1:]):
            # Compute local maximum and perform a bisect search between
            # the local maximum and the bubble boundaries
            locmax, val = bisect_max(loglikelihood, a, b, retval=True)
            if val >= thres:
                # Bisect pre-condition is that the function has different
                # signs at the boundaries of the interval we search in
                if cut(a) * cut(locmax) < 0:
                    left = bisect(cut, a, locmax)
                    lower = np.minimum(lower, left)
                if cut(locmax) * cut(b) < 0:
                    right = bisect(cut, locmax, b)
                    upper = np.maximum(upper, right)

        # Put together CI
        confint = [lower, upper]
        return [self.post_processing(bound) for bound in confint]
Example #35
    def perform_chi2_test(self,
                          v_in_out=None,
                          delta_in_out=None,
                          calculate_voltage_angles=True,
                          chi2_prob_false=0.05):
        """
        The function perform_chi2_test performs a Chi^2 test for bad data and topology error
        detection. The function can be called with the optional input arguments v_in_out and
        delta_in_out. Then, the Chi^2 test is performed after calling the function estimate using
        them as input arguments. It can also be called without these arguments if it is called
        from the same object with which estimate had been called beforehand. Then, the Chi^2 test is
        performed for the states estimated by the function estimate and the result, the existence of bad data,
        is given back as a boolean. As an optional argument, the probability
        of a false measurement can be provided additionally. For bad data detection, the function
        perform_rn_max_test is more powerful and should be the function of choice. For topology
        error detection, however, perform_chi2_test should be used.

        INPUT:
            **v_in_out** (np.array, shape=(1,), optional) - Vector with initial values for all
            voltage magnitudes in p.u. (sorted by bus index)

            **delta_in_out** (np.array, shape=(1,), optional) - Vector with initial values for all
            voltage angles in degrees (sorted by bus index)

        OPTIONAL:
            **calculate_voltage_angles** - (boolean) - Take into account absolute voltage angles and phase
            shifts in transformers, if init is 'slack'. Default is True

            **chi2_prob_false** (float) - probability of error / false alarms (standard value: 0.05)

        OUTPUT:
            **successful** (boolean) - True if bad data has been detected

        EXAMPLE:
            perform_chi2_test(np.array([1.0, 1.0, 1.0]), np.array([0.0, 0.0, 0.0]), 0.97)

        """
        # perform SE
        self.estimate(v_in_out, delta_in_out, calculate_voltage_angles)

        # Performance index J(hx)
        J = np.dot(self.solver.r.T, np.dot(self.solver.R_inv, self.solver.r))

        # Number of measurements
        m = len(self.net.measurement)

        # Number of state variables (the -1 is due to the reference bus)
        n = len(self.solver.eppci.v) + len(self.solver.eppci.delta) - 1

        # Chi^2 test threshold
        test_thresh = chi2.ppf(1 - chi2_prob_false, m - n)

        # Print results
        self.logger.debug("Result of Chi^2 test:")
        self.logger.debug("Number of measurements: %d" % m)
        self.logger.debug("Number of state variables: %d" % n)
        self.logger.debug("Performance index: %.2f" % J)
        self.logger.debug("Chi^2 test threshold: %.2f" % test_thresh)

        if J <= test_thresh:
            self.bad_data_present = False
            self.logger.debug(
                "Chi^2 test passed. No bad data or topology error detected.")
        else:
            self.bad_data_present = True
            self.logger.debug(
                "Chi^2 test failed. Bad data or topology error detected.")

        if self.solver.successful:
            return self.bad_data_present
Example #36
def poisson_interval(n,alpha=0.05):
    a=alpha
    low,high = (chi2.ppf(a/2,2*n)/2, chi2.ppf(1-a/2,2*n+2)/2)
    if n==0:
        low=0.0
        pass
    return low,high