Example #1
from numpy import log, mean, sum

def boxcox_llf(lmb, data):
    """The Box-Cox log-likelihood function evaluated at lmb for positive data."""
    N = len(data)
    y = boxcox(data, lmb)              # Box-Cox transform (see Example #2)
    my = mean(y)
    # profile log-likelihood: (lmb - 1)*sum(log(x)) - (N/2)*log(mean squared deviation of y)
    f = (lmb - 1) * sum(log(data))
    f -= N / 2.0 * log(sum((y - my)**2.0 / N))
    return f
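To make the role of boxcox_llf concrete, here is a minimal usage sketch (an addition, not part of the original example). It assumes boxcox_llf above and the boxcox transform from Example #2 are both in scope, generates a skewed positive sample, and scans a grid of lambda values for the one with the highest log-likelihood.

import numpy as np

rng = np.random.default_rng(0)
data = rng.lognormal(mean=0.0, sigma=0.8, size=200)   # positive, right-skewed sample

# scan candidate lambdas; lmb == 0 triggers a harmless NumPy warning inside
# boxcox, whose where() branch returns log(x) there
lambdas = np.linspace(-2.0, 2.0, 41)
llf = [boxcox_llf(lmb, data) for lmb in lambdas]
best = lambdas[int(np.argmax(llf))]
print("lambda maximizing the log-likelihood (grid search):", best)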
Example #2
from numpy import any, asarray, log, where
from scipy import optimize

def boxcox(x, lmbda=None, alpha=None):
    """Return a positive dataset transformed by a Box-Cox power transformation.

    If lmbda is not None, do the transformation for that value.

    If lmbda is None, find the lambda that maximizes the log-likelihood
    function and return it as the second output argument.

    If alpha is not None, return the 100(1-alpha)% confidence interval for
    lambda as the third output argument.
    """
    x = asarray(x)
    if any(x <= 0):
        raise ValueError("Data must be positive.")
    if lmbda is not None:  # single transformation for a given lambda
        lmbda = lmbda * (x == x)   # broadcast lambda to the shape of x
        y = where(lmbda == 0, log(x), (x**lmbda - 1) / lmbda)
        return y
    # Otherwise find the lambda that maximizes the log-likelihood function.
    def tempfunc(lmb, data):  # negated log-likelihood (boxcox_llf, Example #1)
        return -boxcox_llf(lmb, data)
    lmax = optimize.brent(tempfunc, brack=(-2.0, 2.0), args=(x,))
    y = boxcox(x, lmax)
    if alpha is None:
        return y, lmax
    # Otherwise also return the confidence interval for lambda;
    # _boxcox_conf_interval is a module-level helper not shown in these examples.
    interval = _boxcox_conf_interval(x, lmax, alpha)
    return y, lmax, interval
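A short usage sketch for the two calling modes (again an addition, not part of the original code). It assumes boxcox above and boxcox_llf from Example #1 are in scope; the alpha form is skipped because _boxcox_conf_interval is not shown in these examples.

import numpy as np

rng = np.random.default_rng(1)
x = rng.exponential(scale=2.0, size=500)   # positive, skewed data

y_fixed = boxcox(x, lmbda=0.5)             # transform with a chosen lambda
y_mle, lmax = boxcox(x)                    # lambda estimated by maximizing boxcox_llf
print("estimated lambda:", round(float(lmax), 3))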
Example #3
from numpy import log, sum, var, zeros
from scipy.stats import distributions

def bartlett(*args):
    """Perform Bartlett's test with the null hypothesis that all input samples
    have equal variances.

    Inputs are sample vectors:  bartlett(x, y, z, ...)

    Outputs: (T, pval)

         T    -- the test statistic
         pval -- the p-value: the probability, under the null hypothesis of
                 equal variances, of observing a test statistic at least as
                 large as T.

    Bartlett's test is sensitive to departures from normality.  The Levene
    test is an alternative that is less sensitive to departures from
    normality.

    References:

      http://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm

      Snedecor, George W. and Cochran, William G. (1989), Statistical
        Methods, Eighth Edition, Iowa State University Press.
    """
    k = len(args)
    if k < 2:
        raise ValueError("Must enter at least two input sample vectors.")
    Ni = zeros(k)
    ssq = zeros(k, 'd')
    for j in range(k):
        Ni[j] = len(args[j])
        ssq[j] = var(args[j], ddof=1)   # unbiased sample variance
    Ntot = sum(Ni)
    # pooled variance, then Bartlett's statistic with its correction factor
    spsq = sum((Ni - 1) * ssq) / (1.0 * (Ntot - k))
    numer = (Ntot * 1.0 - k) * log(spsq) - sum((Ni - 1.0) * log(ssq))
    denom = 1.0 + (1.0 / (3 * (k - 1))) * ((sum(1.0 / (Ni - 1.0))) - 1.0 / (Ntot - k))
    T = numer / denom
    pval = distributions.chi2.sf(T, k - 1)   # survival function = 1 - cdf
    return T, pval
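A minimal usage sketch (my addition): it assumes the bartlett function above is in scope, builds three normal samples where one has a deliberately larger variance, and prints the statistic and p-value.

import numpy as np

rng = np.random.default_rng(2)
a = rng.normal(loc=0.0, scale=1.0, size=50)
b = rng.normal(loc=0.0, scale=1.0, size=60)
c = rng.normal(loc=0.0, scale=3.0, size=40)   # deliberately larger spread

T, pval = bartlett(a, b, c)
print("T = %.3f, p-value = %.4g" % (T, pval))
# a small p-value is evidence against the null of equal variances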
Example #4
from numpy import arange, around, array, exp, log, mean, sort, sqrt, std, sum
from scipy import optimize
from scipy.stats import distributions

def anderson(x, dist='norm'):
    """Anderson-Darling test for the normal, exponential, Gumbel
    (Extreme Value Type I), or logistic distribution.

    Given samples x, return A2, the Anderson-Darling statistic, the
    corresponding critical values, and the significance levels (in percent)
    at which they apply.

    Critical values provided are for the following significance levels:
    norm/expon:   15%, 10%, 5%, 2.5%, 1%
    Gumbel:       25%, 10%, 5%, 2.5%, 1%
    logistic:     25%, 10%, 5%, 2.5%, 1%, 0.5%

    If A2 is larger than the critical value for a given significance level,
    the hypothesis that the data come from the chosen distribution can be
    rejected at that level.
    """
    if dist not in ['norm', 'expon', 'gumbel', 'extreme1', 'logistic']:
        raise ValueError("Invalid distribution.")
    y = sort(x)
    xbar = mean(x)
    N = len(y)
    if dist == 'norm':
        s = std(x, ddof=1)
        w = (y - xbar) / s
        z = distributions.norm.cdf(w)
        sig = array([15, 10, 5, 2.5, 1])
        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
    elif dist == 'expon':
        w = y / xbar
        z = distributions.expon.cdf(w)
        sig = array([15, 10, 5, 2.5, 1])
        critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
    elif dist == 'logistic':
        def rootfunc(ab, xj, N):
            # maximum-likelihood equations for the logistic location a and scale b
            a, b = ab
            tmp = (xj - a) / b
            tmp2 = exp(tmp)
            val = [sum(1.0 / (1 + tmp2)) - 0.5*N,
                   sum(tmp*(1.0 - tmp2) / (1 + tmp2)) + N]
            return array(val)
        sol0 = array([xbar, std(x, ddof=1)])
        sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
        w = (y - sol[0]) / sol[1]
        z = distributions.logistic.cdf(w)
        sig = array([25, 10, 5, 2.5, 1, 0.5])
        critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
    else:  # 'gumbel' / 'extreme1'
        def fixedsolve(th, xj, N):
            # fixed-point equation for the Gumbel scale parameter
            val = sum(xj) * 1.0 / N
            tmp = exp(-xj / th)
            term = sum(xj * tmp)
            term /= sum(tmp)
            return val - term
        s = optimize.fixed_point(fixedsolve, 1.0, args=(x, N), xtol=1e-5)
        xbar = -s * log(sum(exp(-x/s)) * 1.0 / N)
        w = (y - xbar) / s
        z = distributions.gumbel_l.cdf(w)
        sig = array([25, 10, 5, 2.5, 1])
        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
    # Anderson-Darling statistic from the ordered probability integral transforms
    # (the _Avals_* critical-value tables are module-level constants not shown here).
    i = arange(1, N + 1)
    S = sum((2*i - 1.0)/N * (log(z) + log(1 - z[::-1])))
    A2 = -N - S
    return A2, critical, sig
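To show how the three outputs are meant to be read together, here is a usage sketch for the normal case (an addition, not part of the original). It assumes anderson above is defined in the same namespace as the sketch. The _Avals_* tables are module-level constants the example does not show; the _Avals_norm values below are the commonly tabulated Stephens critical values and should be treated as an assumption here.

import numpy as np

# assumed critical-value table for dist='norm' (not shown in the original example)
_Avals_norm = np.array([0.576, 0.656, 0.787, 0.918, 1.092])

rng = np.random.default_rng(3)
x = rng.normal(loc=10.0, scale=2.0, size=120)

A2, critical, sig = anderson(x, dist='norm')
print("A2 =", round(float(A2), 3))
for cv, level in zip(critical, sig):
    verdict = "reject" if A2 > cv else "cannot reject"
    print("%5.1f%% level: critical value %.3f -> %s normality" % (level, cv, verdict))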