Пример #1
0
def wilcoxon(x,y=None):
    """
Calculates the Wilcoxon signed-rank test for the null hypothesis that two samples come from the same distribution. A non-parametric T-test. (need N > 20)

Returns: t-statistic, two-tailed p-value
"""
    if y is None:
        d = x
    else:
        x, y = map(asarray, (x, y))
        if len(x) <> len(y):
            raise ValueError, 'Unequal N in wilcoxon.  Aborting.'
        d = x-y
    d = compress(not_equal(d,0),d) # Keep all non-zero differences
    count = len(d)
    if (count < 10):
        print "Warning: sample size too small for normal approximation."
    r = stats.rankdata(abs(d))
    r_plus = sum((d > 0)*r)
    r_minus = sum((d < 0)*r)
    T = min(r_plus, r_minus)
    mn = count*(count+1.0)*0.25
    se = math.sqrt(count*(count+1)*(2*count+1.0)/24)
    if (len(r) != len(unique(r))):  # handle ties in data
        replist, repnum = find_repeats(r)
        corr = 0.0
        for i in range(len(replist)):
            si = repnum[i]
            corr += 0.5*si*(si*si-1.0)
        V = se*se - corr
        se = sqrt((count*V - T*T)/(count-1.0))
    z = (T - mn)/se
    prob = 2*(1.0 -stats.zprob(abs(z)))
    return T, prob
Пример #2
0
def _apply_func(x,g,func):
    # g is list of indices into x
    #  separating x into different groups
    #  func should be applied over the groups
    g = unique(r_[0,g,len(x)])
    output = []
    for k in range(len(g)-1):
        output.append(func(x[g[k]:g[k+1]]))
    return asarray(output)
Пример #3
0
def ansari(x,y):
    """Determine if the scale parameter for two distributions with equal
    medians is the same using the Ansari-Bradley statistic.

    Specifically, compute the AB statistic and the probability of error
    that the null hypothesis is true but rejected with the computed
    statistic as the critical value.

    One can reject the null hypothesis that the ratio of variances is 1 if
    returned probability of error is small (say < 0.05)
    """
    x,y = asarray(x),asarray(y)
    n = len(x)
    m = len(y)
    if (m < 1):
        raise ValueError, "Not enough other observations."
    if (n < 1):
        raise ValueError, "Not enough test observations."
    N = m+n
    xy = r_[x,y]  # combine
    rank = stats.rankdata(xy)
    symrank = amin(array((rank,N-rank+1)),0)
    AB = sum(symrank[:n])
    uxy = unique(xy)
    repeats = (len(uxy) != len(xy))
    exact = ((m<55) and (n<55) and not repeats)
    if repeats and ((m < 55)  or (n < 55)):
        print "Ties preclude use of exact statistic."
    if exact:
        astart, a1, ifault = statlib.gscale(n,m)
        ind = AB-astart
        total = sum(a1)
        if ind < len(a1)/2.0:
            cind = int(ceil(ind))
            if (ind == cind):
                pval = 2.0*sum(a1[:cind+1])/total
            else:
                pval = 2.0*sum(a1[:cind])/total
        else:
            find = int(floor(ind))
            if (ind == floor(ind)):
                pval = 2.0*sum(a1[find:])/total
            else:
                pval = 2.0*sum(a1[find+1:])/total
        return AB, min(1.0,pval)
    
    # otherwise compute normal approximation
    if N % 2:  # N odd
        mnAB = n*(N+1.0)**2 / 4.0 / N
        varAB = n*m*(N+1.0)*(3+N**2)/(48.0*N**2)
    else:
        mnAB = n*(N+2.0)/4.0
        varAB = m*n*(N+2)*(N-2.0)/48/(N-1.0)
    if repeats:   # adjust variance estimates
        # compute sum(tj * rj**2)
        fac = sum(symrank**2)
        if N % 2: # N odd
            varAB = m*n*(16*N*fac-(N+1)**4)/(16.0 * N**2 * (N-1))
        else:  # N even
            varAB = m*n*(16*fac-N*(N+2)**2)/(16.0 * N * (N-1))
    z = (AB - mnAB)/sqrt(varAB)
    pval = (1-distributions.norm.cdf(abs(z)))*2.0
    return AB, pval