Пример #1
0
def chi_square_from_Dict2D(data):
    """Chi-square test on a Dict2D of observed/expected frequencies.

    data is a Dict2D. Each value is a sequence whose first element is the
    observed (O) and whose second element is the expected (E) frequency
    (can be created with calc_contingency_expected). Only len(data),
    data.Items and data.Cols are used here.

    The chi-square value (test) is the sum of (O-E)^2/E over the items in
    data.

    Degrees of freedom are calculated from data as:
    (r-1)*(c-1) if cols and rows are both > 1,
    otherwise (number of rows or columns, whichever is greater than 1) - 1.

    Returns (test, chi_high(test, df)).

    Raises ValueError if data has no rows or no columns.
    """
    num_rows = len(data)
    num_cols = len(list(data.Cols))
    # Reject empty input first: checking the ``== 1`` cases before this
    # would let a 0 x 1 or 1 x 0 table through with df == -1.
    if num_rows == 0 or num_cols == 0:
        raise ValueError("data matrix must have data")

    # float() keeps the division exact for integer counts regardless of
    # whether true division is enabled for this module.
    test = sum([(item[0] - item[1]) * (item[0] - item[1]) / float(item[1])
                for item in data.Items])

    if num_rows == 1:
        df = num_cols - 1
    elif num_cols == 1:
        df = num_rows - 1
    else:
        df = (num_rows - 1) * (num_cols - 1)

    return test, chi_high(test, df)
Пример #2
0
def G_fit(obs, exp, williams=1):
    """G test for fit between two lists of counts.

    Usage: test, prob = G_fit(obs, exp, williams)

    obs and exp are two lists of numbers of equal length.
    williams is a boolean stating whether to do the Williams correction.

    G = 2 * SUM(obs * ln(obs/exp)); observed values of zero contribute
    nothing to the sum. The probability is chi_high(G, k - 1), where k is
    the number of categories.

    Raises ValueError if the lists differ in length or any observed value
    is negative, and ZeroExpectedError if any expected value is <= 0.

    See Sokal and Rohlf chapter 17.
    """
    k = len(obs)
    if k != len(exp):
        raise ValueError("G_fit requires two lists of equal length.")
    G = 0
    n = 0

    for o, e in zip(obs, exp):
        if o < 0:
            raise ValueError(
                "G_fit requires all observed values to be positive.")
        if e <= 0:
            raise ZeroExpectedError(
                "G_fit requires all expected values to be positive.")
        if o:   # if o is zero, o * log(o/e) must be zero as well.
            # float() so integer counts are never floor-divided.
            G += o * log(float(o) / e)
            n += o

    G *= 2
    # With no observations at all (n == 0) G is already 0 and the
    # correction factor would divide by zero, so skip it.
    if williams and n:
        q = 1 + (k + 1) / (6.0 * n)
        G /= q

    return G, chi_high(G, k - 1)
Пример #3
0
def fisher(probs):
    """Uses Fisher's method to combine multiple tests of a hypothesis.

    probs is a sequence of probabilities (len() is taken of it).

    -2 * SUM(ln(P)) gives chi-squared distribution with 2n degrees of freedom.

    Returns chi_high of that statistic with 2 * len(probs) degrees of
    freedom, or 0.0 if the computation raises OverflowError.
    """
    try:
        # A list (rather than map()) keeps this working where map is lazy.
        log_probs = [log(p) for p in probs]
        return chi_high(-2 * sum(log_probs), 2 * len(probs))
    except OverflowError:
        return 0.0
Пример #4
0
def G_ind(m, williams=False):
    """G test for independence in an r x c table.

    m is the table of counts; per the original note it must be a numpy
    array. Method from Sokal and Rohlf p 738.

    G = 2 * (f ln f over cells - f ln f over both sets of marginal totals
    + f ln f of the grand total), with (r-1)*(c-1) degrees of freedom.
    If williams is true, G is divided by the Williams correction factor.

    Returns (G, chi_high(max(G, 0), df)).
    """
    # Marginal totals along each axis, and the grand total.
    axis0_totals = sum(m, 0)
    axis1_totals = sum(m, 1)
    grand_total = sum(ravel(m))

    cells_term = safe_sum_p_log_p(m)
    axis0_term = safe_sum_p_log_p(axis0_totals)
    axis1_term = safe_sum_p_log_p(axis1_totals)
    table_term = grand_total * log(grand_total)

    df = (len(m) - 1) * (len(m[0]) - 1)
    G = 2 * (cells_term - axis0_term - axis1_term + table_term)

    if williams:
        axis1_factor = grand_total * sum(1.0 / axis1_totals) - 1
        axis0_factor = grand_total * sum(1.0 / axis0_totals) - 1
        q = 1 + axis1_factor * axis0_factor / (6 * grand_total * df)
        G = G / q

    # The probability is taken for a G clamped at zero; G itself is
    # returned unclamped.
    return G, chi_high(max(G, 0), df)
Пример #5
0
 def test_chi_high(self):
     """chi_high should match R's pchisq(lower.tail=FALSE) function"""
     # Reference probabilities keyed by degrees of freedom. Each list is
     # paired positionally with self.values (defined outside this method).
     expected_by_df = {
         1: [
             1.000000e+00, 9.203443e-01, 7.518296e-01, 4.795001e-01,
             3.173105e-01, 1.572992e-01, 2.534732e-02, 1.565402e-03,
             7.744216e-06, 4.320463e-08, 1.537460e-12, 2.088488e-45,
         ],
         10: [
             1.000000e+00, 1.000000e-00, 1.000000e-00, 9.999934e-01,
             9.998279e-01, 9.963402e-01, 8.911780e-01, 4.404933e-01,
             2.925269e-02, 8.566412e-04, 2.669083e-07, 1.613931e-37,
         ],
         100: [
             1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00,
             1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00,
             1.00000e+00, 1.00000e+00, 9.99993e-01, 1.17845e-08,
         ],
     }

     for df in self.df:
         expected = expected_by_df[df]
         for x, p in zip(self.values, expected):
             self.assertFloatEqual(chi_high(x, df), p)
Пример #6
0
def G_2_by_2(a, b, c, d, williams=1, directional=1):
    """G test for independence in a 2 x 2 table.

    Usage: G, prob = G_2_by_2(a, b, c, d, williams, directional)

    Cells are in the order:

        a b
        c d

    a, b, c, and d can be int, float, or long.
    williams is a boolean stating whether to do the Williams correction.
    directional is a boolean stating whether the test is 1-tailed.

    Briefly, computes sum(f ln f) for cells - sum(f ln f) for
    rows and columns + f ln f for the table.

    Always has 1 degree of freedom

    To generalize the test to r x c, use the same protocol:
    2*(cells - rows/cols + table), then with (r-1)(c-1) df.

    Note that G is always positive: to get a directional test,
    the appropriate ratio (e.g. a/b > c/d) must be tested
    as a separate procedure. Find the probability for the
    observed G, and then either halve or halve and subtract from
    one depending on whether the directional prediction was
    upheld.

    The default test is now one-tailed (Rob Knight 4/21/03).

    Returns (G, p). Returns (0, 1) if the table is empty or if an entire
    row or column is zero. When directional is true and a/b is not
    greater than c/d, the returned G is negated.

    Raises ValueError if any cell count is negative.

    See Sokal & Rohlf (1995), ch. 17. Specifically, see box 17.6 (p731).
    """
    cells = [a, b, c, d]
    # Validate before the empty-table shortcut: counts such as
    # (1, -1, 0, 0) sum to zero and would otherwise be accepted silently.
    if min(cells) < 0:
        raise ValueError(
            "G_2_by_2 got negative cell counts(s): must all be >= 0.")

    n = sum(cells)
    # return 0 if table was empty
    if not n:
        return (0, 1)

    # Find totals for rows and cols
    ab = a + b
    cd = c + d
    ac = a + c
    bd = b + d
    rows_cols = [ab, cd, ac, bd]
    # exit if we are missing a row or column entirely: result counts as
    # never significant
    if min(rows_cols) == 0:
        return (0, 1)

    G = 0
    # Add x ln x for items, adding zero for items whose counts are zero
    for i in filter(None, cells):
        G += i * log(i)
    # Subtract x ln x for rows and cols (all are non-zero at this point)
    for i in rows_cols:
        G -= i * log(i)
    # Add x ln x for table
    G += n * log(n)
    # Result needs to be multiplied by 2
    G *= 2

    # apply Williams correction
    if williams:
        # float() so integer counts are never floor-divided.
        n_f = float(n)
        q = 1 + ((n_f / ab + n_f / cd - 1) *
                 (n_f / ac + n_f / bd - 1)) / (6 * n_f)
        G /= q

    p = chi_high(max(G, 0), 1)

    # find which tail we were in if the test was directional
    if directional:
        is_high = (b == 0) or (d != 0 and (float(a) / b > float(c) / d))
        p = tail(p, is_high)
        if not is_high:
            G = -G
    return G, p