def _gh_sample_stats(sample):
    # Return (mean, sample variance / n, n) for one sample.
    # pandas objects expose a non-null count(); plain numpy arrays do not,
    # so fall back to the length of the sample.
    if hasattr(sample, 'count'):
        count = sample.count()
    else:
        count = len(sample)
    # ddof=1 is already the pandas default; passing it explicitly makes
    # numpy arrays use the sample (n - 1) variance as well.
    variance = sample.var(ddof=1)
    return sample.mean(), variance / count, count


def gh(sample_a, sample_b, **kwargs):
    '''
    Calculate the Games-Howell mean difference and p-value for two samples.

    sample_a, sample_b: array-like data stores providing mean() and var()
        (numpy arrays or pandas Series). When the object has a count()
        method it is used for the sample size, otherwise len() is used.

    Keyword arguments:
        r: passed straight through to psturng as its second argument
           (the tukey helper in this file describes it as the total
           number of samples).

    Returns a tuple (mean_diff, p) where mean_diff is
    mean(sample_a) - mean(sample_b) and p is the value returned by
    psturng(q, r, df).
    '''
    # Retrieve argument(s)
    r = kwargs.get('r')

    # For Games-Howell we need a custom standard error and a custom df
    # to get the q statistic.
    mean_a, s2n_a, count_a = _gh_sample_stats(sample_a)
    mean_b, s2n_b, count_b = _gh_sample_stats(sample_b)

    # 0.5 rather than (1/2): under Python 2 integer division (1/2) is 0,
    # which would zero the standard error and make q divide by zero.
    standard_error = sqrt(0.5 * (s2n_a + s2n_b))
    mean_diff = mean_a - mean_b
    q = abs(mean_diff) / standard_error

    # Custom degrees of freedom built from the per-sample variance / n terms
    df_numer = (s2n_a + s2n_b) ** 2
    df_denom = (s2n_a ** 2 / (count_a - 1)) + (s2n_b ** 2 / (count_b - 1))
    df = df_numer / df_denom

    p = psturng(q, r, df)
    return mean_diff, p
def test_1000_random_values(self):
    """Round-trip 1000 random (p, r, v) triples through qsturng and psturng.

    Draws p in [.1, .999), integer r in [2, 100] and v in [2, 1000),
    computes q = qsturng(p, r, v), and checks that psturng(q, r, v)
    does not exceed 1 - p by more than 1e-5.
    """
    n = 1000
    ps = np.random.random(n)*(.999 - .1) + .1
    # random_integers is deprecated; randint's upper bound is exclusive,
    # so 101 keeps r in the same inclusive range [2, 100].
    rs = np.random.randint(2, 101, n)
    vs = np.random.random(n)*998. + 2.

    qs = qsturng(ps, rs, vs)
    estimates = psturng(qs, rs, vs)

    actuals = 1. - ps
    errors = estimates - actuals

    # NOTE(review): only positive errors are checked here; estimates that
    # fall below the actual value by more than 1e-5 would still pass.
    assert_equal(np.array([]), np.where(errors > 1e-5)[0])
def test_vector(self):
    "vector input -> vector output"
    # Expected psturng results, compared to 5 decimal places.
    expected = np.array([0.10679889, 0.06550009, 0.01730145])

    q_values = [3.98832389, 4.56835318, 6.26400894]
    r_values = [4, 4, 4]
    v_values = [6, 6, 6]

    observed = psturng(q_values, r_values, v_values)
    assert_array_almost_equal(expected, observed, 5)
def test_handful_to_known_values(self):
    "psturng(q, r, v) matches 1 - p for a fixed table of (p, r, v, q) rows"
    # Each row is (p, r, v, q).
    known_rows = [
        (0.71499578726111435, 67, 956.70742488392386, 5.0517658443070692),
        (0.42974234855067672, 16, 723.50261736502318, 3.3303582093701354),
        (0.94936429359548424, 2, 916.1867328010926, 2.7677975546417244),
        (0.85357381770725038, 66, 65.67055060832368, 5.5647438108270109),
        (0.87372108021900929, 74, 626.42369474993632, 5.5355540570701107),
        (0.53891960564713726, 49, 862.63799438485785, 4.5108645923377146),
        (0.98818659555664567, 18, 36.269686711464274, 6.0906643750886156),
        (0.53031994896037626, 50, 265.29558652727917, 4.5179640079726795),
        (0.7318857887397332, 59, 701.41497552251201, 4.9980139875409915),
        (0.65332019368982697, 61, 591.01183664195912, 4.8706581766706893),
        (0.55403221657248558, 77, 907.34156725405194, 4.8786135917984632),
        (0.30783916857266003, 83, 82.446923487980882, 4.4396401242858294),
        (0.29321720242415661, 16, 709.64382575553009, 3.0304277540702729),
        (0.27146478168880306, 31, 590.00594683574172, 3.5870031664477215),
        (0.67348796958433776, 81, 608.02706111127657, 5.1096199974432936),
        (0.32774393945968938, 18, 17.706224399250839, 3.2119038163765432),
        (0.7081637474795982, 72, 443.10678914889695, 5.0990030889410649),
        (0.33354939276757861, 47, 544.0772192199048, 4.0613352964193279),
        (0.60412143947363051, 36, 895.83526933271548, 4.381717596850172),
        (0.88739052300665977, 77, 426.03665511558262, 5.6333929480341309),
    ]

    for row in known_rows:
        prob, r_val, v_val, q_val = row
        expected = 1. - prob
        assert_almost_equal(expected, psturng(q_val, r_val, v_val), 5)
def tukey(sample_a, sample_b, **kwargs):
    '''
    Calculate Tukey's HSD mean difference and significance for two samples.

    sample_a, sample_b: array-like data stores providing mean()
        (numpy arrays or pandas Series). When the object has a count()
        method it is used for the sample size, otherwise len() is used.

    Keyword arguments:
        msw: Mean Squares Within
        r:   number of samples in total
        df:  degrees of freedom - the sum of (count of each sample - 1)

    Returns a tuple (mean_diff, p) where mean_diff is
    mean(sample_a) - mean(sample_b) and p is the value returned by
    psturng(q, r, df).
    '''
    # Retrieve arguments
    msw = kwargs.get('msw')
    r = kwargs.get('r')
    df = kwargs.get('df')

    mean_a = sample_a.mean()
    mean_b = sample_b.mean()

    # pandas objects expose count(); plain numpy arrays do not, so fall
    # back to the length of the sample.
    count_a = sample_a.count() if hasattr(sample_a, 'count') else len(sample_a)
    count_b = sample_b.count() if hasattr(sample_b, 'count') else len(sample_b)

    # Float literals keep these true divisions even under Python 2, where
    # (1/2) and 1/count would otherwise truncate to 0.
    standard_error = sqrt(msw * 0.5 * (1.0 / count_a + 1.0 / count_b))

    mean_diff = mean_a - mean_b
    q = abs(mean_diff) / standard_error

    p = psturng(q, r, df)
    return mean_diff, p
def test_v_equal_one(self):
    "psturng(.2, 5, 1) agrees with .1 to 5 decimal places"
    observed = psturng(.2, 5, 1)
    assert_almost_equal(.1, observed, 5)
def test_scalar(self):
    "scalar input -> scalar output"
    # Scalar q, r and v; the result is compared to .1 at 5 decimal places.
    observed = psturng(4.43645545899562, 5, 6)
    assert_almost_equal(.1, observed, 5)
import math import time import numpy as np import pylab from qsturng import qsturng, psturng, v_keys from qsturng.make_tbls import R n = 100 ps = np.random.random(n) * (.999 - .1) + .1 rs = np.random.random_integers(2, 100, n) vs = np.random.random(n) * 998. + 2. qs = qsturng(ps, rs, vs) t0 = time.time() estimates = psturng(qs, rs, vs) import pprint pprint.pprint([(p, r, v, q) for p, r, v, q in zip(ps, rs, vs, qs)]) print time.time() - t0 actuals = 1. - ps errors = estimates - actuals pylab.figure() pylab.hist(errors, bins=100) yticks = pylab.yticks()[0] pylab.yticks(yticks, [r'$%i$' % t for t in yticks]) xticks = pylab.xticks()[0] pylab.xticks(xticks, [r'$%.0e$' % t for t in xticks]) pylab.text(0,
import math import time import numpy as np import pylab from qsturng import qsturng, psturng, v_keys from qsturng.make_tbls import R n = 100 ps = np.random.random(n)*(.999 - .1) + .1 rs = np.random.random_integers(2, 100, n) vs = np.random.random(n)*998. + 2. qs = qsturng(ps, rs, vs) t0=time.time() estimates = psturng(qs, rs, vs) import pprint pprint.pprint([(p,r,v,q) for p,r,v,q in zip(ps,rs,vs,qs)]) print time.time()-t0 actuals = 1. - ps errors = estimates - actuals pylab.figure() pylab.hist(errors, bins=100) yticks = pylab.yticks()[0] pylab.yticks(yticks, [r'$%i$'%t for t in yticks]) xticks = pylab.xticks()[0] pylab.xticks(xticks, [r'$%.0e$'%t for t in xticks]) pylab.text(0, 475,