def test_tvar(self):
    """Check that ``stats.tvar`` and ``stats.mstats.tvar`` agree to 12 decimals.

    For every size yielded by ``self.get_n()``, the ``x``/``y`` samples from
    ``self.generate_xy_sample`` are compared with their ``xm``/``ym``
    counterparts (presumably the masked versions -- confirm in the fixture).
    """
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        for plain, counterpart in ((x, xm), (y, ym)):
            assert_almost_equal(stats.tvar(plain),
                                stats.mstats.tvar(counterpart),
                                decimal=12)
def test_winsorization(self):
    """Tests the Winsorization of the data."""
    sample = ma.array([77, 87, 88, 114, 151, 210, 219, 246, 253, 262,
                       296, 299, 306, 376, 428, 515, 666, 1310, 2611])
    # Winsorize 20% on each side and check the sample variance.
    clipped = mstats.winsorize(sample, (0.2, 0.2))
    assert_almost_equal(clipped.var(ddof=1), 21551.4, 1)

    # After masking one entry, the result must carry the same mask as the input.
    sample[5] = masked
    result = mstats.winsorize(sample)
    assert_equal(result.mask, sample.mask)
def test_obrientransform(self):
    """Compare ``mstats.obrientransform`` of two samples with tabulated values.

    The transposed result, rounded to 4 decimals, must match the reference
    table to 4 decimals.
    """
    def expand(runs):
        # Flatten (value, repeat_count) pairs into one list of repeated values.
        return [value for value, repeat in runs for _ in range(repeat)]

    samples = [
        expand([(5, 5), (6, 11), (7, 9), (8, 3), (9, 2), (10, 2)]),
        expand([(6, 1), (7, 2), (8, 4), (9, 9), (10, 16)]),
    ]
    expected = [
        expand([(3.1828, 5), (0.5591, 11), (0.0344, 9),
                (1.6086, 3), (5.2817, 2), (11.0538, 2)]),
        expand([(10.4352, 1), (4.8599, 2), (1.3836, 4),
                (0.0061, 9), (0.7277, 16)]),
    ]
    transformed = mstats.obrientransform(*samples).T
    assert_almost_equal(np.round(transformed, 4), expected, 4)
def get_distance_edges_test():
    """Check that ``get_distance_edges`` yields equal-area strips.

    With ``n`` strips in an arena of radius ``R``, the edges must number
    ``n + 1``, start at 0 and end at ``R``.  Each pair of consecutive edges,
    converted to radii as ``R - edge``, must bound a ring of area
    ``pi * R**2 / n``.
    """
    num_strips = 10
    arena_radius = 2
    edges = get_distance_edges(arena_radius=arena_radius, n=num_strips)

    assert len(edges) == num_strips + 1
    assert edges[0] == 0
    assert edges[-1] == arena_radius

    def area(R):
        return np.pi * R * R

    @contract(r0="x", r1=">x")
    def area_between(r0, r1):
        return area(r1) - area(r0)

    expected_area = area(arena_radius) / (num_strips)
    for i in range(num_strips):
        # Edges are distances; subtracting from the arena radius gives radii.
        outer = arena_radius - edges[i]
        inner = arena_radius - edges[i + 1]
        assert_almost_equal(area_between(inner, outer), expected_area)
def test_spearmanr(self):
    """Check that ``stats.spearmanr`` and ``stats.mstats.spearmanr`` agree.

    Both the coefficient and the p-value must match to 14 decimals for every
    sample size from ``self.get_n()``.
    """
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        reference = stats.spearmanr(x, y)
        candidate = stats.mstats.spearmanr(xm, ym)
        # Unpack as two-tuples, exactly like the direct comparison would.
        r, p = reference
        rm, pm = candidate
        for expected, actual in ((r, rm), (p, pm)):
            assert_almost_equal(expected, actual, 14)
def test_multinomial_elementwise_distribution(self):
    '''Verify that the created variables approach a multinomial distribution
    for large numbers of samples.

    For sample sizes r = 2**4 .. 2**16 the error between observed frequencies
    and the probability matrix p is measured, then a power law error ~ C * r**alpha
    is fitted in log-log space.  The test requires alpha close to -0.5, C below a
    bound derived from p, and a correlation coefficient above 0.99 in magnitude.'''
    (m, n, k) = (6, 5, 1)
    r = 2 ** np.arange(4, 17)
    p = statutil.random_row_stochastic((m, n))
    error = np.zeros((len(r),))
    for (i, r_val) in enumerate(r):
        # Average the error over k independent experiments per sample size.
        # range() works on both Python 2 and 3; k is tiny, so no xrange needed.
        for _ in range(k):
            x = statutil.multinomial_elementwise(p, r_val)
            # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
            error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p)
        error[i] /= (1.0 * k)

    # Validate the model error predicted by the Central Limit Theorem: C*r^(-0.5).
    # We make k experiments for each sample size.  Even if k=1, there is a 95% chance
    # of being within ~1.96 standard deviations of the mean of the normal distribution
    # sqrt(r)*[observed freq variable - p[i,j]] for each entry j of a row i of p.
    # So if row i's stddev is s[i], the sum of squared errors should be (with 95%
    # confidence) <= n * (1.96*s[i])^2, giving
    #   C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
    # NOTE(review): earlier revisions of this comment quoted 1.6 and 1.5 as the
    # multiplier; the code has always used 1.96.
    # See http://en.wikipedia.org/wiki/Central_limit_theorem
    alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
    c = np.exp(c)

    # Per-row E[j^2] - E[j]^2 of the column index j under the row's probabilities.
    column_index = np.arange(p.shape[1])
    row_spread = (np.sum(p * column_index ** 2, axis=1)
                  - np.sum(p * column_index, axis=1) ** 2)
    # Use the public np.linalg.norm rather than the private np.linalg.linalg path.
    c_bound = 1.96 * np.linalg.norm(row_spread, 2) / np.sqrt(p.shape[0])

    assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')
    self.assertTrue(c <= c_bound,
                    'Error term coefficient outside 95% confidence interval')
    self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
def test_ibd_segments_both_parents(self):
    '''Test computing GERMLINE IBD segments.

    Checks the raw segment list, the list after grouping into disjoint
    segments, and which grouped segment ranks best by
    (number of samples, length).'''
    expected_raw = [
        ((0, 344), (16484792, 21449028, 4.964, 0), ((2, 0), (4, 0))),
        ((0, 344), (16484792, 21449028, 4.964, 0), ((1, 0), (4, 0))),
        ((0, 780), (16484792, 26920270, 10.435, 0), ((4, 1), (2, 1))),
        ((2145, 2996), (39608193, 48759228, 9.151, 0), ((1, 1), (2, 1))),
        ((2145, 2996), (39608193, 48759228, 9.151, 0), ((4, 1), (2, 1))),
        ((885, 3218), (27425790, 51156933, 23.731, 0), ((4, 1), (1, 1))),
        ((0, 3218), (16484792, 51156933, 34.672, 0), ((2, 0), (1, 0))),
    ]
    expected_grouped = [
        ((0, 344), (16484792, 21449028, 4.964, 0), ((2, 0), (1, 0), (4, 0))),
        ((0, 344), (16484792, 21449028, 4.964, 0), ((4, 1), (2, 1))),
        ((344, 780), (21449028, 26920270, 5.471, 0), ((2, 0), (1, 0))),
        ((344, 780), (21449028, 26920270, 5.471, 0), ((4, 1), (2, 1))),
        ((780, 885), (26920270, 27425790, 0.506, 0), ((2, 0), (1, 0))),
        ((885, 2145), (27425790, 39608193, 12.182, 0), ((2, 0), (1, 0))),
        ((885, 2145), (27425790, 39608193, 12.182, 0), ((4, 1), (1, 1))),
        ((2145, 2996), (39608193, 48759228, 9.151, 0), ((4, 1), (1, 1), (2, 1))),
        ((2145, 2996), (39608193, 48759228, 9.151, 0), ((2, 0), (1, 0))),
        ((2996, 3218), (48759228, 51156933, 2.398, 0), ((2, 0), (1, 0))),
        ((2996, 3218), (48759228, 51156933, 2.398, 0), ((4, 1), (1, 1))),
    ]

    hap_matrix = ig._HapMatrix(self.problem, self.sibs)
    segments = self.ibd_computer.ibd_segments(hap_matrix)
    assert_segments_almost_equal(segments, expected_raw,
                                 full_data=True, decimal=3,
                                 err_msg='Wrong IBD segments, raw')

    # A transitive-logic test to see that we don't miss any intervals with GERMLINE
    segments.group_to_disjoint(False)
    assert_segments_almost_equal(segments, expected_grouped,
                                 full_data=True, decimal=3,
                                 err_msg='Wrong IBD segments, grouped')

    # Rank segments by sample count first, then by length, both descending
    # (np.lexsort treats the last key as the primary one).
    size_and_length = np.array([(len(s.samples), s.length) for s in segments])
    ranking = np.lexsort((-size_and_length[:, 1], -size_and_length[:, 0]))
    best_segment = ranking[0]
    assert_equal(best_segment, 7, 'Wrong best segment (IBD set size + length)')
    assert_almost_equal(segments[best_segment].length, 9.15, decimal=2,
                        err_msg='Wrong best segment (IBD set size + length)')
    assert_equal(segments[best_segment].samples, {(4, 1), (1, 1), (2, 1)},
                 err_msg='Wrong best segment (IBD set size + length)')
def test_tmax(self):
    """Check that ``stats.tmax`` and ``stats.mstats.tmax`` agree to 10 decimals.

    Both functions receive ``2.`` as their second argument.
    """
    limit = 2.
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        for plain, counterpart in ((x, xm), (y, ym)):
            assert_almost_equal(stats.tmax(plain, limit),
                                stats.mstats.tmax(counterpart, limit),
                                10)
def test_zscore(self):
    """Compare ``mstats.zscore(self.testcase)`` with reference values.

    This is not in R, so the reference was obtained by using:
        (testcase[i]-mean(testcase,axis=0)) / sqrt(var(testcase)*3/4)
    """
    # The trailing NaN is turned into a masked entry by fix_invalid.
    expected = ma.fix_invalid([-1.3416407864999, -0.44721359549996,
                               0.44721359549996, 1.3416407864999, np.nan])
    actual = mstats.zscore(self.testcase)
    assert_almost_equal(expected, actual, decimal=12)
def test_in_extended_family(self):
    """
    Check the logic of the nuclear family membership algorithm on a sample of
    genotyped hutterites and their pedigree. Does not treat polygamous families
    correctly.
    """
    # dict(...).values() works whether in_degree() returns a plain dict or an
    # iterable of (node, degree) pairs, and on both Python 2 and 3
    # (dict.itervalues() exists only on Python 2).
    in_degrees = dict(self.p.in_degree())
    assert_almost_equal(
        (1.0 * sum(in_degrees.values())) / self.p.number_of_nodes(),
        1.961,
        3,
        "Wrong average pedigree degree",
    )
    self.assertTrue(
        TestGenotypeTools.__in_extended_family(self.p, self.g, self.pedigree.node_of[106592]),
        "Should have been in nuclear family",
    )
    assert_equal(self.genotype.num_samples, 1415, "Unexpected total # of genotyped persons")
    assert_equal(
        sum(TestGenotypeTools.__in_extended_family(self.p, self.g, x) for x in self.g),
        736,
        "Unexpected # of nuclear family members",
    )
    # Membership counts for the optional fourth argument going from 13 down to 1.
    assert_equal(
        [
            sum(TestGenotypeTools.__in_extended_family(self.p, self.g, x, n) for x in self.g)
            for n in np.arange(13, 0, -1)
        ],
        [736, 736, 736, 752, 780, 837, 870, 934, 977, 1027, 1067, 1122, 1149],
        "Unexpected # of nuclear family members",
    )
def test_kendalltau(self):
    """Check that ``stats.kendalltau`` and ``stats.mstats.kendalltau`` agree.

    Element 0 of the results must match to 10 decimals, element 1 to 7.
    """
    # (result index, required decimal places)
    tolerances = ((0, 10), (1, 7))
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        reference = stats.kendalltau(x, y)
        candidate = stats.mstats.kendalltau(xm, ym)
        for index, places in tolerances:
            assert_almost_equal(reference[index], candidate[index], decimal=places)
def test_pointbiserial(self):
    """Check the first element of ``mstats.pointbiserialr`` on a fixed sample.

    The last pair is (-1, nan); the expected coefficient is 0.36149 to
    5 decimals.
    """
    binary = [1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
              0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, -1]
    continuous = [14.8, 13.8, 12.4, 10.1, 7.1, 6.1, 5.8, 4.6, 4.3, 3.5, 3.3,
                  3.2, 3.0, 2.8, 2.8, 2.5, 2.4, 2.3, 2.1, 1.7, 1.7, 1.5, 1.3,
                  1.3, 1.2, 1.2, 1.1, 0.8, 0.7, 0.6, 0.5, 0.2, 0.2, 0.1,
                  np.nan]
    coefficient = mstats.pointbiserialr(binary, continuous)[0]
    assert_almost_equal(coefficient, 0.36149, 5)
def test_z(self):
    """Check that ``mstats.z`` of the sample mean is 0.

    Not in R, so used (10-mean(testcase,axis=0))/sqrt(var(testcase)*3/4).
    """
    sample_mean = ma.array(self.testcase).mean()
    score = mstats.z(self.testcase, sample_mean)
    assert_almost_equal(score, 0.0)
def test_kurtosis(self):
    """Check ``mstats.kurtosis`` against reference values.

    Covers the ``fisher``/``bias`` flags on 1-D fixtures, axis-wise and
    row-wise evaluation of a 2-D masked fixture, and agreement with
    ``stats.kurtosis`` on one row.
    """
    # Set flags for axis = 0 and fisher=0 (Pearson's definition of kurtosis
    # for compatibility with Matlab)
    y = mstats.kurtosis(self.testmathworks, 0, fisher=0, bias=1)
    assert_almost_equal(y, 2.1658856802973, 10)
    # Note that MATLAB has confusing docs for the following case
    #  kurtosis(x,0) gives an unbiased estimate of Pearson's kurtosis
    #  kurtosis(x) gives a biased estimate of Fisher's kurtosis (Pearson-3)
    #  The MATLAB docs imply that both should give Fisher's
    y = mstats.kurtosis(self.testmathworks, fisher=0, bias=0)
    assert_almost_equal(y, 3.663542721189047, 10)
    y = mstats.kurtosis(self.testcase, 0, 0)
    assert_almost_equal(y, 1.64)

    # test that kurtosis works on multidimensional masked arrays
    # (dtype=bool: the np.bool alias was removed in NumPy 1.24)
    correct_2d = ma.array(
        np.array([-1.5, -3.0, -1.47247052385, 0.0, -1.26979517952]),
        mask=np.array([False, False, False, True, False], dtype=bool),
    )
    assert_array_almost_equal(mstats.kurtosis(self.testcase_2d, 1), correct_2d)
    for i, row in enumerate(self.testcase_2d):
        assert_almost_equal(mstats.kurtosis(row), correct_2d[i])

    correct_2d_bias_corrected = ma.array(
        np.array([-1.5, -3.0, -1.88988209538, 0.0, -0.5234638463918877]),
        mask=np.array([False, False, False, True, False], dtype=bool),
    )
    assert_array_almost_equal(mstats.kurtosis(self.testcase_2d, 1, bias=False),
                              correct_2d_bias_corrected)
    for i, row in enumerate(self.testcase_2d):
        assert_almost_equal(mstats.kurtosis(row, bias=False),
                            correct_2d_bias_corrected[i])

    # Check consistency between stats and mstats implementations
    assert_array_almost_equal_nulp(mstats.kurtosis(self.testcase_2d[2, :]),
                                   stats.kurtosis(self.testcase_2d[2, :]))
def test_sem(self):
    """Check ``mstats.sem`` against a reference value and across ``ddof``.

    This is not in R, so used: sqrt(var(testcase)*3/4) / sqrt(3)
    """
    assert_almost_equal(mstats.sem(self.testcase), 0.6454972244)

    # sem with ddof=0, rescaled by sqrt(n/(n-2)), must equal sem with ddof=2.
    n = self.testcase.count()
    sem_ddof0 = mstats.sem(self.testcase, ddof=0)
    rescale = np.sqrt(n/(n-2))
    sem_ddof2 = mstats.sem(self.testcase, ddof=2)
    assert_allclose(sem_ddof0 * rescale, sem_ddof2)
def test_var(self):
    """Check ``mstats.var(self.testcase)``.

    Reference: var(testcase) = 1.666666667
    """
    variance = mstats.var(self.testcase)
    assert_almost_equal(variance, 1.666666667)
def test_signaltonoise(self):
    """Check ``mstats.signaltonoise(self.testcase)`` with deprecation warnings silenced.

    This is not in R, so used:
        mean(testcase, axis=0) / (sqrt(var(testcase)*3/4))
    """
    with warnings.catch_warnings():
        # Ignore DeprecationWarning for the duration of the call.
        warnings.simplefilter("ignore", DeprecationWarning)
        ratio = mstats.signaltonoise(self.testcase)
    assert_almost_equal(ratio, 2.236067977)
def test_1D_float96(self):
    """Check ``mstats.hmean`` with ``dtype=np.float96`` on a 1-D masked array.

    The last of the four values is masked, so the expected harmonic mean is
    3 / (1/1 + 1/2 + 1/3).  Both the value (14 decimals) and the result dtype
    are checked.

    Raises
    ------
    unittest.SkipTest
        If this NumPy build does not define ``float96``.
    """
    # np.float96 only exists on some platforms; skip instead of failing with
    # AttributeError where it is missing.
    float96 = getattr(np, 'float96', None)
    if float96 is None:
        import unittest
        raise unittest.SkipTest('cannot find float96 so skipping')

    a = ma.array((1, 2, 3, 4), mask=(0, 0, 0, 1))
    actual_dt = mstats.hmean(a, dtype=float96)
    desired_dt = np.asarray(3. / (1./1 + 1./2 + 1./3), dtype=float96)
    assert_almost_equal(actual_dt, desired_dt, decimal=14)
    assert_(actual_dt.dtype == desired_dt.dtype)
def test_1dpredict(self):
    """Basic test 1d - prediction"""
    (x_values, y_values, fit_points, _, _, expected) = self.d
    model = loess(x_values, y_values, span=2./3.)
    model.fit()
    # Predict at the fit points without standard errors.
    model.predict(fit_points, stderror=False)
    assert_almost_equal(model.predicted.values, expected[2], 6)
def test_signaltonoise(self):
    """Check ``mstats.signaltonoise(self.testcase)``.

    This is not in R, so used
    mean(testcase,axis=0)/(sqrt(var(testcase)*3/4))
    """
    ratio = mstats.signaltonoise(self.testcase)
    assert_almost_equal(ratio, 2.236067977)
def test_normaltest(self):
    """Check that ``stats.normaltest`` and ``stats.mstats.normaltest`` agree.

    Only sample sizes greater than 8 are exercised; the first two result
    elements must match to 10 decimals.
    """
    for size in self.get_n():
        if not size > 8:
            continue
        x, y, xm, ym = self.generate_xy_sample(size)
        reference = stats.normaltest(x)
        candidate = stats.mstats.normaltest(xm)
        for index in (0, 1):
            assert_almost_equal(reference[index], candidate[index], 10)
def test_cost():
    """Check the ``LogisticGraph`` cost on the ``ex4`` data for two parameter vectors.

    The cost is compiled with ``theano.function`` and evaluated with the
    third input set to 0; results must match to 3 decimals.
    """
    data = ex4()
    X = add_bias(data['x'])
    # Two-column target: y and its complement.
    y = np.vstack([data['y'], 1 - data['y']]).T

    cases = (
        ([.01, .01, .01], 0.7785),
        ([-0.23201762, -6.826957, -13.900436], 651.61406),
    )
    for theta, expected_cost in cases:
        g = LogisticGraph(np.array(theta))
        cost_fn = theano.function([g.x, g.y, g.l], g.cost)
        assert_almost_equal(cost_fn(X, y, 0.), expected_cost, decimal=3)
def test_pearsonr(self):
    """Check that ``stats.pearsonr`` and ``stats.mstats.pearsonr`` agree.

    The coefficient and the p-value must match to 14 decimals for every
    sample size from ``self.get_n()``.
    """
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        r, p = stats.pearsonr(x, y)
        rm, pm = stats.mstats.pearsonr(xm, ym)
        for expected, actual in ((r, rm), (p, pm)):
            assert_almost_equal(expected, actual, 14)
def test_2d_w_missing(self):
    """Test corrcoef on 2D variable w/ missing value.

    The last element of ``self.data`` is masked in place before reshaping to
    3x4; only the top-left block (last row and column dropped) is compared
    with ``np.corrcoef``.
    """
    data = self.data
    data[-1] = masked
    grid = data.reshape(3, 4)
    masked_result = corrcoef(grid)
    reference = np.corrcoef(grid)
    assert_almost_equal(masked_result[:-1, :-1], reference[:-1, :-1])
def test_sem(self):
    """Check ``mstats.sem(self.testcase)``.

    This is not in R, so used sqrt(var(testcase)*3/4)/sqrt(3)
    """
    standard_error = mstats.sem(self.testcase)
    assert_almost_equal(standard_error, 0.6454972244)
def test_train():
    """Train a ``LogisticGraph`` on the ``ex4`` data with ``theano_train``.

    The fitted ``theta`` must match reference values to 3 decimals and the
    collected stats must report 5 iterations.
    """
    data = ex4()
    X = add_bias(data['x'])
    # Two-column target: y and its complement.
    y = np.vstack([data['y'], 1 - data['y']]).T
    g = LogisticGraph(np.array([.01, .01, .01]))

    run_stats = {}
    theano_train(g, X, y, l=0., stats=run_stats)

    fitted = g.theta.get_value()
    assert_almost_equal(fitted, [-16.3787, 0.1483, 0.1589], decimal=3)
    assert_equal(run_stats['iterations'], 5)
def test_samplevar(self):
    """Check ``mstats.samplevar(self.testcase)``.

    R does not have 'samplevar' so the following was used:
    var(testcase)*(4-1)/4 where 4 = length(testcase)
    """
    sample_variance = mstats.samplevar(self.testcase)
    assert_almost_equal(sample_variance, 1.25)
def test_regress_simple():
    """Regress a line with sinusoidal noise. Test for #1273.

    Fits ``y = 0.2*x + 10 + sin(t)`` over 100 points with
    ``mstats.linregress`` and checks slope and intercept against reference
    values.
    """
    x = np.linspace(0, 100, 100)
    noise = np.sin(np.linspace(0, 20, 100))
    y = 0.2 * np.linspace(0, 100, 100) + 10 + noise

    # Unpack all five values; only slope and intercept are checked.
    slope, intercept, _r_value, _p_value, _sterr = mstats.linregress(x, y)
    for actual, expected in ((slope, 0.19644990055858422),
                             (intercept, 10.211269918932341)):
        assert_almost_equal(actual, expected)
def test_describe(self):
    """Check that ``stats.describe`` and ``stats.mstats.describe`` agree.

    Both are called with ``ddof=1``; the first six result fields must match
    to 12 decimals.
    """
    num_fields = 6
    for size in self.get_n():
        x, y, xm, ym = self.generate_xy_sample(size)
        reference = stats.describe(x, ddof=1)
        candidate = stats.mstats.describe(xm, ddof=1)
        for field in range(num_fields):
            assert_almost_equal(np.asarray(reference[field]),
                                np.asarray(candidate[field]),
                                decimal=12)
def test_sem(self):
    """Check ``mstats.sem`` against a reference value and across ``ddof``.

    This is not in R, so used: sqrt(var(testcase)*3/4) / sqrt(3)
    Note, differs from stats.sem return due to different ddof (backwards
    compat reasons).
    """
    assert_almost_equal(mstats.sem(self.testcase), 0.55901699437494745)

    # sem with ddof=0, rescaled by sqrt(n/(n-2)), must equal sem with ddof=2.
    n = self.testcase.count()
    sem_ddof0 = mstats.sem(self.testcase, ddof=0)
    rescale = np.sqrt(n/(n-2))
    sem_ddof2 = mstats.sem(self.testcase, ddof=2)
    assert_allclose(sem_ddof0 * rescale, sem_ddof2)
def test_ln_binomial():
    """Check ``hypergeom._ln_binomial`` and its ``py_func`` variant.

    Each variant must match ``log(comb(n, k))`` to 11 decimals on three
    inputs, return 0 for k = 0 and k = n, and raise ``ValueError`` when
    k exceeds n.
    """
    variants = (hypergeom._ln_binomial, hypergeom._ln_binomial.py_func)
    reference_inputs = ((200, 100), (5, 3), (67, 32))
    for ln_binomial in variants:
        for n, k in reference_inputs:
            assert_almost_equal(np.log(comb(n, k)), ln_binomial(n, k), 11)
        assert ln_binomial(100, 0) == 0
        assert ln_binomial(100, 100) == 0
        with pytest.raises(ValueError):
            ln_binomial(200, 300)
def test_2d_without_missing(self):
    """Test corrcoef on 1 2D variable w/o missing values.

    ``corrcoef`` must match ``np.corrcoef`` on the 3x4 reshaped data, both
    with default arguments and with ``rowvar=False`` (with and without
    ``bias=True``).
    """
    grid = self.data.reshape(3, 4)
    assert_almost_equal(np.corrcoef(grid), corrcoef(grid))

    by_column = {'rowvar': False}
    assert_almost_equal(np.corrcoef(grid, **by_column),
                        corrcoef(grid, **by_column))

    with suppress_warnings() as sup:
        # Passing bias triggers a DeprecationWarning; filter it out.
        sup.filter(DeprecationWarning, "bias and ddof have no effect")
        assert_almost_equal(np.corrcoef(grid, bias=True, **by_column),
                            corrcoef(grid, bias=True, **by_column))
def test_2d_with_missing(self):
    """Test corrcoef on 2D variable w/ missing value.

    The last element of ``self.data`` is masked in place before reshaping to
    3x4.  Only the top-left block (last row and column dropped) is compared
    with ``np.corrcoef``, for default arguments and for several ``ddof`` /
    ``bias`` settings.
    """
    data = self.data
    data[-1] = masked
    grid = data.reshape(3, 4)

    masked_result = corrcoef(grid)
    reference = np.corrcoef(grid)
    assert_almost_equal(masked_result[:-1, :-1], reference[:-1, :-1])

    with suppress_warnings() as sup:
        sup.filter(DeprecationWarning, "bias and ddof have no effect")
        # ddof and bias have no or negligible effect on the function
        for options in ({'ddof': -2}, {'ddof': 3}, {'bias': 1}):
            assert_almost_equal(corrcoef(grid, **options)[:-1, :-1],
                                reference[:-1, :-1])
def assert_model_matrices_almost_equal(m, m_expected, decimal=3):
    '''Check that a model's matrices equal a set of expected matrices.

    m          -- object whose asMatrices() returns three matrices
    m_expected -- 3-tuple of the expected matrices, in the same order
    decimal    -- number of decimals passed to assert_almost_equal

    The three positions are reported as transition, emission and initial
    probabilities; an AssertionError carrying the matching message is raised
    on the first mismatch.'''
    (actual_a, actual_b, actual_pi) = m.asMatrices()
    (expected_a, expected_b, expected_pi) = m_expected
    comparisons = (
        (actual_a, expected_a, 'Wrong transition probabilities'),
        (actual_b, expected_b, 'Wrong emission probabilities'),
        (actual_pi, expected_pi, 'Wrong initial probabilities'),
    )
    for actual, expected, message in comparisons:
        assert_almost_equal(np.array(actual), np.array(expected),
                            decimal=decimal, err_msg=message)
def test_spearmanr(self):
    """Tests some computations of Spearman's rho.

    Each data set is checked twice: as plain lists, and with a trailing NaN
    pair converted to masked entries via ``ma.fix_invalid``.  The result for
    the last masked data set is also checked for named attributes.
    """
    short_x = [5.05, 6.75, 3.21, 2.66]
    short_y = [1.65, 2.64, 2.64, 6.95]
    long_x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
              1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7]
    long_y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
              0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4]

    for raw_x, raw_y, rho in ((short_x, short_y, -0.6324555),
                              (long_x, long_y, 0.6887299)):
        assert_almost_equal(mstats.spearmanr(raw_x, raw_y)[0], rho)
        # Appending a NaN pair and masking it must not change the result.
        x = ma.fix_invalid(raw_x + [np.nan])
        y = ma.fix_invalid(raw_y + [np.nan])
        assert_almost_equal(mstats.spearmanr(x, y)[0], rho)

    # test for namedtuple attributes
    res = mstats.spearmanr(x, y)
    attributes = ('correlation', 'pvalue')
    check_named_results(res, attributes, ma=True)
def test_kendalltau(self):
    """Tests some computations of Kendall's tau.

    Inputs contain NaNs that ``ma.fix_invalid`` turns into masked entries.
    The result for the last data set is also checked for named attributes.
    """
    base = ma.fix_invalid([5.05, 6.75, 3.21, 2.66, np.nan])
    others = (
        (ma.fix_invalid([1.65, 26.5, -5.93, 7.96, np.nan]),
         [+0.3333333, 0.4969059]),
        (ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan]),
         [-0.5477226, 0.2785987]),
    )
    for other, expected in others:
        assert_almost_equal(np.asarray(mstats.kendalltau(base, other)), expected)

    # Second data set: the NaNs sit at different positions in x and y.
    x = ma.fix_invalid([0, 0, 0, 0, 20, 20, 0, 60, 0, 20,
                        10, 10, 0, 40, 0, 20, 0, 0, 0, 0, 0, np.nan])
    y = ma.fix_invalid([0, 80, 80, 80, 10, 33, 60, 0, 67, 27,
                        25, 80, 80, 80, 80, 80, 80, 0, 10, 45, np.nan, 0])
    result = mstats.kendalltau(x, y)
    assert_almost_equal(np.asarray(result), [-0.1585188, 0.4128009])

    # test for namedtuple attributes
    res = mstats.kendalltau(x, y)
    attributes = ('correlation', 'pvalue')
    check_named_results(res, attributes, ma=True)