def test_weightstats_3(self):
    """Check weighted 2-d statistics against the ``asrepeats()`` arrays.

    The weighted results of ``DescrStatsW`` are compared with plain
    numpy/scipy results computed on the arrays returned by ``asrepeats()``.
    """
    weighted_1 = DescrStatsW(self.x1_2d, weights=self.w1)
    weighted_2 = DescrStatsW(self.x2_2d, weights=self.w2)
    repeats_1 = weighted_1.asrepeats()
    repeats_2 = weighted_2.asrepeats()

    # column-wise moments of the second sample
    for moment in ('mean', 'var', 'std'):
        assert_almost_equal(getattr(repeats_2, moment)(0),
                            getattr(weighted_2, moment), 14)

    # covariance and correlation matrices of the second sample
    assert_almost_equal(np.cov(repeats_2.T, bias=1), weighted_2.cov, 14)
    assert_almost_equal(np.corrcoef(repeats_2.T), weighted_2.corrcoef, 14)

    # one-sample t-test against the value 3; scipy's ttest_1samp is applied
    # to the whole 2-d array at once
    tstat, pvalue, _ = weighted_1.ttest_mean(3)
    assert_almost_equal([tstat, pvalue], stats.ttest_1samp(repeats_1, 3), 11)

    # two-sample t-test; only the first two entries of the statsmodels
    # result are compared with scipy
    result_sm = CompareMeans(weighted_1, weighted_2).ttest_ind()
    result_sp = stats.ttest_ind(repeats_1, repeats_2)
    assert_almost_equal(result_sm[:2], result_sp, 14)
def test_weightstats_3(self):
    """Compare weighted 2-d results with numpy/scipy on the repeated data.

    ``DescrStatsW`` is built with weights for both samples; every weighted
    statistic is checked against the same statistic computed on the array
    returned by ``asrepeats()``.
    """
    descr_first = DescrStatsW(self.x1_2d, weights=self.w1)
    descr_second = DescrStatsW(self.x2_2d, weights=self.w2)
    expanded_first = descr_first.asrepeats()
    expanded_second = descr_second.asrepeats()

    # descriptive statistics (second sample), all to 14 decimals
    places = 14
    assert_almost_equal(expanded_second.mean(0), descr_second.mean, places)
    assert_almost_equal(expanded_second.var(0), descr_second.var, places)
    assert_almost_equal(expanded_second.std(0), descr_second.std, places)
    assert_almost_equal(np.cov(expanded_second.T, bias=1),
                        descr_second.cov, places)
    assert_almost_equal(np.corrcoef(expanded_second.T),
                        descr_second.corrcoef, places)

    # one-sample t-test with null value 3 (first sample); the third
    # returned value is not compared
    t_value, p_value, _unused = descr_first.ttest_mean(3)
    scipy_one_sample = stats.ttest_1samp(expanded_first, 3)
    assert_almost_equal([t_value, p_value], scipy_one_sample, 11)

    # independent two-sample t-test
    comparison = CompareMeans(descr_first, descr_second)
    statsmodels_two_sample = comparison.ttest_ind()
    scipy_two_sample = stats.ttest_ind(expanded_first, expanded_second)
    assert_almost_equal(statsmodels_two_sample[:2], scipy_two_sample, places)
def test_confint(testdata):
    """Check ``confidence_interval`` against direct ``CompareMeans`` calls.

    With ``'A'`` as the control label, the interval reported for variant
    ``'B'`` must equal the B-minus-A interval from ``CompareMeans``:
    the t interval for ``kpi1`` and the z interval for ``kpi2``.
    """
    result = confidence_interval(testdata, control_label='A')

    def b_versus_a(kpi):
        # Variant 'B' is the first sample, control 'A' the second.
        return CompareMeans(DescrStatsW(testdata[kpi]['B']),
                            DescrStatsW(testdata[kpi]['A']))

    compare_kpi1 = b_versus_a('kpi1')
    compare_kpi2 = b_versus_a('kpi2')
    expected_kpi1 = compare_kpi1.tconfint_diff()
    expected_kpi2 = compare_kpi2.zconfint_diff()

    assert result['B']['kpi1'] == expected_kpi1
    assert result['B']['kpi2'] == expected_kpi2
def test_comparemeans_convenient_interface(self):
    """Smoke-test ``CompareMeans.summary`` and check ``from_data``.

    ``summary`` must return a ``SimpleTable`` for every combination of
    ``use_t`` and ``usevar``, and an instance built with ``from_data`` must
    print the same default summary as one built from ``DescrStatsW`` objects.
    """
    from statsmodels.iolib.table import SimpleTable

    sample_1, sample_2 = self.x1_2d, self.x2_2d
    explicit = CompareMeans(DescrStatsW(sample_1), DescrStatsW(sample_2))

    # smoke test for summary
    option_grid = [(use_t, usevar)
                   for use_t in [True, False]
                   for usevar in ['pooled', 'unequal']]
    for use_t, usevar in option_grid:
        table = explicit.summary(use_t=use_t, usevar=usevar)
        assert_(isinstance(table, SimpleTable))

    # test for from_data method
    from_arrays = CompareMeans.from_data(sample_1, sample_2)
    assert_(str(explicit.summary()) == str(from_arrays.summary()))
def setup_class(cls):
    """Create the shared fixtures: two samples and their statistics objects.

    Sets ``x1``/``x2`` (raw arrays), ``d1``/``d2`` (unweighted
    ``DescrStatsW`` instances) and ``cm`` (their ``CompareMeans``) on ``cls``.
    """
    first_sample = np.array(
        [7.8, 6.6, 6.5, 7.4, 7.3, 7., 6.4, 7.1, 6.7, 7.6, 6.8])
    second_sample = np.array([4.5, 5.4, 6.1, 6.1, 5.4, 5., 4.1, 5.5])

    cls.x1, cls.x2 = first_sample, second_sample
    cls.d1 = DescrStatsW(first_sample)
    cls.d2 = DescrStatsW(second_sample)
    cls.cm = CompareMeans(cls.d1, cls.d2)
def test_ztest_ztost():
    """Compare weighted z-test / z-TOST results with the proportion tests.

    A 0/1 variable with frequency weights is run through ``DescrStatsW``,
    ``ztest`` and ``CompareMeans`` and checked against the separately tested
    functions in ``statsmodels.stats.proportion``.
    """
    import statsmodels.stats.proportion as smprop

    outcomes = [0, 1]
    freq_a = [5, 15]

    # One-sample z-test: only a loose match with the raw frequency weights.
    prop_ztest = smprop.proportions_ztest(15, 20., value=0.5)
    descr_a = DescrStatsW(outcomes, freq_a)
    weighted_ztest = descr_a.ztest_mean(0.5)
    assert_allclose(weighted_ztest, prop_ztest, rtol=0.03, atol=0.003)

    # With the weights rescaled by 21/20 the results agree to 12 decimals.
    # NOTE(review): presumably this offsets a degrees-of-freedom difference
    # between the two implementations - confirm.
    descr_scaled = DescrStatsW(outcomes, np.array(freq_a) * 21. / 20)
    scaled_ztest = descr_scaled.ztest_mean(0.5)
    assert_almost_equal(scaled_ztest, prop_ztest, decimal=12)

    # Equivalence test (TOST) on the rescaled weights; first entry only.
    weighted_tost = descr_scaled.ztost_mean(0.4, 0.6)
    prop_tost = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(weighted_tost[0], prop_tost[0], decimal=12)

    # Two-sample comparison against the chi-square test of proportions.
    descr_b = DescrStatsW([0, 1], [10, 10])
    repeats_ztest = ztest(descr_a.asrepeats(), descr_b.asrepeats())
    prop_chisq = smprop.proportions_chisquare(np.asarray([15, 10]),
                                              np.asarray([20., 20]))
    # TODO: check whether this difference is expected, see test_proportion
    assert_allclose(repeats_ztest[1], prop_chisq[1], rtol=0.03)

    # CompareMeans on the weighted objects must match ztest on the repeats.
    compare_ztest = CompareMeans(descr_a, descr_b).ztest_ind()
    assert_allclose(compare_ztest[1], prop_chisq[1], rtol=0.03)
    assert_almost_equal(compare_ztest, repeats_ztest, decimal=12)
def testcomparison(df, smp1_cols, smp2_cols, test='ttest'): if len(smp1_cols) == 0 or len(smp2_cols) == 0: logging.warning("data not exist for comparison") else: col_stat = 'stat %s' % test col_pval = 'pval %s' % test df.loc[:, col_stat] = np.nan df.loc[:, col_pval] = np.nan for i in df.index: X = DescrStatsW(df.loc[i, smp1_cols].as_matrix()) Y = DescrStatsW(df.loc[i, smp2_cols].as_matrix()) if test == 'ttest': df.loc[i, col_stat], df.loc[i, col_pval], tmp = CompareMeans( X, Y).ttest_ind() if test == 'ztest': df.loc[i, col_stat], df.loc[i, col_pval] = CompareMeans( X, Y).ztest_ind() return df
def mean_diff_confint_ind(sample1, sample2, alpha=0.05):
    """Confidence interval for the difference in means of two independent samples

    Parameters
    ----------
    sample1 : array_like
        First sample
    sample2 : array_like
        Second sample
    alpha : float in (0, 1)
        The confidence level is computed as ``1-alpha``

    Returns
    -------
    lower, upper : floats
        Lower and upper bound of the confidence interval
    """
    first_stats = DescrStatsW(sample1)
    second_stats = DescrStatsW(sample2)
    comparison = CompareMeans(first_stats, second_stats)
    return comparison.tconfint_diff(alpha=alpha)
def test_means(self, population_A, population_B, sample_size_A,
               sample_size_B, variable, SEED):
    """Return the p-value of a z-test comparing two sampled means.

    A sample is drawn from each population with ``.sample(n=...,
    random_state=SEED)`` and the ``variable`` column is selected. The two
    samples are compared with ``CompareMeans.ztest_ind`` using
    ``alternative='larger'`` and ``value=0``.
    """
    # draw one sample per population (same seed for both draws)
    draws = [
        population.sample(n=size, random_state=SEED)[variable]
        for population, size in ((population_A, sample_size_A),
                                 (population_B, sample_size_B))
    ]

    # wrap both samples and set up the comparison
    descr_A, descr_B = (DescrStatsW(draw) for draw in draws)
    comparison = CompareMeans(descr_A, descr_B)

    # only the p-value is reported; the z statistic is discarded
    _z, p_value = comparison.ztest_ind(alternative='larger', value=0)
    return p_value
def mean_method(df, target_col):
    """Estimate the treatment effect as a plain difference in means.

    Only rows with ``in_delta == 1`` are used. The effect is the mean of
    ``target_col`` for ``treatment == 1`` minus the mean for
    ``treatment == 0``. The effect, the confidence interval of the
    difference and the two-sided t-test p-value are printed.

    Returns
    -------
    The difference in means (treated minus control).
    """
    control = df.loc[(df['treatment'] == 0) & (df['in_delta'] == 1)][target_col]
    treated = df.loc[(df['treatment'] == 1) & (df['in_delta'] == 1)][target_col]

    control_mean = control.mean()
    treated_mean = treated.mean()
    effect = treated_mean - control_mean

    print(f"{'=' * 10}Mean Method Results{'=' * 10}")
    print(f"Treatment effect on {target_col} is {effect}")

    # treated sample first, so the interval refers to treated minus control
    cm = CompareMeans.from_data(treated.values, control.values)
    result = cm.ttest_ind(alternative='two-sided')
    print(f"Two-sided T test: CI={np.round(cm.tconfint_diff(), 3)} pvalue={round(result[1], 3)}")
    return effect
def mean_diff_confint_ind(sample1, sample2, alpha=0.05):
    """Confidence interval for the difference in means of two independent samples.

    Parameters
    ----------
    sample1 : array_like
        First sample.
    sample2 : array_like
        Second sample.
    alpha : float, optional
        Significance level passed to ``CompareMeans.tconfint_diff``. The
        default of 0.05 is the library default, so existing two-argument
        calls behave as before.

    Returns
    -------
    The result of ``CompareMeans.tconfint_diff`` for ``sample1`` versus
    ``sample2``.
    """
    cm = CompareMeans(DescrStatsW(sample1), DescrStatsW(sample2))
    return cm.tconfint_diff(alpha=alpha)
def test_ttest_2sample(self):
    """Two-sample t-test and confidence interval with weights.

    The weighted results are compared with ``scipy.stats.ttest_ind`` on the
    repeated data. The tests and intervals must also give the same result
    whatever ``ddof`` the user passed to ``DescrStatsW``.
    """
    x1, x2 = self.x1, self.x2
    w1, w2 = self.w1, self.w2

    def compare(ddof_1, ddof_2):
        # CompareMeans of the weighted samples with the given ddof values.
        return CompareMeans(DescrStatsW(x1, weights=w1, ddof=ddof_1),
                            DescrStatsW(x2, weights=w2, ddof=ddof_2))

    # Note: stats.ttest_ind handles 2d/nd arguments
    res_sp = stats.ttest_ind(self.x1r, self.x2r)
    assert_almost_equal(ttest_ind(x1, x2, weights=(w1, w2))[:2], res_sp, 14)

    # check correct ttest independent of user ddof
    for ddof_pair in [(0, 1), (1, 2)]:
        assert_almost_equal(compare(*ddof_pair).ttest_ind()[:2], res_sp, 14)

    cm0, cm1, cm2 = compare(0, 0), compare(0, 1), compare(1, 2)

    # unequal-variance t-test: all ddof settings must agree with ddof=0
    base, other_1, other_2 = (
        cm.ttest_ind(usevar='unequal') for cm in (cm0, cm1, cm2))
    assert_almost_equal(other_1, base, 14)
    assert_almost_equal(other_2, base, 14)

    # check confint independent of user ddof
    for usevar in ('pooled', 'unequal'):
        base, other_1, other_2 = (
            cm.tconfint_diff(usevar=usevar) for cm in (cm0, cm1, cm2))
        assert_almost_equal(other_1, base, 14)
        assert_almost_equal(other_2, base, 14)
def fn(control, test):
    """Return the confidence interval of the difference ``test - control``.

    The z interval is used when ``_is_proportion(control, test)`` is true,
    the t interval otherwise.
    """
    # test sample first, control second
    comparison = CompareMeans(DescrStatsW(test), DescrStatsW(control))
    if _is_proportion(control, test):
        interval = comparison.zconfint_diff
    else:
        interval = comparison.tconfint_diff
    return interval()
def test_ttest_2sample(self):
    """Weighted two-sample t-test: agreement with scipy and ddof invariance.

    First the weighted t-test is checked against ``scipy.stats.ttest_ind``
    on the repeated data. Then the unequal-variance t-test and both
    confidence intervals are checked to be identical for three different
    ``ddof`` settings of the underlying ``DescrStatsW`` objects.
    """
    data_1, data_2 = self.x1, self.x2
    weights_1, weights_2 = self.w1, self.w2
    decimals = 14

    # Note: stats.ttest_ind handles 2d/nd arguments
    scipy_result = stats.ttest_ind(self.x1r, self.x2r)
    function_result = ttest_ind(data_1, data_2, weights=(weights_1, weights_2))
    assert_almost_equal(function_result[:2], scipy_result, decimals)

    # check correct ttest independent of user ddof
    mixed_a = CompareMeans(DescrStatsW(data_1, weights=weights_1, ddof=0),
                           DescrStatsW(data_2, weights=weights_2, ddof=1))
    assert_almost_equal(mixed_a.ttest_ind()[:2], scipy_result, decimals)

    mixed_b = CompareMeans(DescrStatsW(data_1, weights=weights_1, ddof=1),
                           DescrStatsW(data_2, weights=weights_2, ddof=2))
    assert_almost_equal(mixed_b.ttest_ind()[:2], scipy_result, decimals)

    # one comparison object per ddof combination; the first is the baseline
    comparisons = [
        CompareMeans(DescrStatsW(data_1, weights=weights_1, ddof=ddof_1),
                     DescrStatsW(data_2, weights=weights_2, ddof=ddof_2))
        for ddof_1, ddof_2 in ((0, 0), (0, 1), (1, 2))
    ]

    # unequal-variance test first, then check confint independent of user ddof
    checks = [
        lambda cm: cm.ttest_ind(usevar='unequal'),
        lambda cm: cm.tconfint_diff(usevar='pooled'),
        lambda cm: cm.tconfint_diff(usevar='unequal'),
    ]
    for check in checks:
        outcomes = [check(cm) for cm in comparisons]
        assert_almost_equal(outcomes[1], outcomes[0], decimals)
        assert_almost_equal(outcomes[2], outcomes[0], decimals)
info = max(values, key=itemgetter(2)) a = info[0] - 1 b = info[1] - 1 x = range(1, 10) n = a + 1 l = "User:"******"User:"******"Pearson:", scipy.stats.pearsonr(R[a], R[b])) print(CompareMeans(DescrStatsW(R[a]), DescrStatsW(R[b])).summary()) results = sm.OLS(R[b], R[a]).fit() print(results.summary()) for i in range(0, 9): if R[a][i] == 0: if R[b][i] >= 3: print("product", i + 1, "recommended to user", a + 1) for i in range(0, 9): if R[b][i] == 0: if R[a][i] >= 3: print("product", i + 1, "recommended to user", b + 1)
# Critério do valor p # Teste Unicaudal # Rejeitar H_0 se o valor p\leq\alpha from statsmodels.stats.weightstats import DescrStatsW, CompareMeans test_setosa = DescrStatsW(setosa) test_virginica = DescrStatsW(virginica) test_A = test_setosa.get_compare(test_virginica) z, p_valor = test_A.ztest_ind(alternative='larger', value=0) print('O valor de z é ', z) print('O p-valor é ', p_valor) test_B = CompareMeans(test_setosa, test_virginica) z, p_valor = test_B.ztest_ind(alternative='larger', value=0) print('O valor de z é ', z) print('O p-valor é ', p_valor) p_valor <= significancia # Aceitamos a hipótese nula. # Testes não Paramétricos """ Teste do Qui-Quadrado (\chi^2) Também conhecido como teste de adequação ao ajustamento, seu nome se deve ao fato de utilizar uma variável estatística padronizada, representada pela
32.6, 33.3, 32.2 ] a = 0.05 # 有意水準(デフォルト) = 1 - 信頼係数 alt = 'two-sided' # 両側検定(デフォルト) # 左片側検定なら'smaller' # 右片側検定なら'larger' d = DescrStatsW(np.array(X) - np.array(Y)) # 対標本の場合 d.ttest_mean(alternative=alt)[1] # p値 #> 0.0006415571512322235 d.tconfint_mean(alpha=a, alternative=alt) # 信頼区間 #> (-3.9955246743198867, -1.3644753256801117) c = CompareMeans(DescrStatsW(X), DescrStatsW(Y)) # 対標本でない場合 ve = 'pooled' # 等分散を仮定する(デフォルト).仮定しないなら'unequal'. c.ttest_ind(alternative=alt, usevar=ve)[1] # p値 #> 0.000978530937238609 c.tconfint_diff(alpha=a, alternative=alt, usevar=ve) # 信頼区間 #> (-4.170905570517185, -1.1890944294828283) ### 4.4.4 独立性の検定(カイ2乗検定) import pandas as pd my_url = ('https://raw.githubusercontent.com/taroyabuki' '/fromzero/master/data/smoker.csv') my_data = pd.read_csv(my_url)