def get_peruser_diff_zstat(df, nobs_name, mean_name, std_name,
                           test_group_column='test_group', test='TEST',
                           control='CONTROL', alpha=0.05,
                           alternative='two-sided'):
    '''Z-test and confidence interval for the relative difference in means
    between a test and a control group.

    Similar to
    https://www.statsmodels.org/dev/generated/statsmodels.stats.weightstats.CompareMeans.zconfint_diff.html

    Example:
        pre = s1.build_and_run_sql(grouping=['test_group', 'user_id'],
                                   sql_template=TEMPLATE_USER_STATS)
        get_peruser_diff_zstat(pre, 'units', 'buyers mean', 'buyers std')

    Parameters
    ----------
    df : pandas.DataFrame
        One row per group, containing the columns named by ``nobs_name``,
        ``mean_name`` and ``std_name``.
    nobs_name, mean_name, std_name : str
        Column names holding each group's sample size, mean and standard
        deviation.
    test_group_column : str
        Column identifying the group of each row.
    test, control : str
        Values of ``test_group_column`` selecting the two compared groups.
    alpha : float
        significance level for the confidence interval,
        coverage is ``1-alpha``
    alternative : string
        This specifies the alternative hypothesis for the test that
        corresponds to the confidence interval. The alternative hypothesis,
        H1, has to be one of the following :

        'two-sided': H1: difference in means not equal to value (default)
        'larger' :   H1: difference in means larger than value
        'smaller' :  H1: difference in means smaller than value

    Returns
    -------
    test_mean, control_mean, rel_diff, rel_confint, zstat, pvalue
        The two group means, the difference relative to the control mean,
        the (lower, upper) confidence interval for that relative
        difference, the z statistic and its p-value.
    '''
    summary = df.set_index(test_group_column)
    d1 = summary.loc[test, :]
    d2 = summary.loc[control, :]
    diff = d1[mean_name] - d2[mean_name]
    # Assume unequal variance (Welch-style standard error of the difference).
    std_diff = np.sqrt(d1[std_name]**2 / (d1[nobs_name] - 1)
                       + d2[std_name]**2 / (d2[nobs_name] - 1))
    # _zconfint_generic returns a (lower, upper) tuple; convert to an array
    # so the interval can be scaled by the control mean below — the original
    # tuple / float division raised TypeError.
    confint = np.asarray(wstats._zconfint_generic(diff, std_diff,
                                                  alpha=alpha,
                                                  alternative=alternative))
    zstat, pvalue = wstats._zstat_generic2(diff, std_diff, alternative)
    return (d1[mean_name], d2[mean_name], diff / d2[mean_name],
            confint / d2[mean_name], zstat, pvalue)
def proportions_ztest(count, nobs, value=None, alternative='two-sided',
                      prop_var=False):
    '''test for proportions based on normal (z) test

    Parameters
    ----------
    count : integer or array_like
        the number of successes in nobs trials. If this is array_like, then
        the assumption is that this represents the number of successes for
        each independent sample
    nobs : integer or array_like
        the number of trials or observations, with the same length as count.
    value : None or float or array_like
        This is the value of the null hypothesis equal to the proportion in
        the case of a one sample test. In the case of a two-sample test, the
        null hypothesis is that prop[0] - prop[1] = value, where prop is the
        proportion in the two samples. If not provided, value = 0 and the
        null is prop[0] = prop[1].
    alternative : string in ['two-sided', 'smaller', 'larger']
        The alternative hypothesis can be either two-sided or one of the one-
        sided tests, smaller means that the alternative hypothesis is
        ``prop < value`` and larger means ``prop > value``, or the
        corresponding inequality for the two sample test.
    prop_var : False or float in (0, 1)
        If prop_var is false, then the variance of the proportion estimate is
        calculated based on the sample proportion. Alternatively, a
        proportion can be specified to calculate this variance. Common use
        case is to use the proportion under the Null hypothesis to specify
        the variance of the proportion estimate.
        TODO: change options similar to propotion_ztost ?

    Returns
    -------
    zstat : float
        test statistic for the z-test
    p-value : float
        p-value for the z-test

    Notes
    -----
    This uses a simple normal test for proportions. It should be the same as
    running the mean z-test on the data encoded 1 for event and 0 for no
    event, so that the sum corresponds to count.

    In the one and two sample cases with two-sided alternative, this test
    produces the same p-value as ``proportions_chisquare``, since the
    chisquare is the distribution of the square of a standard normal
    distribution.
    (TODO: verify that this really holds)

    TODO: add continuity correction or other improvements for small samples.
    '''
    count = np.asarray(count)
    nobs = np.asarray(nobs)
    # broadcast a scalar nobs to match the number of samples in count
    if nobs.size == 1:
        nobs = nobs * np.ones_like(count)

    prop = count * 1. / nobs
    k_sample = np.size(prop)
    # Original code crashed with the default ``value=None`` (``prop - None``
    # raises TypeError). Default to a zero null difference for the
    # two-sample case and require an explicit null value for one sample.
    if value is None:
        if k_sample == 1:
            raise ValueError('value must be provided for a 1-sample test')
        value = 0
    if k_sample == 1:
        diff = prop - value
    elif k_sample == 2:
        diff = prop[0] - prop[1] - value
    else:
        msg = 'more than two samples are not implemented yet'
        raise NotImplementedError(msg)

    # pooled proportion under the Null, unless an explicit proportion was
    # supplied via prop_var for the variance of the estimate
    p_pooled = np.sum(count) * 1. / np.sum(nobs)
    nobs_fact = np.sum(1. / nobs)
    if prop_var:
        p_pooled = prop_var
    var_ = p_pooled * (1 - p_pooled) * nobs_fact
    std_diff = np.sqrt(var_)
    from statsmodels.stats.weightstats import _zstat_generic2
    return _zstat_generic2(diff, std_diff, alternative)
def test_poisson_2indep(count1, exposure1, count2, exposure2, ratio_null=1,
                        method='score', alternative='two-sided',
                        etest_kwds=None):
    '''test for ratio of two sample Poisson intensities

    If the two Poisson rates are g1 and g2, then the Null hypothesis is

    - H0: g1 / g2 = ratio_null

    against one of the following alternatives

    - H1_2-sided: g1 / g2 != ratio_null
    - H1_larger: g1 / g2 > ratio_null
    - H1_smaller: g1 / g2 < ratio_null

    Parameters
    ----------
    count1 : int
        Number of events in first sample.
    exposure1 : float
        Total exposure (time * subjects) in first sample.
    count2 : int
        Number of events in second sample.
    exposure2 : float
        Total exposure (time * subjects) in second sample.
    ratio_null : float
        ratio of the two Poisson rates under the Null hypothesis.
        Default is 1.
    method : string
        Method for the test statistic and the p-value. Defaults to
        `'score'`. Current Methods are based on Gu et. al 2008. Implemented
        are 'wald', 'score' and 'sqrt' based asymptotic normal distribution,
        and the exact conditional test 'exact-cond', and its mid-point
        version 'cond-midp'. method='etest' and method='etest-wald' provide
        pvalues from `etest_poisson_2indep` using score or wald statistic
        respectively. see Notes.
    alternative : string
        The alternative hypothesis, H1, has to be one of the following

        - 'two-sided': H1: ratio of rates is not equal to ratio_null
          (default)
        - 'larger' :   H1: ratio of rates is larger than ratio_null
        - 'smaller' :  H1: ratio of rates is smaller than ratio_null
    etest_kwds : dictionary
        Additional parameters to be passed to the etest_poisson_2indep
        function, namely ygrid.

    Returns
    -------
    results : instance of HolderTuple class
        The two main attributes are test statistic `statistic` and p-value
        `pvalue`.

    Notes
    -----
    - 'wald': method W1A, wald test, variance based on separate estimates
    - 'score': method W2A, score test, variance based on estimate under Null
    - 'wald-log': W3A
    - 'score-log' W4A
    - 'sqrt': W5A, based on variance stabilizing square root transformation
    - 'exact-cond': exact conditional test based on binomial distribution
    - 'cond-midp': midpoint-pvalue of exact conditional test
    - 'etest': etest with score test statistic
    - 'etest-wald': etest with wald test statistic

    References
    ----------
    Gu, Ng, Tang, Schucany 2008: Testing the Ratio of Two Poisson Rates,
    Biometrical Journal 50 (2008) 2, 2008

    See Also
    --------
    tost_poisson_2indep
    etest_poisson_2indep
    '''
    # shortcut names
    y1, n1, y2, n2 = count1, exposure1, count2, exposure2
    d = n2 / n1
    r = ratio_null
    # null ratio rescaled by the exposure ratio, so raw counts can be
    # compared directly in the statistics below
    r_d = r / d

    if method in ['score']:
        # variance based on the estimate under the Null (W2A per Notes)
        stat = (y1 - y2 * r_d) / np.sqrt((y1 + y2) * r_d)
        dist = 'normal'
    elif method in ['wald']:
        # variance based on separate estimates (W1A per Notes)
        stat = (y1 - y2 * r_d) / np.sqrt(y1 + y2 * r_d**2)
        dist = 'normal'
    elif method in ['sqrt']:
        # variance stabilizing square root transformation (W5A per Notes)
        stat = 2 * (np.sqrt(y1 + 3 / 8.) - np.sqrt((y2 + 3 / 8.) * r_d))
        stat /= np.sqrt(1 + r_d)
        dist = 'normal'
    elif method in ['exact-cond', 'cond-midp']:
        # conditional on the total count, y1 is binomial with probability bp
        from statsmodels.stats import proportion
        bp = r_d / (1 + r_d)
        y_total = y1 + y2
        stat = None  # no normal statistic for the exact conditional test
        # TODO: why y2 in here and not y1, check definition of H1 "larger"
        pvalue = proportion.binom_test(y1, y_total, prop=bp,
                                       alternative=alternative)
        if method in ['cond-midp']:
            # not inplace in case we still want binom pvalue
            pvalue = pvalue - 0.5 * stats.binom.pmf(y1, y_total, bp)
        dist = 'binomial'
    elif method.startswith('etest'):
        # delegate to the exact e-test with either wald or score statistic
        if method.endswith('wald'):
            method_etest = 'wald'
        else:
            method_etest = 'score'
        if etest_kwds is None:
            etest_kwds = {}
        stat, pvalue = etest_poisson_2indep(
            count1, exposure1, count2, exposure2, ratio_null=ratio_null,
            method=method_etest, alternative=alternative, **etest_kwds)
        dist = 'poisson'
    else:
        raise ValueError('method not recognized')

    if dist == 'normal':
        # convert the z statistic into a p-value for the chosen alternative
        stat, pvalue = _zstat_generic2(stat, 1, alternative)

    rates = (y1 / n1, y2 / n2)
    ratio = rates[0] / rates[1]
    res = HolderTuple(statistic=stat,
                      pvalue=pvalue,
                      distribution=dist,
                      method=method,
                      alternative=alternative,
                      rates=rates,
                      ratio=ratio,
                      ratio_null=ratio_null)
    return res
def proportions_ztest(count, nobs, value=None, alternative='two-sided', prop_var=False): """ Test for proportions based on normal (z) test Parameters ---------- count : integer or array_like the number of successes in nobs trials. If this is array_like, then the assumption is that this represents the number of successes for each independent sample nobs : integer or array-like the number of trials or observations, with the same length as count. value : float, array_like or None, optional This is the value of the null hypothesis equal to the proportion in the case of a one sample test. In the case of a two-sample test, the null hypothesis is that prop[0] - prop[1] = value, where prop is the proportion in the two samples. If not provided value = 0 and the null is prop[0] = prop[1] alternative : string in ['two-sided', 'smaller', 'larger'] The alternative hypothesis can be either two-sided or one of the one- sided tests, smaller means that the alternative hypothesis is ``prop < value` and larger means ``prop > value``, or the corresponding inequality for the two sample test. prop_var : False or float in (0, 1) If prop_var is false, then the variance of the proportion estimate is calculated based on the sample proportion. Alternatively, a proportion can be specified to calculate this variance. Common use case is to use the proportion under the Null hypothesis to specify the variance of the proportion estimate. Returns ------- zstat : float test statistic for the z-test p-value : float p-value for the z-test Examples -------- >>> count = 5 >>> nobs = 83 >>> value = .05 >>> stat, pval = proportions_ztest(count, nobs, value) >>> print('{0:0.3f}'.format(pval)) 0.695 >>> import numpy as np >>> from statsmodels.stats.proportion import proportions_ztest >>> count = np.array([5, 12]) >>> nobs = np.array([83, 99]) >>> stat, pval = proportions_ztest(counts, nobs) >>> print('{0:0.3f}'.format(pval)) 0.159 Notes ----- This uses a simple normal test for proportions. 
It should be the same as running the mean z-test on the data encoded 1 for event and 0 for no event so that the sum corresponds to the count. In the one and two sample cases with two-sided alternative, this test produces the same p-value as ``proportions_chisquare``, since the chisquare is the distribution of the square of a standard normal distribution. """ # TODO: verify that this really holds # TODO: add continuity correction or other improvements for small samples # TODO: change options similar to propotion_ztost ? count = np.asarray(count) nobs = np.asarray(nobs) if nobs.size == 1: nobs = nobs * np.ones_like(count) prop = count * 1. / nobs k_sample = np.size(prop) if value is None: if k_sample == 1: raise ValueError('value must be provided for a 1-sample test') value = 0 if k_sample == 1: diff = prop - value elif k_sample == 2: diff = prop[0] - prop[1] - value else: msg = 'more than two samples are not implemented yet' raise NotImplementedError(msg) p_pooled = np.sum(count) * 1. / np.sum(nobs) nobs_fact = np.sum(1. / nobs) if prop_var: p_pooled = prop_var var_ = p_pooled * (1 - p_pooled) * nobs_fact std_diff = np.sqrt(var_) from statsmodels.stats.weightstats import _zstat_generic2 return _zstat_generic2(diff, std_diff, alternative)
for Id in IdLs: BranchScoreDf.loc[Indx, Id] = abs(row[f"{Id}_Subs"] - AvgLen[f"{Id}_Subs"]) #**2 BranchScoreDf['TreeFull'] = BranchScoreDf[IdLs].sum(axis=1) BranchScoreDf.to_csv('BranchScoreDf.tsv', sep='\t', header=True, index=True, index_label='Id') BraScStatSigDf = pd.DataFrame(dtype=np.float64) ColLs = list(BranchScoreDf.columns) for Indx, row in BranchScoreDf.iterrows(): for Col in ColLs: BraScStatSigDf.at[Indx, Col] = weightstats._zstat_generic2( row[Col], BsDf.at[Indx, f"SE_{Col}"], alternative='two-sided', )[-1] / 2 BraScStatSigDf.to_csv('BraScStatSigDf.tsv', sep='\t', header=True, index=True, index_label='Id') FdrCorrection(BraScStatSigDf) BraScStatSigDf.to_csv('FdrBraScStatSigDf.tsv', sep='\t', header=True, index=True, index_label='Id')