def SBDFP_Calc(DF, FP="ECFP4", FORMAT="RDKit"):
    """Compute the SB-DFP bit string of a dataset against reference bit counts.

    For every bit position a one-sided two-proportion z-test
    (``alternative='smaller'``) compares the reference "1" count with the
    count observed in ``DF``; the output bit is 1 when the p-value is below
    0.01 and 0 otherwise.

    Parameters
    ----------
    DF : DataFrame
        Must hold a column named "ECFP4FP" or "MACCSFP" (matching ``FP``)
        containing the fingerprints as RDKit objects. It can be taken from
        the LoadDatasetFromCSV function.
    FP : str
        "ECFP4" or "MACCS". Selects both the fingerprint column and the
        reference file ("ECFP4.counts" / "MACCS.counts", read from the
        current working directory; first line = comma-separated "1" counts).
    FORMAT : str
        "RDKit" to return an RDKit bit vector, "TEXT" to return the bit
        string. Any other value returns None, as before.

    Raises
    ------
    ValueError
        If ``FP`` is neither "ECFP4" nor "MACCS" (this previously surfaced
        as an UnboundLocalError at the return statement).
    """
    if FP not in ("ECFP4", "MACCS"):
        raise ValueError('FP must be "ECFP4" or "MACCS", got %r' % (FP,))

    # Sample size paired with the reference counts in the z-test.
    # NOTE(review): presumably the number of molecules behind the .counts
    # files -- confirm before reusing with other reference data.
    REF_NOBS = 15403690

    # Per-bit "1" counts over the dataset; zip(*) sums column-wise.
    bit_rows = [
        [int(bit) for bit in DataStructs.BitVectToText(fingerprint)]
        for fingerprint in DF[FP + "FP"]
    ]
    DF_COUNTS = [sum(column) for column in zip(*bit_rows)]

    # Context manager so the reference file is always closed.
    with open(FP + ".counts") as REF:
        REF_COUNTS = [int(x) for x in REF.readline().split(",")]

    n_rows = DF.shape[0]
    bits = []
    for i, ref_count in enumerate(REF_COUNTS):
        stat, pval = proportions_ztest([ref_count, DF_COUNTS[i]],
                                       [REF_NOBS, n_rows],
                                       alternative='smaller')
        bits.append("1" if pval < 0.01 else "0")

    SBDFP = "".join(bits)
    SBDFP_RDKIT = DataStructs.CreateFromBitString(SBDFP)
    if FORMAT == "RDKit":
        return SBDFP_RDKIT
    elif FORMAT == "TEXT":
        return SBDFP
def perform_z_test(m1, n1, m2, n2, delta, alpha):
    """Test whether two proportions differ by more than ``delta``.

    The sample with the larger observed proportion is passed first to
    ``proportions_ztest(..., value=delta, alternative='larger')``, so the
    test asks whether the gap between the proportions exceeds ``delta``.

    Parameters: ``m1``/``m2`` are success counts, ``n1``/``n2`` the sample
    sizes, ``delta`` the margin and ``alpha`` the significance threshold.

    Returns a pair ``(significant, details)``: ``significant`` is True when
    the p-value is below ``alpha``; ``details`` is a dict with the inputs,
    the observed proportions and the test outcome. If the test call raises,
    the function prints a message and returns ``False`` with NaN statistics.
    """
    p1 = 0 if n1 == 0 else m1 / n1
    p2 = 0 if n2 == 0 else m2 / n2
    # Guarded like p1/p2: with two empty samples this division used to
    # raise before the try block was even entered.
    n_total = n1 + n2
    p_hat = np.nan if n_total == 0 else (m1 + m2) / n_total
    if __VERBOSE__:
        print('p1, p2, abs(p1 - p2), delta, se:', p1, p2, abs(p1 - p2), delta)

    # Leading entries shared by the success and the failure report.
    details = {M1_ATTR_NAME: m1, N1_ATTR_NAME: n1,
               M2_ATTR_NAME: m2, N2_ATTR_NAME: n2,
               P1_ATTR_NAME: p1, P2_ATTR_NAME: p2,
               'diff': abs(p1 - p2)}
    try:
        if p1 > p2:
            stat_test = proportions_ztest([m1, m2], [n1, n2], value=delta,
                                          alternative='larger')
        else:
            stat_test = proportions_ztest([m2, m1], [n2, n1], value=delta,
                                          alternative='larger')
        statistic, pvalue = stat_test[0], stat_test[1]
        significant = pvalue < alpha
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print('z_test assumption not met')
        details.update({'p_hat': np.nan, 'se': np.nan,
                        STATISTICS_ATTR_NAME: np.nan,
                        PVALUE_ATTR_NAME: np.nan,
                        'delta': delta, 'alpha': alpha,
                        'significant_diff': False})
        return False, details

    details.update({'p_hat': p_hat,
                    STATISTICS_ATTR_NAME: statistic,
                    PVALUE_ATTR_NAME: pvalue,
                    'delta': delta, 'alpha': alpha,
                    'significant_diff': significant})
    return significant, details
def t_test(curDict, sub):
    """Run pairwise two-proportion z-tests between populations.

    Despite its name, this calls ``proportions_ztest``. For every unordered
    pair of populations the success count is ``sum(curDict[pop])`` and the
    number of observations is ``len(curDict[pop])``.

    Parameters
    ----------
    curDict : mapping from population name to a sequence of values.
        Nothing is tested when ``curDict['Africa']`` is empty.
    sub : truthy to compare the module-level ``subpopulations``, falsy to
        compare ``superpopulations``.

    Returns
    -------
    dict mapping each p-value to the list of ``[pop1, pop2]`` pairs that
    produced it (empty when nothing was tested).
    """
    td = {}
    if len(curDict['Africa']) == 0:
        return td

    populations = list(subpopulations if sub else superpopulations)
    # Each unordered pair once, in the order the populations are listed.
    for position, pop1 in enumerate(populations):
        for pop2 in populations[position + 1:]:
            count = np.asarray([sum(curDict[pop1]), sum(curDict[pop2])])
            nobs = np.asarray([len(curDict[pop1]), len(curDict[pop2])])
            _, P = proportions_ztest(count, nobs)
            td.setdefault(P, []).append([pop1, pop2])
    return td
def test_proportion_ztests():
    """Check that proportions_ztest p-values agree with proportions_chisquare.

    Currently only a consistency test; alternative handling is generic.
    """
    # One-sample case with the variance taken at the null value.
    z_result = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)
    chi2_result = smprop.proportions_chisquare(15, 20., value=0.5)
    assert_almost_equal(z_result[1], chi2_result[1], decimal=13)

    # Two-sample case with float nobs; only the p-value is compared.
    counts = np.asarray([15, 10])
    nobs_float = np.asarray([20., 20])
    z_result = smprop.proportions_ztest(counts, nobs_float, value=0,
                                        prop_var=None)
    chi2_result = smprop.proportions_chisquare(counts, nobs_float)
    assert_almost_equal(z_result[1], chi2_result[1], decimal=13)

    # Two-sample case with integer nobs, issue #7603.
    nobs_int = np.asarray([20, 50000])
    z_result = smprop.proportions_ztest(counts, nobs_int, value=0,
                                        prop_var=None)
    chi2_result = smprop.proportions_chisquare(counts, nobs_int)
    assert_almost_equal(z_result[1], chi2_result[1], decimal=13)
    # Expected counts should be positive.
    assert_array_less(0, chi2_result[-1][1])
def test_proportion_ztests():
    """Consistency check: z-test p-values must match the chi-square ones.

    Alternative handling is generic, so only the default is exercised.
    """
    # Single proportion against 0.5.
    pvalue_z = smprop.proportions_ztest(15, 20., value=0.5, prop_var=0.5)[1]
    pvalue_chi2 = smprop.proportions_chisquare(15, 20., value=0.5)[1]
    assert_almost_equal(pvalue_z, pvalue_chi2, decimal=13)

    # Two independent proportions; only the p-value is tested.
    successes = np.asarray([15, 10])
    trials = np.asarray([20., 20])
    pvalue_z = smprop.proportions_ztest(successes, trials, value=0,
                                        prop_var=None)[1]
    pvalue_chi2 = smprop.proportions_chisquare(successes, trials)[1]
    assert_almost_equal(pvalue_z, pvalue_chi2, decimal=13)
def test_hypothesis_proportions():
    """Run a one-sample, one-sided proportion z-test and print the verdict.

    source: https://sonalake.com/latest/hypothesis-testing-of-proportion-based-samples/
    """
    significance = 0.05

    # Observed sample: 1367 good out of 1520.
    sample_success, sample_size = 1367, 1520

    # H0: the true proportion is 85%.
    null_hypothesis = 0.85

    # Ha > H0 here; use alternative='smaller' for Ha < H0 and
    # alternative='two-sided' for Ha != H0.
    stat, p_value = proportions_ztest(count=sample_success,
                                      nobs=sample_size,
                                      value=null_hypothesis,
                                      alternative='larger')

    print('z_stat: %0.3f, p_value: %0.3f' % (stat, p_value))
    verdict = (
        "Fail to reject the null hypothesis - we have nothing else to say"
        if p_value > significance else
        "Reject the null hypothesis - suggest the alternative hypothesis is true"
    )
    print(verdict)
def compare_proportion(p_clu, vocab, behr):
    """Compare per-cluster term proportions with pairwise z-tests.

    Parameters
    ----------
    p_clu : dict mapping a subject id to its cluster label. Labels are used
        as list positions via ``int(label)``.
    vocab : dict whose keys seed the set of terms to test.
    behr : dict mapping a subject id to its sequence of terms.

    Returns
    -------
    dict mapping each term to ``{'pval': array, 'count': list}``. ``count``
    holds one ``(occurrences, total terms)`` tuple per cluster. ``pval`` is
    an ``(n_clusters - 1, n_clusters - 1)`` array holding, at
    ``[i][j - 1]``, the p-value for clusters ``i < j`` multiplied by the
    number of comparisons made for that term (Bonferroni correction).
    Pairs where the term occurs in neither cluster are skipped and stay 0.
    """
    terms_by_cluster = {}
    all_terms = set(vocab.keys())
    for subject, cluster in p_clu.items():
        subject_terms = behr[subject]
        terms_by_cluster.setdefault(cluster, list()).extend(subject_terms)
        all_terms.update(subject_terms)

    n_clusters = len(terms_by_cluster)
    term_prop = {term: [None] * n_clusters for term in all_terms}
    for cluster, cluster_terms in terms_by_cluster.items():
        total = len(cluster_terms)
        for term in all_terms:
            term_prop[term][int(cluster)] = (cluster_terms.count(term), total)

    # z-test of proportions with Bonferroni-corrected p-values
    shape = (n_clusters - 1, n_clusters - 1)
    result_ztest = {term: {'pval': np.zeros(shape), 'count': term_prop[term]}
                    for term in term_prop}
    for term, per_cluster in term_prop.items():
        n_comparisons = 0
        n_entries = len(per_cluster)
        for first in range(n_entries):
            for second in range(first + 1, n_entries):
                hits_first, size_first = per_cluster[first]
                hits_second, size_second = per_cluster[second]
                if hits_first == 0 and hits_second == 0:
                    # Term absent from both clusters: nothing to compare.
                    continue
                count = np.array([hits_first, hits_second])
                nobs = np.array([size_first, size_second])
                stat, pval = proportions_ztest(count, nobs)
                result_ztest[term]['pval'][first][second - 1] = pval
                n_comparisons += 1
        result_ztest[term]['pval'] = result_ztest[term]['pval'] * n_comparisons
    return result_ztest
def test_ztest_ztost():
    """Compare weightstats z-tests with the separately tested proportion
    ztest and ztost."""
    import statsmodels.stats.proportion as smprop

    values = [0, 1]
    weights = [5, 15]
    expected = smprop.proportions_ztest(15, 20., value=0.5)
    weighted = DescrStatsW(values, weights)
    actual = weighted.ztest_mean(0.5)
    assert_allclose(actual, expected, rtol=0.03, atol=0.003)

    rescaled = DescrStatsW(values, np.array(weights)*21./20)
    actual = rescaled.ztest_mean(0.5)
    assert_almost_equal(actual, expected, decimal=12)

    actual = rescaled.ztost_mean(0.4, 0.6)
    expected = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(actual[0], expected[0], decimal=12)

    # Second sample: an even 10/10 split.
    balanced = DescrStatsW([0, 1], [10, 10])
    two_sample = ztest(weighted.asrepeats(), balanced.asrepeats())
    chi2 = smprop.proportions_chisquare(np.asarray([15, 10]),
                                        np.asarray([20., 20]))
    # TODO: check this is this difference expected?, see test_proportion
    assert_allclose(two_sample[1], chi2[1], rtol=0.03)

    via_compare = CompareMeans(weighted, balanced).ztest_ind()
    assert_allclose(via_compare[1], chi2[1], rtol=0.03)
    assert_almost_equal(via_compare, two_sample, decimal=12)
def pval_cal_withoutrep(df):
    """Score case-vs-control differences for unreplicated data.

    Renames the ``*_rep1`` columns to ``mc_case``/``mc_cont``/``h_case``/
    ``h_cont``, computes the two ratios ``mc / h`` and their difference,
    and a two-sided proportions z-test p-value per row. NaNs are then
    replaced by 0 and the absolute difference is weighted by
    ``1 - p`` (column ``val``) and by ``exp(1 - p)`` (``exp_val``, min-max
    scaled). A smoothed, min-max scaled variant of the latter is appended
    as ``smooth_val``.

    Returns a new DataFrame: the augmented input with the ``exp_val`` and
    ``smooth_val`` columns concatenated on.
    """
    renamed = {'mc2_rep1': 'mc_case', 'mc1_rep1': 'mc_cont',
               'h2_rep1': 'h_case', 'h1_rep1': 'h_cont'}
    df = df.rename(columns=renamed)

    case_level = df['mc_case'] / df['h_case']
    control_level = df['mc_cont'] / df['h_cont']
    df['meth_diff'] = case_level - control_level
    df['meth_case'] = case_level
    df['meth_cont'] = control_level

    def _row_pvalue(r):
        # Control first, case second, matching the original argument order.
        return sm.proportions_ztest(np.array([r.mc_cont, r.mc_case]),
                                    np.array([r.h_cont, r.h_case]),
                                    value=0, alternative='two-sided')[1]

    df['ztestpval'] = df.apply(_row_pvalue, axis=1)
    df = df.fillna(0)

    abs_diff = df['meth_diff'].abs()
    one_minus_p = 1 - df['ztestpval']
    df['val'] = abs_diff * one_minus_p

    exp_val = abs_diff * one_minus_p.apply(np.exp)
    lowest, highest = exp_val.min(), exp_val.max()
    scaled_exp_val = (exp_val - lowest) / (highest - lowest)

    # `smoothing` is defined elsewhere; it receives the values unpacked.
    smooth_exp_val = smoothing(*exp_val)
    smooth_low, smooth_high = min(smooth_exp_val), max(smooth_exp_val)
    scaled_smooth_exp_val = (smooth_exp_val - smooth_low) / (smooth_high - smooth_low)

    extra_columns = [pd.DataFrame({"exp_val": scaled_exp_val}),
                     pd.DataFrame({"smooth_val": scaled_smooth_exp_val})]
    return pd.concat([df] + extra_columns, axis=1)
def get_node_info(self):
    '''Pivot the click data per option and summarise it.

    Returns a tuple ``(choice, incremental_clicks, p_value, pivot)``:
    the ``argmax`` of the per-option mean, the rounded click gain of the
    best option over the overall mean, the two-sided proportions z-test
    p-value across options, and the per-option statistics as a dict.
    '''
    clicks_per_option = self.data.groupby(
        [self.option_column_name])[self.click_column_name]
    pivot = clicks_per_option.agg({
        'opens': 'count',
        'clicks': 'sum',
        'mean': 'mean'
    })

    def _beta_draws(row):
        # 1000 random draws per option from Beta(1 + clicks, 1 + opens).
        return [
            np.random.beta(1 + row['clicks'], 1 + row['opens'])
            for _ in range(1000)
        ]

    sims = pivot.apply(_beta_draws, axis=1)
    # One column per option, one row per draw; count how often each wins.
    draws = pd.DataFrame(list(zip(*(sims.tolist()))), columns=sims.index)
    wins = draws.idxmax(axis=1, skipna=True).value_counts()
    pivot['prob_of_choice'] = wins / 1000
    pivot = pivot.fillna(0)

    overall_mean = self.data[self.click_column_name].mean()
    best_mean = pivot['mean'].max()
    incremental_clicks = np.round(
        (best_mean - overall_mean) * pivot['opens'].sum(), 0)

    z, p_value = proportions_ztest(count=pivot['clicks'],
                                   nobs=pivot['opens'],
                                   value=0,
                                   alternative='two-sided')
    choice = pivot['mean'].argmax()
    pivot = pivot.to_dict(orient='index')
    return (choice, incremental_clicks, p_value, pivot)
def hypothesis_testing_5():
    """Test whether one bus exceeds 30 km/h in more than 40% of PCW alerts.

    Reads the 'AlertData' and 'FuelInfo' sheets from the module-level
    ``xls``, writes the second ``deviceId`` group to ``<deviceId>.csv`` and
    then loads a fixed CSV file for the analysis. Prints the p-value and a
    verdict at the 0.05 level.

    NULL HYP : p <= 0.4    ALT HYP : p > 0.4
    """
    df1 = pd.read_excel(xls, 'AlertData')
    df2 = pd.read_excel(xls, 'FuelInfo')

    # Prerequisite: dump the second device group to its own CSV file.
    groups = list(df1.groupby(['deviceId'], as_index=False))
    device_id, device_rows = groups[1][0], groups[1][1]
    device_rows.to_csv(device_id + '.csv')

    df = pd.read_csv('12DF03C6:19523068255842304686.csv')
    df = df.loc[df['alarmType'] == 'PCW']
    no_of_trials = len(df)
    no_of_success = len(df.loc[df['speed'] > 30])

    # Proportion of times the bus (id = 12DF03C6:19523068255842304686)
    # has crossed 30kmph is less than or equal to 0.4.
    stat, pval = proportions_ztest(no_of_success,
                                   no_of_trials,
                                   0.4,
                                   alternative='larger')
    print(pval)
    if pval < 0.05:
        print(
            'Null hypothesis is rejected and hence the bus is driven by careless drivers'
        )
    else:
        print('Bus drivers are careful')
def simulate(self):
    """Simulate one experiment that tests for an `effect`.

    The sample is split into treatment and control according to
    ``test_split``; Bernoulli outcomes are drawn with rate
    ``natural_rate + absolute_effect`` for treatment and ``natural_rate``
    for control, and the two success counts are compared with a z-test.

    Returns
    -------
    z_stat : float
        The z statistic from z-test in StatsModels
    p_value : float [0, 1]
        The p-value from z-test in StatsModels
    effect_point_estimate : float
        Observed treatment rate minus control rate, rounded to 4 decimals
    """
    n_treat = int(np.ceil(self.sample_size * self.test_split))
    n_control = int(self.sample_size - n_treat)

    # Treatment is drawn before control, preserving the random stream order.
    treated_successes = np.random.binomial(
        1, (self.natural_rate + self.absolute_effect), n_treat).sum()
    control_successes = np.random.binomial(
        1, (self.natural_rate), n_control).sum()
    exp_observations = [treated_successes, control_successes]

    treated_rate = treated_successes / float(n_treat)
    control_rate = control_successes / float(n_control)
    effect_point_estimate = round(treated_rate - control_rate, 4)

    z_stat, p_value = proportions_ztest(exp_observations,
                                        [n_treat, n_control])
    return z_stat, p_value, effect_point_estimate
def test_ztest_ztost():
    """Cross-check DescrStatsW / CompareMeans z-tests against the
    separately tested proportion ztest and ztost."""
    import statsmodels.stats.proportion as smprop

    x1, w1 = [0, 1], [5, 15]
    prop_ztest = smprop.proportions_ztest(15, 20., value=0.5)
    stats_15_of_20 = DescrStatsW(x1, w1)
    assert_allclose(stats_15_of_20.ztest_mean(0.5), prop_ztest,
                    rtol=0.03, atol=0.003)

    stats_scaled = DescrStatsW(x1, np.array(w1) * 21. / 20)
    assert_almost_equal(stats_scaled.ztest_mean(0.5), prop_ztest, decimal=12)

    tost_weighted = stats_scaled.ztost_mean(0.4, 0.6)
    tost_prop = smprop.proportions_ztost(15, 20., 0.4, 0.6)
    assert_almost_equal(tost_weighted[0], tost_prop[0], decimal=12)

    x2, w2 = [0, 1], [10, 10]
    stats_10_of_20 = DescrStatsW(x2, w2)
    ztest_result = ztest(stats_15_of_20.asrepeats(),
                         stats_10_of_20.asrepeats())
    chisquare_result = smprop.proportions_chisquare(np.asarray([15, 10]),
                                                    np.asarray([20., 20]))
    #TODO: check this is this difference expected?, see test_proportion
    assert_allclose(ztest_result[1], chisquare_result[1], rtol=0.03)

    compare_result = CompareMeans(stats_15_of_20, stats_10_of_20).ztest_ind()
    assert_allclose(compare_result[1], chisquare_result[1], rtol=0.03)
    assert_almost_equal(compare_result, ztest_result, decimal=12)
def z_proportions(successes_1, trials_1, successes_2, trials_2,
                  h1='two-sided'):
    """
    Two-sample test for proportions based on the normal (z) test.

    Parameters
    ----------
    successes_1, successes_2 :
        Number of successes of two independent samples.
    trials_1, trials_2 :
        Number of trials or observations of two independent samples.
    h1 : str in ['two-sided', 'smaller', 'larger']
        Alternative hypothesis. ``smaller`` means ``p1 < p2`` and ``larger``
        means ``p1 > p2``, where ``p1`` is the proportion of the first
        sample and ``p2`` of the second one.

    Returns
    -------
    p_value :
        The p-value for the test; the z statistic is discarded.
    """
    successes = np.array([successes_1, successes_2])
    trials = np.array([trials_1, trials_2])
    _, p_value = proportions_ztest(count=successes, nobs=trials,
                                   alternative=h1)
    return p_value
def z_test(df: pd.DataFrame) -> pd.DataFrame:
    """Mark, per cell, the columns it is significantly higher than.

    For every row, each pair of columns is compared with a two-proportion
    z-test using the cell values as counts and the column sums as numbers
    of observations. When a cell is larger than another cell of the same
    row with p < 0.05, the other column's letter is appended to the cell
    of the result.

    Parameters
    ----------
    df : table of counts with string column names. Columns are lettered by
        position using ``"abcdefghjklmn"`` (13 letters; note the literal
        has no "i"), so more than 13 columns raise ``IndexError``.

    Returns
    -------
    DataFrame shaped like ``df`` whose cells hold the concatenated letters
    (empty string when there are none) and whose column names carry their
    own letter as a suffix.
    """
    df_sig = pd.DataFrame().reindex_like(df)  # empty frame shaped like df
    labels = "abcdefghjklmn"  # per-column letter labels

    # Column sums are loop-invariant; compute them once.
    col_totals = {col: df[col].sum() for col in df.columns}

    for rowIndex, row in df.iterrows():
        for col1, value1 in row.items():
            for col2, value2 in row.items():
                stat, p = proportions_ztest(
                    np.array([value1, value2]),
                    np.array([col_totals[col1], col_totals[col2]]),
                )
                if p < 0.05 and value1 > value2:
                    label = labels[df.columns.get_loc(col2)]
                    current = df_sig.loc[rowIndex, col1]
                    # The first mark replaces the NaN placeholder; later
                    # marks are appended. This explicit check replaces a
                    # bare except that hid every other error.
                    df_sig.loc[rowIndex, col1] = (
                        label if pd.isna(current) else current + label)

    # Suffix all column names at once. Renaming one column at a time could
    # rename an earlier column again when its new name equals a later
    # column's original name (e.g. "x" -> "xa" followed by "xa" -> "xab").
    df_sig.columns = [col + labels[j] for j, col in enumerate(df_sig.columns)]
    return df_sig.fillna("")
def run_proportion_Z_test(feature):
    """Two-proportion z-test of ``feature`` between the two churn groups.

    Rows of the module-level ``df`` are split on ``churned == 'False.'``
    and ``churned == 'True.'``; the sum of ``feature`` in each group is the
    success count and the group size the number of observations.

    Returns
    -------
    (z_score, p_value) as produced by ``proportions_ztest``. The result was
    previously computed and then discarded, so the function returned None.
    """
    dist1 = df.loc[df.churned == 'False.', feature]
    dist2 = df.loc[df.churned == 'True.', feature]
    n1 = len(dist1)
    p1 = dist1.sum()
    n2 = len(dist2)
    p2 = dist2.sum()
    z_score, p_value = proportions_ztest([p1, p2], [n1, n2])
    return z_score, p_value
def proportiontestResult():
    """Run a one-sample proportion z-test on the submitted form values.

    Reads ``count``, ``nob`` and ``value`` from the request form, converts
    them to float and renders the result template with the z statistic
    (``z``) and p-value (``p``).
    """
    form = request.form
    raw_count = form.get('count')
    raw_nob = form.get('nob')
    raw_value = form.get('value')

    stat, pval = proportions_ztest(float(raw_count), float(raw_nob),
                                   float(raw_value))
    template = 'one-sample-tests/proportion-test/proportion-test-result.html'
    return render_template(template, z=stat, p=pval)
def test_scalar(self):
    """Scalar count/nobs: check reference values and that value=None is
    rejected with ValueError."""
    count, nobs = 5, 83
    z_stat, p_value = smprop.proportions_ztest(count, nobs, value=0.05)
    assert_almost_equal(z_stat, 0.392126026314)
    assert_almost_equal(p_value, 0.694965098115)

    assert_raises(ValueError, smprop.proportions_ztest, count, nobs,
                  value=None)
def two_sided_z_test(count, nobs):
    """
    Run ``proportions_ztest`` with a two-sided alternative.

    - Null hypothesis (H0): the proportions of the groups are equal.
    - Alternative (H1): the proportions differ (two sided). A one-sided
      test would instead use p0 > p1 or p0 < p1 as the alternative.
    - Use case: Ran a fair AB test, control group got 486 clicks out of
      5000 impressions vs experiment group got 527 clicks out of 5000
      impressions. Could we say the experiment group won the test at a
      95% confidence level?

    ``count`` and ``nobs`` are passed through unchanged; the return value
    is whatever ``proportions_ztest`` returns.
    """
    result = proportions_ztest(count, nobs, alternative="two-sided")
    return result
def ztest(self):
    """Two-sample proportions z-test between ``self.d1`` and ``self.d2``.

    Successes are the sums of each sample's ``data``, observations are each
    sample's ``nobs``; the alternative is ``self.test_direction`` and the
    variance passed on is ``self.pooled_variance``.
    """
    prop_var = self.pooled_variance
    observations = []
    successes = []
    for sample in (self.d1, self.d2):
        observations.append(sample.nobs)
        successes.append(sum(sample.data))
    return proportions_ztest(successes,
                             observations,
                             alternative=self.test_direction,
                             prop_var=prop_var)
def get_ztest(data1, data2, g_var, hyp_type):
    '''Print the p-value and z-statistic of a two-proportion z-test.

    The success count of each frame is the sum of ``g_var`` over rows where
    ``g_var == 1``; the number of observations is the frame length.
    ``hyp_type`` is passed on as the ``alternative``. Nothing is returned.
    '''
    count = [frame[frame[g_var] == 1][g_var].sum()
             for frame in (data1, data2)]
    nobs = [len(data1), len(data2)]
    stat, pval = proportions_ztest(count,
                                   nobs,
                                   value=0,
                                   alternative=hyp_type)
    print('p-value: {0:0.3f}'.format(pval))
    print('z-statistic: {}'.format(round(stat, 3)))
def test1(self):
    """One-sample two-sided z-test: 5 successes in 83 trials against 0.05."""
    # Assume
    success, sample = 5, 83
    hypoth, alt = 0.05, 'two-sided'

    # Action
    z_stat, p_value = prop.proportions_ztest(success, sample, hypoth, alt)

    # Assert
    self.assertAlmostEqual(z_stat, 0.392, places=3)
    self.assertAlmostEqual(p_value, 0.695, places=3)
def proportionsztestResult():
    """Run a two-sample proportions z-test on the submitted form values.

    Reads ``count1``, ``count2``, ``nob1`` and ``nob2`` from the request
    form, converts them to float and renders the result template with the
    z statistic (``z``) and p-value (``p``).
    """
    form = request.form
    raw_counts = [form.get('count1'), form.get('count2')]
    raw_nobs = [form.get('nob1'), form.get('nob2')]

    count = np.array([float(raw) for raw in raw_counts])
    nobs = np.array([float(raw) for raw in raw_nobs])
    stat, pval = proportions_ztest(count, nobs)

    template = 'two-sample-tests/proportions-z-test/proportions-z-test-result.html'
    return render_template(template, z=stat, p=pval)
def run_proportion_z_test(feature,
                          col='loan_status',
                          value1='Fully Paid',
                          value2='Default'):
    '''Return the z-statistic and p-value of a two-proportion z-test.

    Rows of the module-level ``df`` are split on ``col == value1`` and
    ``col == value2``; ``feature`` is the column the test is performed on
    (its sum per group is the success count, the group size the number of
    observations). The result is returned as a formatted string.
    '''
    successes = []
    sizes = []
    for group_value in (value1, value2):
        group = df.loc[df[col] == group_value, feature]
        sizes.append(len(group))
        successes.append(group.sum())

    z_score, p_value = proportions_ztest(successes, sizes)
    return ('z-score = {}; p-value = {}'.format(z_score, p_value))
def ztest(data: pd.DataFrame,
          factors: np.ndarray,
          levels: np.ndarray,
          y: str,
          name: str,
          alpha=0.05) -> dict:
    '''
    For each factor, test whether the proportion of rows with ``y == 1``
    differs between the factor's levels: a two-proportion z-test when the
    factor yields two groups, a chi-square test of proportions otherwise.

    Every level after the first is read from the dummy column
    ``factor + '_' + level``; the first level is the baseline group, i.e.
    the rows where all of those dummy columns are 0.

    Parameters
    ----------
    data (pandas.DataFrame): df containing data of the experiment.
    factors (numpy.ndarray): the list of the independent variables.
    levels (numpy.ndarray): the matrix of factor x level. Each row
        represents a factor and each element in a row represents a level.
    y (str): the name of the dependent variable.
    name (str): the version of the experiment.
    alpha (float): ignored.

    Return
    -------
    result_dict (dict): keys are factors and values are the rounded test
        statistic followed by the p-value formatted by ``p_to_string``.
        "Experiment" key is used as an index.
    '''
    result_dict = {'Experiment': name.replace('_', ' ').title()}
    positives = data[data[y] == 1]

    for factor, factor_levels in zip(factors, levels):
        count, nobs = [], []
        baseline_all = data
        baseline_positive = positives
        for level in factor_levels[1:]:
            dummy = factor + '_' + level
            # Narrow the baseline to rows outside this level ...
            baseline_all = baseline_all[baseline_all[dummy] == 0]
            baseline_positive = baseline_positive[baseline_positive[dummy] == 0]
            # ... and record this level's own positives and size.
            count.append(len(positives[positives[dummy] == 1]))
            nobs.append(len(data[data[dummy] == 1]))
        count.append(len(baseline_positive))
        nobs.append(len(baseline_all))

        test = proportions_chisquare if len(count) > 2 else proportions_ztest
        result = test(np.array(count), np.array(nobs))
        p = p_to_string(result[1])
        result_dict[factor] = str(round(result[0], 2)) + ' ' + p
    return result_dict
def find_best_split_for_binary_variable(self, variable):
    '''Search for the best value split based on p-value and incremental
    clicks; optimized for binary type variables.

    Returns ``(split_value, total_incremental_clicks)``; ``(None, 0)``
    when no row passes the volume or the p-value filter. ``split_value``
    is always 1 otherwise.
    '''
    grouped_clicks = self.data.groupby(
        [self.option_column_name, variable])[self.click_column_name]
    pivot = grouped_clicks.agg({
        'opens': 'count',
        'clicks': 'sum',
        'mean': 'mean'
    }).unstack(self.option_column_name)
    pivot = pivot.fillna(0)
    pivot.columns = ['_'.join(col).strip() for col in pivot.columns.values]
    pivot.reset_index(inplace=True)

    # Per-option column groups; the columns added below match none of them.
    open_cols = [s for s in pivot.columns if "opens_" in s]
    click_cols = [s for s in pivot.columns if "clicks_" in s]
    mean_cols = [s for s in pivot.columns if "mean_" in s]

    # NOTE(review): this reduces to a single boolean applied to every row,
    # so rows are kept all-or-nothing -- confirm that is intended.
    pivot['can_use'] = np.all(np.all(pivot[open_cols] >= 1000, axis=1))
    pivot = pivot[pivot['can_use'] == True]
    if pivot.empty:
        return None, 0

    def _row_p_value(row):
        return proportions_ztest(count=np.array(row[click_cols]),
                                 nobs=np.array(row[open_cols]),
                                 value=0,
                                 alternative='two-sided')[1]

    pivot['p_value'] = pivot.apply(_row_p_value, axis=1)
    pivot = pivot[pivot['p_value'] <= self.max_p_value]
    if pivot.empty:
        return None, 0

    pivot['total_opens'] = pivot[open_cols].sum(axis=1)
    pivot['total_clicks'] = pivot[click_cols].sum(axis=1)
    pivot['total_mean'] = pivot['total_clicks'] / pivot['total_opens']
    best_mean = pivot[mean_cols].max(axis=1)
    pivot['total_incremental_clicks'] = (
        best_mean - pivot['total_mean']) * pivot['total_opens']

    total_incremental_clicks = np.round(
        pivot['total_incremental_clicks'].sum(), 0)
    split_value = 1
    return split_value, total_incremental_clicks
def applying_prop_test(dataframe, reference_var, outcome_var='y',
                       alternative='two-sided'):
    """
    Apply the statistical test for a difference between proportions (means
    of a binary variable) in two samples, using "proportions_ztest" from
    the "statsmodels" library.

    The variable "reference_var" splits the dataset into two samples and
    "outcome_var" is the binary variable of interest. With p0 the
    proportion in the reference_var = 0 sample (passed first) and p1 the
    one in the reference_var = 1 sample, the hypotheses are:
        H0: P(outcome_var = 1|reference_var = 0) = P(outcome_var = 1|reference_var = 1)
        H1: P(outcome_var = 1|reference_var = 0) != P(outcome_var = 1|reference_var = 1)
    (with "<" or ">" in H1 for the one-sided alternatives).

    :param dataframe: dataframe with samples for implementing the test.
    :type dataframe: dataframe.

    :param reference_var: binary variable that will split samples across
        two subsets.
    :type reference_var: string.

    :param outcome_var: binary variable whose difference in proportion
        should be assessed.
    :type outcome_var: string.

    :param alternative: "two-sided" (p0 != p1), "smaller" (p0 < p1) or
        "larger" (p0 > p1).
    :type alternative: string.

    :return: test statistic, p-value of the test, hypotheses being tested,
        relevant frequencies.
    :rtype: dictionary.
    """
    if alternative == 'smaller':
        oper = '<'
    elif alternative == 'larger':
        oper = '>'
    else:
        oper = '!='

    reference = dataframe[reference_var]
    outcome_is_one = dataframe[outcome_var] == 1

    # Sample sizes: reference_var is binary, so its sum counts the ones.
    d1 = reference.sum()
    d0 = len(reference) - reference.sum()

    # Outcome counts within each sample.
    d0_y1 = len(dataframe[(reference == 0) & outcome_is_one][reference_var])
    d1_y1 = len(dataframe[(reference == 1) & outcome_is_one][reference_var])

    stat, pval = proportions_ztest(np.array([d0_y1, d1_y1]),
                                   np.array([d0, d1]),
                                   alternative=alternative)

    hypotheses = (
        f'H0: P({outcome_var}=1|{reference_var}=0) = P({outcome_var}=1|{reference_var}=1)\n'
        f'H1: P({outcome_var}=1|{reference_var}=0) {oper} P({outcome_var}=1|{reference_var}=1)'
    )
    frequencies = {
        f'freq({reference_var}=0)': d0,
        f'freq({reference_var}=1)': d1,
        f'freq({reference_var}=0&{outcome_var}=1)': d0_y1,
        f'freq({reference_var}=1&{outcome_var}=1)': d1_y1,
    }
    return {'test_stat': stat, 'p_value': pval,
            'hypotheses': hypotheses, 'frequencies': frequencies}
def tek_orneklem_oran_testi(self):
    """One-sample proportion z-test against a conversion rate of 0.04.

    Prompts for the observed number of successes, takes the number of
    observations from ``self.choice_array`` and returns a message that
    embeds the p-value; the message differs depending on whether the
    p-value is below 0.05.
    """
    count = int(input("Gözlenmiş başarı sayısı(exp=25) kişi yorum yaptı"))
    nobs = stats.describe(self.choice_array).nobs  # number of observations
    value = 0.04  # proportion stated by the null hypothesis
    p_value = float(proportions_ztest(count, nobs, value)[1])

    prefix = "(P_value=" + str(p_value)
    if p_value < 0.05:
        # H0 is rejected.
        conclusion = ") 0.04 oranında web sitemize dönüşüm yoktur(Verilen ilk degerle aralarında farklılık var demektir)"
    else:
        conclusion = ") 0.04 oranında web sitemize dönüş vardır(Verilen ilk degerle aralarında farklılık yok demektir)"
    return prefix + conclusion
def iki_orneklem_oran_testi(self):
    """Two-sample proportion z-test on fixed example counts.

    Compares 500 of 1700 with 600 of 1800 (detail views over impressions)
    and returns a message embedding the p-value, stating whether H0 (no
    significant difference between the two proportions) is rejected at the
    0.05 level.
    """
    detaya_bakilma_sayisi = np.array([500, 600])
    goruntulenme_sayisi = np.array([1700, 1800])
    test_result = proportions_ztest(detaya_bakilma_sayisi,
                                    goruntulenme_sayisi)
    iki_orneklem_oran_testi_result = float(test_result[1])

    prefix = "P_value değeri= " + str(iki_orneklem_oran_testi_result)
    if iki_orneklem_oran_testi_result < 0.05:
        verdict = " olduğundan dolayı HO hipotezi(İki oran arasında anlamlı bir farklılık yoktur) red edilir "
    else:
        verdict = " olduğundan dolayı HO hipotezi(İki oran arasında anlamlı bir farklılık yoktur) kabul edilir"
    return prefix + verdict
def plot_power(min_diff, prob_b, size_a, size_b, significance=0.05):
    """Illustrate power through a two-tailed hypothesis test.

    The z-score for the minimum detectable difference comes from
    ``proportions_ztest`` on the expected counts. Two unit-variance normal
    curves are drawn: the null hypothesis h0 centred at 0 and the
    alternative h1 centred at that z-score. The regions beyond the critical
    values for ``significance`` are shaded in both tails. The power shown
    in the title is the area of h1 above the upper threshold, i.e.
    ``1 - h1.cdf(threshold)``, rounded to two decimals.
    """
    prob_a = prob_b + min_diff
    counts = np.array([size_a * prob_a, size_b * prob_b])
    nobs = np.array([size_a, size_b])
    zscore, _ = proportions_ztest(counts, nobs, alternative='two-sided')

    h0 = stats.norm(loc=0, scale=1)
    h1 = stats.norm(loc=zscore, scale=1)

    x = np.linspace(-5, 6, num=100)
    threshold = h0.ppf(1 - significance / 2)
    upper_tail = (x > threshold)
    lower_tail = (x < -threshold)
    power = np.round(1 - h1.cdf(threshold), 2)

    curves = ((h1, '$H_1$ is true'), (h0, '$H_0$ is true'))
    for hypothesis, label in curves:
        y = hypothesis.pdf(x)
        line = plt.plot(x, y, label=label)
        colour = line[0].get_color()
        for tail in (upper_tail, lower_tail):
            plt.fill_between(x=x[tail], y1=0.0, y2=y[tail],
                             alpha=0.2, color=colour)

    title = 'p1: {}, p2: {}, size1: {}, size2: {}, power: {}'
    plt.title(title.format(prob_a, prob_b, size_a, size_b, power),
              fontdict={'fontsize': 15})
    plt.ylabel('Probability')
    plt.xlabel('')
    plt.legend()
    plt.tight_layout()
    plt.show()
def expansion_stats():
    """Write the expansion statistics as TeX snippets via ``write_tex``.

    Compares the ``expand`` rate of the 'Human' and 'OptimalPlusPure' data
    (constant variance, non-terminal rows) with a two-proportion z-test,
    reports the mean ``jump`` rate of the module-level ``expansion`` frame
    and fits a logistic regression of ``jump`` on ``gain_z``.
    """
    selection = 'variance == "constant" and not is_term'
    human = load_cf('Human').query(selection).expand
    model = load_cf('OptimalPlusPure').query(selection).expand

    write_tex('expansion_human', f'{100*human.mean():.1f}\\%')
    write_tex('expansion_optimal', f'{100*model.mean():.1f}\\%')

    z, p = proportions_ztest([human.sum(), model.sum()],
                             [len(human), len(model)])
    write_tex("expansion_test", rf"$z={z:.1f},\ {pval(p)}$")

    # "\\%" instead of the invalid escape "\%": same output text, without
    # the invalid-escape warning, and consistent with the calls above.
    write_tex("jump", f'{expansion.jump.mean()*100:.1f}\\%')

    m = logit('jump.astype(int) ~ gain_z', data=expansion).fit()
    write_tex('expansion_logistic',
              rf'$\beta = {m.params.gain_z:.3f},\ {pval(m.pvalues.gain_z)}$')
def get_pvals(self, col, target):
    """Score each category level of ``col`` against the overall target rate.

    Levels whose count is below ``self.thresh`` are relabelled 'Other' in
    ``col`` itself (the caller's Series is modified) and recorded in
    ``self.to_bin[col.name]``. Each remaining level gets a one-sample
    proportions z-test of its target rate against the overall mean.

    Returns a ``defaultdict(int)`` mapping level to the test's first
    element, i.e. the z statistic.
    NOTE(review): the name says p-values but index [0] is the statistic --
    confirm which one callers expect.
    """
    baseline = target.mean()
    level_counts = col.value_counts()

    # Bin category levels that fall below the threshold.
    rare_levels = level_counts.index[level_counts < self.thresh]
    col[col.isin(rare_levels)] = 'Other'

    # Record the bins.
    self.to_bin[col.name] = rare_levels

    # One test per level; the target column must be named 'target'.
    frame = pd.concat([col, target], axis=1)
    per_level = frame.groupby(col.name)['target'].aggregate({
        'count': lambda x: x.sum(),
        'nobs': lambda x: x.count()
    })

    def _level_statistic(row):
        return proportions_ztest(row['count'], row['nobs'], baseline)[0]

    zscores = per_level.apply(_level_statistic, axis=1)
    return defaultdict(int, zscores)
# Count accepted rows per segment: each filter keeps the rows of `data` whose
# Segment matches and whose Accept flag equals 1.
ConferenceAccepted = len(data[((data.Segment == "Conference") & (data.Accept == 1))])
OtherAccepted = len(data[((data.Segment == "Other") & (data.Accept == 1))])
VacationAccepted = len(data[((data.Segment == "Vacation") & (data.Accept == 1))])
TotalAccepted = len(data[(data.Accept == 1)])
import statsmodels.stats.proportion as sm
# Overall acceptance rate; used below as the null-hypothesis proportion in
# every per-segment z-test.
# NOTE(review): Total, BusinessLong, BusinessShort, Conference and the
# Business*Accepted counts are defined before this chunk -- presumably the
# per-segment row counts; confirm.
TotalAcceptedPercent = TotalAccepted / Total
# For each segment: print its acceptance rate, the bounds returned by
# proportion_confint (default settings), and the p-value of a one-sample
# z-test against the overall rate.
print("BusinessLong")
print("Average: " + str(BusinessLongAccepted / BusinessLong))
BusinessLongCI = sm.proportion_confint(BusinessLongAccepted, BusinessLong)
print("Lower: " + str(BusinessLongCI[0]))
print("Upper: " + str(BusinessLongCI[1]))
print(sm.proportions_ztest(BusinessLongAccepted, BusinessLong, TotalAcceptedPercent)[1])
print("BusinessShort")
print("Average: " + str(BusinessShortAccepted / BusinessShort))
BusinessShortCI = sm.proportion_confint(BusinessShortAccepted, BusinessShort)
print("Lower: " + str(BusinessShortCI[0]))
print("Upper: " + str(BusinessShortCI[1]))
print(sm.proportions_ztest(BusinessShortAccepted, BusinessShort, TotalAcceptedPercent)[1])
print("Conference")
print("Average: " + str(ConferenceAccepted / Conference))
ConferenceCI = sm.proportion_confint(ConferenceAccepted, Conference)
print("Lower: " + str(ConferenceCI[0]))
print("Upper: " + str(ConferenceCI[1]))
print(sm.proportions_ztest(ConferenceAccepted, Conference, TotalAcceptedPercent)[1])
def z_test(driver_1, n1, driver_2, n2):
    """Print the z statistic and p-value of a one-sided two-proportion test.

    ``driver_1`` of ``n1`` is compared with ``driver_2`` of ``n2`` using
    ``alternative='larger'`` (first proportion larger than the second).
    Nothing is returned.
    """
    successes = np.array([driver_1, driver_2])
    observations = np.array([n1, n2])
    z, p = proportions_ztest(successes, observations, value=0,
                             alternative='larger')
    print('z-stat = {z} \n p-value = {p}'.format(z=z, p=p))
def test_default_values(self):
    """Two-sample call with value=None must reproduce the reference
    statistic and p-value."""
    successes = np.array([5, 12])
    trials = np.array([83, 99])

    z_stat, p_value = smprop.proportions_ztest(successes, trials, value=None)

    assert_almost_equal(z_stat, -1.4078304151258787)
    assert_almost_equal(p_value, 0.15918129181156992)