def chi_squared_test(self, p_columnDEP, p_columnINDEP, p_maxNumAttributes = 6):
    """Run a chi-square test on two columns and assemble a printable report.

    Parameters
    ----------
    p_columnDEP : pandas.Series
        Dependent variable; its ``name`` is used as a column label.
    p_columnINDEP : pandas.Series
        Independent variable; its ``name`` is used as a column label.
    p_maxNumAttributes : int, default 6
        When ``len(tableABS.columns) - 1`` exceeds this value the frequency
        tables are replaced by a "not plotted" banner.

    Returns
    -------
    list
        Banner strings interleaved with the crosstab tables, the test
        results and the seaborn bar plot, in display order.
    """
    dep_name = p_columnDEP.name
    indep_name = p_columnINDEP.name

    tableREL, results, exp = rp.crosstab(p_columnDEP, p_columnINDEP, prop='col',
                                         test='chi-square', expected_freqs=True)
    tableABS = rp.crosstab(p_columnDEP, p_columnINDEP)

    # Reshape the column-normalised crosstab into long format for plotting.
    ct = pd.crosstab(p_columnDEP, p_columnINDEP, normalize='columns').reset_index()
    ct = ct.transpose()
    stacked = ct.stack().reset_index().rename(columns={0: 'value'})
    # Bracket indexing replaces the former eval()-built attribute access, which
    # failed for column names that are not valid identifiers or that collide
    # with DataFrame attributes.
    stacked = stacked[stacked[indep_name] != dep_name]
    stacked = stacked.rename(columns={"level_1": dep_name})
    _myPlot = sns.barplot(x=stacked[indep_name], y=stacked['value'], hue=stacked[dep_name])

    # NOTE(review): the "- 1" presumably discounts a totals column added by
    # rp.crosstab - confirm against the researchpy output.
    num_attributes = len(tableABS.columns) - 1

    a = str(num_attributes) + ' attributes'
    a0 = '--------------------------------------------------------------------------------------'
    a1 = '----------------------------------------------------------------- OBSERVED FREQUENCIES'
    a2 = '------------------------------------------------------ OBSERVED FREQUENCIES (relative)'
    a3 = '----------------------------------------------------------------- EXPECTED FREQUENCIES'
    a4 = '------------------------------------------------------------------------- TEST RESULTS'
    a5 = '--------------------------------------------------------------------------------- PLOT'

    if num_attributes > p_maxNumAttributes:
        b1 = '--- not plotted --- number of attributes with ' + str(num_attributes) + ' is too big ---------------------------'
        return [a, a0, a1, a0, b1, a2, b1, a3, b1, a4, results, a5, _myPlot]

    return [a, a0, a1, a0, tableABS, a0, a2, a0, tableREL, a0, a3, a0, round(exp,0), a0, a4, a0, results, a0, a5, a0, _myPlot]
def RQ1(vul_dict):
    """Print the contingency table of client-side vs. fix-side release types.

    Parameters
    ----------
    vul_dict : dict
        Maps a vulnerability id to a record with ``['package']['release_type']``
        (one of ``'major'``, ``'minor'``, ``'patch'``) and a ``'client'`` list
        whose items carry an ``'adoption'`` value (``None`` entries are skipped).

    Prints the crosstab, the chi-square results and the per-release-type
    counter; returns nothing.
    """
    frame = {'fix_landing': [], 'client_landing': []}
    counter = {'major': 0, 'minor': 0, 'patch': 0}

    for record in vul_dict.values():
        fix_landing = record['package']['release_type']
        counter[fix_landing] += 1
        for client in record['client']:
            adoption = client['adoption']
            # Identity check: None is a singleton, "!= None" is the wrong idiom.
            if adoption is None:
                continue
            frame['fix_landing'].append(fix_landing + '_landing')
            frame['client_landing'].append(
                'removal' if adoption == 'removal' else adoption + '_client_landing')

    df = pd.DataFrame(frame)
    crosstab, res = researchpy.crosstab(df['client_landing'], df['fix_landing'], test='chi-square')

    print('=' * 50)
    print(
        '(RQ1) A contingency table shows the frequency distribution of client-side fixing release update for each fixing release update.'
    )
    print('=' * 50)
    print(crosstab)
    print(res)
    print(counter)
    print('=' * 50)
    print()
def do_different_samples_match(test):
    """Compare 600-row windows of every representation pairwise and print the outcome.

    For each representation created by ``ThesisCore`` (``kmer=3``) the copy of
    its ``repr_df`` is sliced at offsets 0, 600, ..., 5400 and every pair of
    distinct windows (first index lower than second) is compared.

    Parameters
    ----------
    test : str
        ``'permutation'``, ``'KS'`` or ``'crosstab'``. Any other value runs
        the loops without performing a comparison.
    """
    core = ThesisCore(in_package=True)
    core.create_reprs(kmer=3)

    window = 600
    starts = range(0, 6000, window)

    # "representation" instead of "repr": do not shadow the builtin.
    for _name, representation in core._repr_core.reprs.items():
        temp_df = representation.repr_df.copy()
        for indi, i in enumerate(starts):
            for indj, j in enumerate(starts):
                if indj <= indi:
                    continue  # each unordered pair only once, never a window with itself
                temp_df1, temp_df2 = temp_df[i:i + window], temp_df[j:j + window]
                if test == 'permutation':
                    p_value = get_permutation_of_pdists(temp_df1, temp_df2)
                    print(p_value)
                elif test == 'KS':
                    p_value = get_KS_of_pdists(temp_df1, temp_df2)
                    if p_value > 0.05:
                        print('we Cannot reject the H0 that the 2 dists', indi, indj, 'are identical with pvalue', p_value)
                    else:
                        print('we Reject the H0 that the 2 dists ', indi, indj, 'are identical with pvalue', p_value)
                elif test == 'crosstab':
                    # With test= given, researchpy.crosstab returns a tuple
                    # (table, results); the former res['p-value'] therefore
                    # raised TypeError.
                    res = researchpy.crosstab(temp_df1, temp_df2, test="chi-square")
                    print(res)
                    # Second row, second column of the results frame holds the p-value.
                    # NOTE(review): crosstab is fed whole window slices here -
                    # confirm these are 1-D series as researchpy expects.
                    print(res[1].iloc[1, 1])
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare the proposer+DSS metric of *treatment* with the "t12a" reference.

    Parameters
    ----------
    treatment : str
        Treatment id passed to ``get_prop_resp``.
    con, dfs, use_percentage, use_labels
        Not used by this function.
    metric : callable
        Called as ``metric(min_offer_series, offer_series)``.
    as_percentage : bool
        Format the mean values as percentages.
    is_categorical : bool
        Chi-square test when true, independent t-test otherwise.
    alternative : {None, 'two-sided', 'greater', 'less'}
        Selects which t-test p-value is reported.

    Returns
    -------
    dict
        Formatted mean values plus one entry, keyed by the test label, with
        the statistic, p-value and effect size.
    """
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t12a")

    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)

    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    prop_values_ref = metric(df_resp_ref["min_offer"], df_prop_ref["offer_dss"])
    prop_value_ref = metrics.get_mean(prop_values_ref)

    dof = 0
    diff = None

    # The summary sentence previously referenced resp_dss_values and
    # auto_dss_values, neither of which is defined in this function; it now
    # describes the two samples that are actually tested.
    # NOTE(review): label2 still reads "t20.dss" although the reference data
    # come from "t12a" - confirm the intended label.
    if is_categorical:
        table, res = rp.crosstab(pd.Series(prop_values_ref), pd.Series(prop_dss_values), test='chi-square')
        s, p, r = res.results.values
        test_label = f"chi2"
        print("Conclusion: ", generate_cat_stat_sentence(np.mean(prop_dss_values), np.std(prop_dss_values), np.mean(prop_values_ref), np.std(prop_values_ref), s, p, dof, diff=diff, label1=treatment+".dss", label2="t20.dss"))
    else:
        table, res = rp.ttest(pd.Series(prop_values_ref), pd.Series(prop_dss_values), paired=False)
        s = res.results[2]
        # Two-sided by default; a one-sided request swaps in the matching row.
        p = res.results[3]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        # Assigned for every alternative so the result formatting never hits
        # an unbound name.
        r = res.results[9]
        diff = res.results[0]
        dof = res.results[1]
        print("Conclusion: ", generate_stat_sentence(np.mean(prop_dss_values), np.std(prop_dss_values), np.mean(prop_values_ref), np.std(prop_values_ref), s, p, dof, diff=diff, label1=treatment+".dss", label2="t20.dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"

    print("RESUME: ", res)

    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            "T10": f'{100 * prop_value_ref:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T10": f'{prop_value_ref:.2f}',
        }

    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def cramersV_matrix(catFeat, X):
    '''
    Build the pairwise association matrix of the categorical features.

    For every ordered pair of columns named in catFeat a chi-square crosstab
    is computed on X and the value at row 2, column 1 of its results frame
    (Cramer's V, per the function name) is collected. The flat list is
    reshaped to len(catFeat) rows.
    '''
    n_features = len(catFeat)
    values = []
    for row_feat in catFeat:
        row_series = X[row_feat]
        for col_feat in catFeat:
            _table, stats_frame = rp.crosstab(row_series, X[col_feat], test='chi-square')
            values.append(stats_frame.iloc[2, 1])
    return np.array(values).reshape((n_features, -1))
def chi_square_test_for_each_category_level(self, p_columnDEP, p_columnINDEP):
    """Chi-square test of the dependent column against each dummy level.

    ``p_columnINDEP`` is one-hot encoded; each resulting dummy column is
    renamed to an identifier-safe name and cross-tabulated with
    ``p_columnDEP``.

    Returns
    -------
    pandas.DataFrame
        One row per level with the original and sanitised level name, the
        chi-square statistic, the p-value and Cramer's V, sorted by
        ``CramerV`` descending with missing values first.
    """
    result_columns = ['AttrCharacteristic', 'AttrCharacteristicRenamed', 'ChiSq', 'p_value', 'CramerV']

    dummies = pd.get_dummies(p_columnINDEP)
    # drop=True discards the old index directly instead of creating an
    # 'index' column and dropping it by name (which fails for named indexes).
    k = pd.DataFrame(p_columnDEP).join(dummies).reset_index(drop=True)

    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on
    # every iteration.
    rows = []
    for i in range(1, len(k.columns)):  # column 0 is the dependent variable
        _from = k.columns[i]
        _to = re.sub('[^A-Za-z0-9_]+', '_', _from)
        k = k.rename(columns={_from: _to})
        _table, results = rp.crosstab(k[p_columnDEP.name], k[k.columns[i]], test='chi-square')
        rows.append({
            'AttrCharacteristic': _from,
            'AttrCharacteristicRenamed': _to,
            'ChiSq': results.iloc[0, 1],
            'p_value': results.iloc[1, 1],
            'CramerV': results.iloc[2, 1],
        })

    temp = pd.DataFrame(rows, columns=result_columns)
    return temp.sort_values(by='CramerV', ascending=False, na_position='first')
def apply_chi_squared_statistic_on_timeline(self, p_df, p_columnDEP, p_columnINDEP, p_dates):
    """Run a chi-square test per starting date and collect the statistics.

    Parameters
    ----------
    p_df : pandas.DataFrame
        Must contain a ``'starting_Date'`` column plus the two tested columns.
    p_columnDEP, p_columnINDEP
        Column labels of the dependent and independent variable.
    p_dates : iterable
        Values of ``'starting_Date'`` to evaluate, in output order.

    Returns
    -------
    pandas.DataFrame
        Columns ``Starting_Date``, ``ChiSq``, ``p_value``, ``CramerV``; one
        row per date. Start time and total duration are printed.
    """
    function_start = time.time()
    print('Starting:' + '\t' + 'date: ' + '\t' + time.ctime())

    # Boolean filtering never mutates p_df, so no defensive deep copy is
    # needed. Rows are accumulated in a list because DataFrame.append was
    # removed in pandas 2.0.
    rows = []
    for i in p_dates:
        temp_i = p_df[p_df['starting_Date'] == i]
        _ct, results = rp.crosstab(temp_i[p_columnDEP], temp_i[p_columnINDEP], test='chi-square')
        rows.append({
            'Starting_Date': i,
            'ChiSq': results.iloc[0, 1],
            'p_value': results.iloc[1, 1],
            'CramerV': results.iloc[2, 1],
        })

    temp2 = pd.DataFrame(rows, columns=['Starting_Date', 'ChiSq', 'p_value', 'CramerV'])
    print('\n' + 'Duration: ' + '\t' + '{:5.3f}s'.format(time.time() - function_start) + '\n')
    return temp2
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

# Load the semicolon-delimited data set.
df = pd.read_csv('imdb_data_clean.csv', delimiter=';')

# CROSSTABULATION: column proportions of sfxgenre by productionlocation,
# together with a chi-square test.
table, results = rp.crosstab(df['sfxgenre'], df['productionlocation'], prop='col', test='chi-square')
print(table)  # Print the crosstab
print()
print(results)  # Print the test statistics table

# STACKED BAR CHART
ct = pd.crosstab(df['productionlocation'], df['sfxgenre'], normalize='index')  # Row-normalised crosstab as basis for the plot
ct.plot.bar(stacked=True)  # Generate the plot
plt.legend(loc='lower left', bbox_to_anchor=(0.0, 1.01), frameon=False)  # Legend above the axes, without frame
plt.xticks(rotation="horizontal")  # Horizontal x-axis labels
plt.xlabel('Production location')  # Label on the x-axis
plt.ylabel('Proportion SFX')  # Label on the y-axis
plt.savefig('stackedbar.pdf')  # Save the figure
plt.clf()  # Clear the figure
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt
from scipy import stats

df = pd.read_csv(
    'http://www.digitalanalytics.id.au/static/files/youtube_vevo_clean.csv',
    delimiter=',')

# Maximum number of rows to show; increase or decrease as needed.
# The fully qualified option name is required: the bare 'max_rows' pattern
# matches more than one option on newer pandas and raises OptionError.
pd.set_option('display.max_rows', 9999)

# CROSSTABULATION: column proportions of view_cat by lyric_video, together
# with a chi-square test.
table, results = rp.crosstab(df['view_cat'], df['lyric_video'], prop='col', test='chi-square')
print(table)  # Print the crosstab
print()
print(results)  # Print the test statistics table

# STACKED BAR CHART
ct = pd.crosstab(df['lyric_video'], df['view_cat'], normalize='index')  # Row-normalised crosstab as basis for the plot
ct.plot.bar(stacked=True)  # Generate the plot
plt.legend(loc='lower left', bbox_to_anchor=(0.0, 1.01), frameon=False)  # Legend above the axes, without frame
plt.xticks(rotation="horizontal")  # Horizontal x-axis labels
plt.xlabel('Lyric video')  # Label on the x-axis
plt.ylabel('Viewer categories')  # Label on the y-axis
# print(crosstab) # print(res) dfNormailTrail = df[df['noiseNumber'] != 'special'] dfSpecialTrail = df[df['noiseNumber'] == 'special'] statDF = pd.DataFrame() statDF['firstIntentionConsistFinalGoalNormal'] = dfNormailTrail.groupby( 'name')["firstIntentionConsistFinalGoal"].mean() statDF['firstIntentionConsistFinalGoalSpecail'] = dfSpecialTrail.groupby( 'name')["firstIntentionConsistFinalGoal"].mean() # chi-squre resultDf = dfSpecialTrail crosstab, res = researchpy.crosstab( resultDf['participantsType'], resultDf['firstIntentionConsistFinalGoal'], test="chi-square") print(crosstab) print(res) # χ2 (2) = 14.93, P < 0.001, VCramer = 0.50 # fisher # resultDf = resultDf[resultDf['participantsType'] != "machine"] # crosstab, res = researchpy.crosstab(resultDf['participantsType'], resultDf['firstIntentionConsistFinalGoal'], test='fisher') # print(crosstab) # print(res) # df['trialType'] = [ 'Critical Disruption' if trial == "special" else 'Random Disruptions'
participants = ['human', 'RL']
dataPaths = [os.path.join(resultsPath, participant) for participant in participants]
# One concatenated frame per participant folder, then one frame overall.
dfList = [pd.concat(map(pd.read_csv, glob.glob(os.path.join(dataPath, '*.csv'))), sort=False) for dataPath in dataPaths]
df = pd.concat(dfList)
df['participantsType'] = ['machine' if 'max' in name else 'human' for name in df['name']]

# SECURITY NOTE: eval() executes whatever text is stored in the CSV cells.
# If these files can come from an untrusted source, switch to
# ast.literal_eval.
df['isDecisionStepInZone'] = df.apply(lambda x: isDecisionStepInZone(eval(x['trajectory']), eval(x['target1']), eval(x['target2']), x['decisionSteps']), axis=1)

# .copy() makes the filtered frame independent, so adding a column below
# does not assign onto a slice of df (SettingWithCopyWarning).
dfExpTrail = df[(df['targetDiff'] == 0) & (df['conditionName'] == 'expCondition') & (df['decisionSteps'] != 10) & (df['decisionSteps'] != 6)].copy()
dfExpTrail['hasAvoidPoint'] = dfExpTrail.apply(lambda x: isTrajHasAvoidPoints(eval(x['trajectory']), eval(x['aimAction']), eval(x['playerGrid']), eval(x['target1']), eval(x['target2']), x['decisionSteps'], x['conditionName']), axis=1)

# NOTE(review): 'hasAvoidPoint' is created on dfExpTrail only; unless the CSV
# files already carry that column, selecting it from df raises KeyError.
# Presumably this should filter dfExpTrail instead - confirm.
resultDf = df[df['participantsType'] == 'human']
crosstab, res = researchpy.crosstab(resultDf['hasAvoidPoint'], resultDf['decisionSteps'], test="chi-square")
print(crosstab)
print(res)

# human vs machine in diff decision steps[2,4,6]
# decisionStep = 4
# dfExpTrail = df[(df['decisionSteps'] == decisionStep) & (df['targetDiff'] == 0) & (df['conditionName'] == 'expCondition')]
# dfExpTrail['hasAvoidPoint'] = dfExpTrail.apply(lambda x: isTrajHasAvoidPoints(eval(x['trajectory']), eval(x['aimAction']), eval(x['playerGrid']), eval(x['target1']), eval(x['target2']), x['decisionSteps'], x['conditionName']), axis=1)
resultDf = dfExpTrail
crosstab, res = researchpy.crosstab(resultDf['participantsType'], resultDf['isDecisionStepInZone'], test="chi-square")
print(crosstab)
print(res)

df["firstIntentionConsistFinalGoal"] = df.apply(lambda x: calculateFirstIntentionConsistency(eval(x['goal'])), axis=1)
# %% #chi-square analysis chi_exp = experiment_results.apply(share.count_frequencies, axis=1)[10:13] chi_control = control_results.apply(share.count_frequencies, axis=1)[10:13] chi_exp_ag = chi_exp.sum(axis=0) chi_control_ag = chi_control.sum(axis=0) chi = pd.concat([chi_exp_ag, chi_control_ag], axis=1, keys=['exp', 'con']) chi2, p, dof, expected = chi2_contingency(chi) # %% #draw bar chart for one variable data = chi_exp_ag.to_dict() names = list(data.keys()) values = list(data.values()) fig, axs = plt.subplots(figsize=(5, 3)) axs.bar(names, values) #%%c #check the chi-sqaure between perception1 and perception2 crosstab, res, expected = researchpy.crosstab( experiment_results.loc['perception1'], experiment_results.loc['perception2'], test='chi-square', expected_freqs=True) # %%
# Replace 'campaign' with its natural log; infinities produced by log(0)
# are mapped to 0.
transformed_data = transformed_data.drop(['campaign'],axis = 1)
transformed_data['campaign'] = np.log(cleaned_data['campaign']).replace([np.inf, -np.inf], 0)
transformed_data['campaign'].skew()
for i in numeric_col:
    print (i,':',transformed_data[i].skew())

# Bivariate analysis
## Categorical variables: chi-square test of each feature against the target 'y'
import researchpy as rp
category=['job','education','marital','default','housing','loan','contact','month','poutcome']
for c in category :
    table, results = rp.crosstab(data[c],data['y'], test= 'chi-square')
    print(results)
    print('='*45)

# Crosstab bar charts for the categorical variables
for c in category :
    table = pd.crosstab(data[c],data['y'])
    table.plot(kind='bar')

# Numeric variables: correlation heatmap.
# Non-numeric columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently and DataFrame.corr() raises on string data.
corelation = data.select_dtypes(include=['number', 'bool']).corr()
# NOTE(review): plt.subplots returns (figure, axes), so "ax" is a tuple here;
# the heatmap below draws on the current axes it created.
ax=plt.subplots(figsize=(9,7))
sns.heatmap(corelation,annot = True)

### Multivariate analysis
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare a proposer metric of *treatment* with the "t10a" reference.

    Parameters
    ----------
    treatment : str
        Treatment id passed to ``get_prop_resp``.
    con, dfs, use_percentage, use_labels
        Not used by this function.
    metric : callable
        Called as ``metric(df_prop)``; its ``__name__`` is printed.
    as_percentage : bool
        Format the mean values as percentages.
    is_categorical : bool
        Fisher test via ``rp.crosstab`` when true, independent t-test otherwise.
    alternative : {None, 'two-sided', 'greater', 'less'}
        Selects which t-test p-value is reported.

    Returns
    -------
    dict
        Formatted mean values plus one entry, keyed by the test label, with
        the statistic, p-value and effect size.
    """
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t10, df_resp_t10 = get_prop_resp("t10a")

    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)

    metric_t10_values = metric(df_prop_t10)
    metric_value_t10 = metrics.get_mean(metric_t10_values)

    metric_values = metrics.get_data(metric_values)
    metric_t10_values = metrics.get_data(metric_t10_values)

    dof = 0
    diff = None

    print(metric.__name__)
    if is_categorical:
        table, res = rp.crosstab(pd.Series(metric_values), pd.Series(metric_t10_values), test='fisher')
        s = res.results[0]
        p = res.results[1]
        r = res.results[4]
        # NOTE(review): the label says g-test although the Fisher test is run;
        # kept unchanged because it is a key of the returned dict.
        test_label = f"(g-test chi2)"
        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(metric_t10_values), np.std(metric_t10_values), np.mean(metric_values), np.std(metric_values), s, p, dof, diff=diff, label1="t10a.dss", label2=treatment + ".dss"))
        print(
            pd.crosstab(pd.Series(metric_t10_values), pd.Series(metric_values)))
    else:
        table, res = rp.ttest(pd.Series(metric_t10_values), pd.Series(metric_values), paired=False)
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        r = res.results[9]
        # Two-sided by default. A one-sided request selects the matching row;
        # previously that choice was overwritten by the two-sided value right
        # after it was made.
        p = res.results[3]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(metric_t10_values), np.std(metric_t10_values), np.mean(metric_values), np.std(metric_values), s, p, dof, diff=diff, label1="t10a.dss", label2=treatment + ".dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"

    print("TABLE: ", table)
    print("TEST: ", res)

    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * metric_value:.2f} %',
            "T10": f'{100 * metric_value_t10:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{metric_value:.2f}',
            "T10": f'{metric_value_t10:.2f}',
        }

    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
ctype_type = pd.crosstab(index=train_cat_df["Cover_Type"], columns=train_cat_df["Wilderness_Area1"]) ctype_type # In[26]: stats.chi2_contingency(ctype_type) # In[27]: table, results = rp.crosstab(train_cat_df["Cover_Type"], train_cat_df["Wilderness_Area1"], prop= 'col', test= 'chi-square') table # In[28]: print(results) # In[29]: for col in train_cat_df.columns[:-1]: table, results = rp.crosstab(train_cat_df["Cover_Type"], train_cat_df[col], prop= 'col', test= 'chi-square') if(results.results[1]<0.05):
def cramer(df):
    """Cross-tabulate ``df['image']`` against ``df['type']`` with a chi-square test.

    NOTE(review): no return statement is visible, so as shown the function
    returns None and discards both frames; the body may be truncated - confirm.
    """
    # Column proportions plus chi-square test statistics.
    table, results = rp.crosstab(df['image'], df['type'], prop='col', test='chi-square')
else: return ("N") df_train["Age_above65_female"] = df_train[["Age", "Sex"]].apply(age_female, axis=1) print("變數_Ageabove65_female\n", df_train) # Case03_透過昨天課程的內容,驗證產生的兩個新變數,哪一個和目標變數(Survived_cate) 的相關性較高? # Survived_cate 和 兩變數(Age_above65、Age_above65_female)關係都是離散vs離散,因此採用Cramer’s V 係數 le = preprocessing.LabelEncoder() df_train['Age_above65'] = le.fit_transform(df_train['Age_above65']) above65_table = pd.crosstab(df_train["Age_above65"], df_train['Survived_cate']) df_above65_table = min(above65_table.shape[0], above65_table.shape[1]) - 1 above65_table, res_above65 = researchpy.crosstab(df_train["Age_above65"], df_train['Survived_cate'], test='chi-square') df_train['Age_above65_female'] = le.fit_transform( df_train['Age_above65_female']) above65_female_table = pd.crosstab(df_train["Age_above65_female"], df_train['Survived_cate']) df_above65_female_table = min(above65_female_table.shape[0], above65_female_table.shape[1]) - 1 above65_female_table, res_above65_female = researchpy.crosstab( df_train["Age_above65_female"], df_train['Survived_cate'], test='chi-square') res = {} res["Survived_vs_Age_above65"] = [
from itertools import permutations
import researchpy

# Every ordered pair of the first 19 columns.
# NOTE(review): the column names come from df while the data are read from
# mm - presumably both share the same columns; confirm.
cols = df.columns[:19]
perm = permutations(cols, 2)
# Long-format accumulator: row label, column label, association value.
outP = {"indeX": list(), "colS": list(), "valueS": list()}
ii = 0
# The first 67 pairs are skipped; the reason is not visible in this script.
for hh in list(perm)[67:]:
    col1 = mm[hh[0]]
    col2 = mm[hh[1]]
    # float64 columns are binned into 5 quantile buckets before
    # cross-tabulation.
    if mm.dtypes[hh[0]].name == "float64":
        col1 = pd.qcut(col1, 5)
    if mm.dtypes[hh[1]].name == "float64":
        col2 = pd.qcut(col2, 5)
    a, b = researchpy.crosstab(col1, col2, test="chi-square")
    # Row 2, column 1 of the results frame is stored as the pair's value.
    outP["valueS"].append(b.iloc[2, 1])
    outP["indeX"].append(hh[0])
    outP["colS"].append(hh[1])
    ii += 1
    print(ii)  # progress counter

rouP = pd.DataFrame(outP)
# Pivot to a square matrix; missing cells (including the diagonal, which
# permutations never produces) are filled with 1.
mmK = rouP.pivot_table(index="indeX", columns="colS", values="valueS")
mmK.fillna(1, inplace=True)
mmK.to_pickle("heatmap.pkl")
fig = plt.figure(figsize=(10, 10))
sns.heatmap(mmK, cmap="BuPu")
plt.savefig("heatmap_1.png")
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare the proposer metric with and without DSS within one treatment.

    Parameters
    ----------
    treatment : str
        Treatment id passed to ``get_prop_resp``.
    con, dfs, use_percentage, use_labels, alternative
        Not used by this function.
    metric : callable
        Called as ``metric(min_offer_series, offer_series)``; it is also
        printed.
    as_percentage : bool
        Format the mean values as percentages.
    is_categorical : bool
        McNemar test via ``rp.crosstab`` when true, paired t-test otherwise.

    Returns
    -------
    dict
        Formatted mean values plus one entry, keyed by the test label, with
        the statistic, p-value and effect size.
    """
    df_prop, df_resp = get_prop_resp(treatment)

    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)

    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    auto_dss_values = metric(df_resp["min_offer"], df_prop["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)

    dof = 0
    diff = None

    print(metric)
    if is_categorical:
        # checked using: http://vassarstats.net/propcorr.html
        # The plain pd.crosstab that used to precede this call was
        # overwritten immediately and has been removed.
        table, res = rp.crosstab(prop_values, prop_dss_values, test='mcnemar')
        s, p, r = (res.results.values)
        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values), np.std(prop_values), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1=treatment, label2=treatment+".dss"))
        test_label = f"(mcnemar - chi2)"
    else:
        # Only the paired t-test feeds the report; the Wilcoxon result that
        # used to be computed first was discarded unused and has been removed.
        table, res = rp.ttest(pd.Series(prop_values), pd.Series(prop_dss_values), paired=True)
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]
        test_label = f"(ttest dependent)"
        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values), np.std(prop_values), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1=treatment, label2=treatment+".dss"))

    print("TABLE:", table)
    print("RES:", res)

    if as_percentage:
        res = {
            "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
        }
    else:
        res = {
            "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{prop_dss_value:.2f}',
        }

    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
def mcnemar(self, df):
    """
    Run McNemar's test on the 'perm' and 't_test' columns of df and
    return the results frame produced by rp.crosstab.
    """
    _crosstab, test_results = rp.crosstab(
        df['perm'],
        df['t_test'],
        test='mcnemar',
    )
    return test_results
dv = "Age" between = "Survived_cate" aov = pg.anova(dv=dv, between=between, data=df_train, detailed=True) etaSq = aov.SS[0] / (aov.SS[0] + aov.SS[1]) print("Q1: 透過數值法計算 Age 和 Survived 是否有相關性? A:連續與離散") print("Eta Squared (η2)結果:%.3f 相關性 %s" % (etaSq, valiate_etaSq(etaSq))) # print("Cramer's Values 結果 ", res.loc[2, 'results'], valiate_etaSq(res.loc[2, 'results'])) dt = 'Survived_cate' between = 'Sex' # step1: 用交叉列連表(contingency table),來整理兩個類別型的資料 contTable = pd.crosstab(df_train[between], df_train[dt]) df = min(contTable.shape[0], contTable.shape[1]) - 1 crosstab, res = researchpy.crosstab(df_train[between], df_train[dt], test='chi-square') print() print("Q2:透過數值法計算 Sex 和 Survived 是否有相關性? A:離散與離散") print("Cramer's 相關性%.3f 結果 %s" % (res.loc[2, 'results'], judgment_CramerV(df, res.loc[2, 'results']))) print() # 連續與連續 Pearson def judgment_PearsonV(corr): if corr < .1: qual = "無線性相關" elif corr < .4: qual = "低度線性相關" elif corr < .7:
McNemar's Test """ xls = pd.ExcelFile("formcnemar18.xlsx") sports = xls.sheet_names df = {sport: xls.parse(sport) for sport in sports} rankings = df["NHL"].columns.tolist()[-15:-1] # mcnemar by league mcnemar_league = {} for sport in sports: mcnemar_league[sport] = np.ones([len(rankings), len(rankings), 2]) for i, r1 in enumerate(rankings): for j, r2 in enumerate(rankings): if r1 != r2: _, res = rp.crosstab(df[sport][r1], df[sport][r2], test="mcnemar") chisq, p, _ = res.results.values mcnemar_league[sport][i, j] = np.array([chisq, p]) # write to excel writer = pd.ExcelWriter('mcnemar_league.xlsx', engine='xlsxwriter') # define sheet names chisq_sheets = [sport + "_chisq" for sport in sports] p_sheets = [sport + "_p-value" for sport in sports] # loop for each sport for sport, chisq_sheet, p_sheet in zip(sports, chisq_sheets, p_sheets): # chi_sq chisq_df = pd.DataFrame(mcnemar_league[sport][:, :, 0], index=rankings,
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare proposer+DSS offers of *treatment* with the "t20a" automatic DSS offers.

    ``metric`` is applied to the DSS minimum offers and the respective offers
    of both groups. A McNemar test (categorical) or an unpaired t-test
    (otherwise) is run and a summary sentence is printed.

    Returns a dict with the formatted means, their difference and one entry,
    keyed by the test label, holding statistic, p-value and effect size.
    """
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t20, df_resp_t20 = get_prop_resp("t20a")

    prop_dss_values = metric(df_resp["min_offer_dss"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    auto_dss_values = metric(df_resp_t20["min_offer_dss"], df_prop_t20["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)

    dof = 0
    diff = None

    if is_categorical:
        # checked using: http://vassarstats.net/propcorr.html
        _table, test_res = rp.crosstab(prop_dss_values, auto_dss_values, test='mcnemar')
        stat, p_value, effect = test_res.results.values
        test_label = f"(mcnemar) H0: equal, Ha: {'two-sided'}"
        sentence = generate_cat_stat_sentence(
            np.mean(prop_dss_values), np.std(prop_dss_values),
            np.mean(auto_dss_values), np.std(auto_dss_values),
            stat, p_value, dof, diff=diff,
            label1=treatment + ".dss", label2="t20.dss")
        print("Conclusion: ", sentence)
    else:
        _table, test_res = rp.ttest(pd.Series(prop_dss_values), pd.Series(auto_dss_values), paired=False)
        test_label = f"(wilcoxon) H0: equal, Ha: {alternative or 'two-sided'}"
        outcome = test_res.results
        diff = outcome[0]
        dof = outcome[1]
        stat = outcome[2]
        p_value = outcome[3]
        effect = outcome[9]
        sentence = generate_stat_sentence(
            np.mean(prop_dss_values), np.std(prop_dss_values),
            np.mean(auto_dss_values), np.std(auto_dss_values),
            stat, p_value, dof, diff=diff,
            label1=treatment + ".dss", label2="t20.dss")
        print("Conclusion: ", sentence)

    summary = {}
    if as_percentage:
        summary["Proposer + DSS"] = f'{100 * prop_dss_value:.2f} %'
        summary["T20 Auto DSS"] = f'{100 * auto_dss_value:.2f} %'
        summary["prop:dss - auto prop"] = f'{100 * (prop_dss_value - auto_dss_value):.2f} %'
    else:
        summary["Proposer + DSS"] = f'{prop_dss_value:.2f}'
        summary["T20 Auto DSS"] = f'{auto_dss_value:.2f}'
        summary["prop:dss - auto prop"] = f'{(prop_dss_value - auto_dss_value):.2f} %'

    effect_name = "phi" if is_categorical else "r"
    summary[test_label] = f"{stat:.3f} (p: {p_value:.3f}, {effect_name}: {effect:.3f})"
    return summary
def get_info_accuracy(treatment, con, dfs=None, use_percentage=None, use_labels=None):
    """Compare the feedback ratings of *treatment* with those of its reference treatment.

    The reference is "t12a" for t13/t13a, "t10b" for t11a/t11b and the
    treatment itself otherwise. Depending on the module-level SELECTION the
    proposers' 'feedback_accuracy' or the responders' 'feedback_fairness'
    column is used, mapped through AI_FEEDBACK_ACCURACY_SCALAS_REV. A g-test
    and a Mann-Whitney U test are run and printed.

    Returns a dict with the two formatted means and one entry holding the
    g-test statistic, p-value and effect size.
    """
    if treatment in ("t13a", "t13"):
        ref = "t12a"
    elif treatment in ("t11a", "t11b"):
        ref = "t10b"
    else:
        ref = treatment

    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp(ref)

    use_proposer = SELECTION == "prop"
    source, source_ref = (df_prop, df_prop_ref) if use_proposer else (df_resp, df_resp_ref)
    column = "feedback_accuracy" if use_proposer else "feedback_fairness"
    values = source[column]
    values_ref = source_ref[column]

    def to_scale(raw):
        # Unknown labels pass through unchanged.
        return AI_FEEDBACK_ACCURACY_SCALAS_REV.get(raw, raw)

    values_ref = values_ref.apply(to_scale)
    values = values.apply(to_scale)

    print("MEDIAN: ", values.median(), values_ref.median())

    dof = 0
    diff = 0

    table, test_res = rp.crosstab(pd.Series(values), pd.Series(values_ref), test='g-test')
    stat, p_value, effect = test_res.results.values

    rank_test = stats.mannwhitneyu(values, values_ref, use_continuity=False)
    print("TMP values: ", rank_test)

    sentence = generate_stat_sentence(
        np.mean(values_ref), np.std(values_ref),
        np.mean(values), np.std(values),
        stat, p_value, dof, diff=diff,
        label1="t12.dss", label2=treatment+".dss")
    print("Conclusion: ", sentence)
    print("Table:", table)
    print("Res:", test_res)

    means = {
        "rel. min_offer T12": metrics.get_mean(values_ref),
        "rel. min_offer T13": metrics.get_mean(values),
    }
    test_label = f"(ttest independent) H0: equal"

    summary = {}
    for key, mean in means.items():
        # Non-integral, non-null means are shown with three decimals.
        if pd.notnull(mean) and mean != int(mean):
            summary[key] = f"{mean:.3f}"
        else:
            summary[key] = mean
    summary["min_offer" + test_label] = f"{stat:.3f} (p: {p_value:.3f}, r: {effect:.3f})"
    return summary
# Commitment share is the complement of the avoid-commitment share.
statDF['ShowCommitmentPercent'] = statDF.apply(
    lambda x: 1 - x['avoidCommitPercent'], axis=1)

statDF = statDF.reset_index()
# Participants whose name contains 'max' are labelled as RL agents.
statDF['participantsType'] = [
    'RL Agent' if 'max' in name else 'Human' for name in statDF['name']
]
# statDF['sem'] = df.groupby(['participantsType', 'decisionSteps'])["avoidCommitPercent"].apply(calculateSE)

# Keep human participants only.
statDF = statDF[statDF['participantsType'] == 'Human']
# statDF = statDF[statDF['participantsType'] == 'RL Agent']

# Chi-square crosstab of avoid-point occurrence by decision step; only the
# table is printed, the test results in res are not used below.
crosstab, res = researchpy.crosstab(dfExpTrail['hasAvoidPoint'], dfExpTrail['decisionSteps'], test="chi-square")

print(crosstab)

# Compute the two-way mixed-design ANOVA (disabled while calAnova is 0).
calAnova = 0
if calAnova:
    import pingouin as pg
    aov = pg.mixed_anova(dv='ShowCommitmentPercent', within='decisionSteps', between='participantsType', subject='name', data=statDF)
    pg.print_table(aov)
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare a proposer metric of *treatment* with the "t11a" reference.

    Parameters
    ----------
    treatment : str
        Treatment id passed to ``get_prop_resp``.
    con, dfs, use_percentage, use_labels
        Not used by this function.
    metric : callable
        Called as ``metric(df_prop)``; its ``__name__`` is printed.
    as_percentage : bool
        Format the mean values as percentages.
    is_categorical : bool
        Chi-square test when true, independent t-test otherwise.
    alternative : {None, 'two-sided', 'greater', 'less'}
        Selects which t-test p-value is reported.

    Returns
    -------
    dict
        Formatted mean values, their difference and one entry, keyed by the
        test label, with the statistic, p-value and effect size.
    """
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t11a")

    print(metric.__name__)
    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)

    metric_ref_values = metric(df_prop_ref)
    metric_value_ref = metrics.get_mean(metric_ref_values)

    metric_values = metrics.get_data(metric_values)
    metric_ref_values = metrics.get_data(metric_ref_values)

    dof = 0
    diff = None

    if is_categorical:
        table, res = rp.crosstab(pd.Series(metric_ref_values), pd.Series(metric_values), test='chi-square')
        s, p, r = res.results.values
        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(metric_ref_values), np.std(metric_ref_values), np.mean(metric_values), np.std(metric_values), s, p, dof, diff=diff, label1="t11a.dss", label2=treatment + ".dss"))
        test_label = f"(pearson chi2)"
    else:
        table, res = rp.ttest(pd.Series(metric_ref_values), pd.Series(metric_values), paired=False)
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        r = res.results[9]
        # Two-sided by default. A one-sided request selects the matching row;
        # previously that choice was overwritten by the two-sided value right
        # after it was made.
        p = res.results[3]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(metric_ref_values), np.std(metric_ref_values), np.mean(metric_values), np.std(metric_values), s, p, dof, diff=diff, label1="t11a.dss", label2=treatment + ".dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"

    print("RESUME: ", res)
    print("TABLE: ", table)

    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * metric_value:.2f} %',
            "T11A ": f'{100 * metric_value_ref:.2f} %',
            "prop:dss - auto prop": f'{100 * (metric_value - metric_value_ref):.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{metric_value:.2f}',
            "T11A": f'{metric_value_ref:.2f}',
            "prop:dss - auto prop": f'{(metric_value - metric_value_ref):.2f} %',
        }

    # The earlier unconditional res[test_label] assignment was always
    # overwritten by one of the two branches below and has been removed.
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
plt.ylabel('PDF(IMDB_Class Rating)')
plt_params_x = imdb_db.class_rating.value_counts().index
plt_params_y = imdb_db.class_rating.value_counts().values
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sb.barplot(x=plt_params_x, y=plt_params_y)
imdb_db.class_rating.value_counts()


# In[35]:


# sb.barplot(imdb_db.class_rating.values, hue='color', data=imdb_db)


# In[36]:


import researchpy as rp
from scipy import stats

table, results, expected = rp.crosstab(imdb_db['color'], imdb_db['class_rating'], test='chi-square', expected_freqs=True)
table, results

# Chi-square test of independence between the categorical attribute 'color'
# (colour vs. black and white) and 'class_rating'.
# A p-value <= 0.05 rejects the null hypothesis of independence, i.e. 'color'
# is associated with the class rating. So we keep the attribute 'color' and
# move ahead.


# In[37]:


imdb_db.columns


# In[38]:


# import researchpy as rp
# from scipy import stats
# table, results, expected = rp.crosstab(imdb_db['aspect_ratio'], imdb_db['class_rating'], test= 'chi-square', expected_freqs=True)
# table , results
qual = 'Small' elif etaSq < .14: qual = 'Medium' else: qual = 'Large' return (qual) print(" Age 和 Survived 是否有相關性?", judgment_etaSq(etaSq)) # Case02_透過數值法計算 Sex 和 Survived_cate 是否有相關性? contTable = pd.crosstab(df_train['Sex'], df_train['Survived_cate']) print(contTable) df = min(contTable.shape[0], contTable.shape[1]) - 1 crosstab, res = researchpy.crosstab(df_train['Sex'], df_train['Survived_cate'], test='chi-square') print("Cramer's value is", res.loc[2, 'results']) ## 寫一個副程式判斷相關性的強度 def judgment_CramerV(df, V): if df == 1: if V < 0.10: qual = 'negligible' elif V < 0.30: qual = 'small' elif V < 0.50: qual = 'medium' else: qual = 'large'