예제 #1
0
	def chi_squared_test(self, p_columnDEP, p_columnINDEP, p_maxNumAttributes = 6):
		"""Run a chi-square test on two columns and assemble a printable report.

		Args:
			p_columnDEP: named pandas Series used as the first (row) variable.
			p_columnINDEP: named pandas Series used as the second (column) variable.
			p_maxNumAttributes: when the attribute count exceeds this value, the
				frequency tables are replaced by a "not plotted" marker line.

		Returns:
			A list mixing separator/heading strings with the report parts:
			observed, relative and expected frequency tables, the test results
			and the seaborn bar plot object. In the "too many attributes" case
			the three tables are replaced by a marker string.
		"""
		dep_name = p_columnDEP.name
		indep_name = p_columnINDEP.name

		tableREL, results, exp = rp.crosstab(p_columnDEP, p_columnINDEP, prop='col', test='chi-square', expected_freqs=True)
		tableABS = rp.crosstab(p_columnDEP, p_columnINDEP)

		ct = pd.crosstab(p_columnDEP, p_columnINDEP, normalize='columns').reset_index()
		ct = ct.transpose()

		stacked = ct.stack().reset_index().rename(columns={0: 'value'})
		# Plain bracket indexing replaces the former eval()-built expressions:
		# it also works for column names that are not valid identifiers and
		# never executes text derived from the data.
		# Drop the row that reset_index()/transpose() produced for the
		# dependent column's own label.
		stacked = stacked[stacked[indep_name] != dep_name]
		stacked = stacked.rename(columns={"level_1": dep_name})

		# NOTE(review): the plot is created even when the report below says
		# "not plotted" -- confirm whether that is intended.
		_myPlot = sns.barplot(x=stacked[indep_name], y=stacked['value'], hue=stacked[dep_name])

		# presumably the "- 1" discounts a totals column added by rp.crosstab -- TODO confirm
		num_attributes = len(tableABS.columns) - 1

		a  = str(num_attributes) + ' attributes'
		a0 = '--------------------------------------------------------------------------------------'
		a1 = '----------------------------------------------------------------- OBSERVED FREQUENCIES'
		a2 = '------------------------------------------------------ OBSERVED FREQUENCIES (relative)'
		a3 = '----------------------------------------------------------------- EXPECTED FREQUENCIES'
		a4 = '------------------------------------------------------------------------- TEST RESULTS'
		a5 = '--------------------------------------------------------------------------------- PLOT'

		if num_attributes > p_maxNumAttributes:
			b1 = '--- not plotted --- number of attributes with ' + str(num_attributes) + ' is too big ---------------------------'
			return [a, a0, a1, a0, b1, a2, b1, a3, b1, a4, results, a5, _myPlot]
		else:
			return [a, a0, a1, a0, tableABS, a0, a2, a0, tableREL, a0, a3, a0, round(exp,0), a0, a4, a0, results, a0, a5, a0, _myPlot]
예제 #2
0
def RQ1(vul_dict):
    """Print the contingency table of client-side vs. fixing-release update types.

    For every vulnerability entry, the fixing release type is counted, and each
    client whose ``adoption`` is set contributes one (fix_landing,
    client_landing) pair. The pairs are cross-tabulated with a chi-square test
    and the table, the test results and the per-release-type counter are
    printed.

    Args:
        vul_dict: mapping whose values provide ``['package']['release_type']``
            and a ``['client']`` iterable of dicts with an ``'adoption'`` key.

    Raises:
        KeyError: if a release type is not one of 'major', 'minor', 'patch'.
    """
    frame = {'fix_landing': [], 'client_landing': []}
    counter = {'major': 0, 'minor': 0, 'patch': 0}
    for vul in vul_dict:
        entry = vul_dict[vul]
        fix_landing = entry['package']['release_type']
        counter[fix_landing] += 1
        for client in entry['client']:
            adoption = client['adoption']
            # Identity check instead of "!= None": None is a singleton.
            if adoption is None:
                continue
            frame['fix_landing'].append(fix_landing + '_landing')
            # 'removal' is kept verbatim; every other adoption gets a suffix.
            frame['client_landing'].append(
                adoption + '_client_landing'
                if adoption != 'removal' else 'removal')
    df = pd.DataFrame(frame)
    crosstab, res = researchpy.crosstab(df['client_landing'],
                                        df['fix_landing'],
                                        test='chi-square')
    print('=' * 50)
    print(
        '(RQ1) A contingency table shows the frequency distribution of client-side fixing release update for each fixing release update.'
    )
    print('=' * 50)
    print(crosstab)
    print(res)
    print(counter)
    print('=' * 50)
    print()
예제 #3
0
def do_different_samples_match(test):
    """Compare every pair of 600-row chunks of each representation frame.

    The first 6000 rows of each representation's ``repr_df`` are cut into ten
    consecutive chunks of 600 rows; each unordered pair of distinct chunks is
    compared with the procedure selected by ``test`` ('permutation', 'KS' or
    'crosstab') and the outcome is printed.
    """
    core = ThesisCore(in_package=True)
    core.create_reprs(kmer=3)
    # (chunk index, first row) for each of the ten chunks.
    chunk_starts = list(enumerate(range(0, 6000, 600)))
    for _name, representation in core._repr_core.reprs.items():
        frame = representation.repr_df.copy()
        for indi, i in chunk_starts:
            # Only pairs with indj > indi are visited.
            for indj, j in chunk_starts[indi + 1:]:
                temp_df1, temp_df2 = frame[i:i + 600], frame[j:j + 600]
                if test == 'permutation':
                    p_value = get_permutation_of_pdists(temp_df1, temp_df2)
                    print(p_value)
                elif test == 'KS':
                    p_value = get_KS_of_pdists(temp_df1, temp_df2)
                    verdict = ('we Cannot reject the H0 that the 2 dists'
                               if p_value > 0.05 else
                               'we Reject the H0 that the 2 dists ')
                    print(verdict, indi, indj, 'are identical with pvalue',
                          p_value)
                elif test == 'crosstab':
                    # NOTE(review): other callers in this file unpack the
                    # crosstab result as a tuple, so the string index below
                    # presumably fails -- confirm the intended lookup.
                    res = researchpy.crosstab(temp_df1,
                                              temp_df2,
                                              test="chi-square")
                    print(res)
                    print(res['p-value'])
예제 #4
0
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare the treatment's DSS-assisted proposer metric with the "t12a" reference.

    ``metric`` is applied to the responder minimum offers and the proposer
    ``offer_dss`` column of both the treatment and the "t12a" reference. The
    two value sets are compared with a chi-square crosstab (categorical) or an
    independent t-test (otherwise), a conclusion sentence is printed, and a
    summary dict is returned.

    Args:
        treatment: treatment identifier passed to ``get_prop_resp``.
        con, dfs, use_percentage, use_labels: accepted for interface
            compatibility; not read here.
        metric: callable ``metric(min_offers, offers)``.
        as_percentage: format the two means as percentages when truthy.
        is_categorical: selects the chi-square path when truthy.
        alternative: None / 'two-sided', 'greater' or 'less'; selects which
            t-test p-value is reported.

    Returns:
        dict with the keys "Proposer + DSS", "T10" and one test-label key.

    Raises:
        ValueError: for an unrecognised ``alternative`` on the t-test path.
    """
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp("t12a")

    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    prop_values_ref = metric(df_resp_ref["min_offer"], df_prop_ref["offer_dss"])
    prop_value_ref = metrics.get_mean(prop_values_ref)

    dof = 0
    diff = None
    if is_categorical:
        table, res = rp.crosstab(pd.Series(prop_values_ref), pd.Series(prop_dss_values), test='chi-square')
        s, p, r = res.results.values

        test_label = "chi2"
        # The sentence previously referenced undefined names (NameError); it
        # now describes the two samples that were actually tested, in the
        # same order as the test call (reference first).
        print("Conclusion: ", generate_cat_stat_sentence(np.mean(prop_values_ref), np.std(prop_values_ref), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1="t12a.dss", label2=treatment+".dss"))
    else:
        table, res = rp.ttest(pd.Series(prop_values_ref), pd.Series(prop_dss_values), paired=False)
        # The results column is read by position: 0 difference, 1 degrees of
        # freedom, 2 statistic, 3-5 p-values, 9 effect size.
        # presumably matches researchpy's ttest result layout -- TODO confirm
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        else:
            # Previously `p` stayed unbound and failed later with a NameError.
            raise ValueError(f"unsupported alternative: {alternative!r}")
        r = res.results[9]

        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values_ref), np.std(prop_values_ref), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1="t12a.dss", label2=treatment+".dss"))

        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("RESUME: ", res)
    # NOTE(review): the reference mean is reported under the key "T10"
    # although the reference treatment is "t12a" -- key kept for callers.
    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
            "T10": f'{100 * prop_value_ref:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T10": f'{prop_value_ref:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
예제 #5
0
def cramersV_matrix(catFeat, X):
    '''
    Build the square association matrix for the categorical features in
        catFeat: entry (i, j) is the value at row 2, column 1 of the
        chi-square results that rp.crosstab yields for X[catFeat[i]]
        against X[catFeat[j]] (labelled Cramer's V by the original author)
    '''
    def _association(row_feat, col_feat):
        # Only the results frame is needed; the crosstab itself is dropped.
        _table, stats = rp.crosstab(X[row_feat], X[col_feat], test='chi-square')
        return stats.iloc[2, 1]

    flat_values = [
        _association(row_feat, col_feat)
        for row_feat in catFeat
        for col_feat in catFeat
    ]
    # One row per feature; the column count is inferred.
    return np.array(flat_values).reshape((len(catFeat), -1))
예제 #6
0
	def chi_square_test_for_each_category_level(self, p_columnDEP, p_columnINDEP):
		"""Chi-square test of the dependent column against each dummy-coded level.

		``p_columnINDEP`` is one-hot encoded with ``pd.get_dummies``; every
		resulting indicator column is cross-tabulated against ``p_columnDEP``.

		Args:
			p_columnDEP: named pandas Series (dependent variable).
			p_columnINDEP: pandas Series whose levels are tested one by one.

		Returns:
			DataFrame with the columns AttrCharacteristic (original level
			name), AttrCharacteristicRenamed (sanitised name), ChiSq, p_value
			and CramerV, sorted by CramerV descending with NaN first.
		"""
		g = pd.get_dummies(p_columnINDEP)
		h = pd.DataFrame(p_columnDEP)
		k = h.join(g)
		k = k.reset_index().drop(['index'], axis=1)

		result_columns = ['AttrCharacteristic', 'AttrCharacteristicRenamed', 'ChiSq', 'p_value', 'CramerV']
		# Rows are collected in a list and turned into a frame once:
		# DataFrame.append was removed in pandas 2.0 and copied the whole
		# frame on every iteration.
		rows = []

		# Column 0 is the dependent variable; the rest are the dummy columns.
		for i in range(1, len(k.columns)):
			_from = k.columns[i]
			# Replace every run of characters outside [A-Za-z0-9_] with "_".
			_to = re.sub('[^A-Za-z0-9_]+', '_', k.columns[i])
			k = k.rename(columns={_from: _to})
			_table, results = rp.crosstab(k[p_columnDEP.name], k[k.columns[i]], test='chi-square')
			rows.append({
				'AttrCharacteristic': _from,
				'AttrCharacteristicRenamed': _to,
				'ChiSq': results.iloc[0, 1],
				'p_value': results.iloc[1, 1],
				'CramerV': results.iloc[2, 1],
			})

		temp = pd.DataFrame(rows, columns=result_columns)
		return temp.sort_values(by='CramerV', ascending=False, na_position='first')
예제 #7
0
	def apply_chi_squared_statistic_on_timeline(self, p_df, p_columnDEP, p_columnINDEP, p_dates):
		"""Run one chi-square test per starting date and collect the statistics.

		Args:
			p_df: DataFrame with a 'starting_Date' column plus the two tested columns.
			p_columnDEP: name of the dependent column in ``p_df``.
			p_columnINDEP: name of the independent column in ``p_df``.
			p_dates: iterable of values compared against ``p_df['starting_Date']``.

		Returns:
			DataFrame with one row per date and the columns Starting_Date,
			ChiSq, p_value and CramerV (taken from rows 0-2 of the results).
		"""
		temp = p_df.copy(deep=True)

		function_start = time.time()
		print('Starting:' + '\t' + 'date: ' + '\t' + time.ctime())

		# Rows are gathered in a list and converted once at the end:
		# DataFrame.append was removed in pandas 2.0 and re-copied the frame
		# on every iteration.
		rows = []
		for i in p_dates:
			temp_i = temp[temp['starting_Date'] == i]
			_ct, results = rp.crosstab(temp_i[p_columnDEP], temp_i[p_columnINDEP], test='chi-square')
			rows.append({
				'Starting_Date': i,
				'ChiSq': results.iloc[0, 1],
				'p_value': results.iloc[1, 1],
				'CramerV': results.iloc[2, 1],
			})
		temp2 = pd.DataFrame(rows, columns=['Starting_Date', 'ChiSq', 'p_value', 'CramerV'])

		print('\n' + 'Duration: ' + '\t' + '{:5.3f}s'.format(time.time() - function_start) + '\n')
		return temp2
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

# Script: chi-square crosstab of 'sfxgenre' by 'productionlocation', followed
# by a stacked bar chart of the row-normalised proportions saved as a PDF.

# Load the semicolon-separated input file.
df = pd.read_csv('imdb_data_clean.csv', delimiter=';')

# CROSSTABULATION
# prop='col' requests column proportions; test='chi-square' makes the call
# return the table together with a results frame.
table, results = rp.crosstab(df['sfxgenre'],
                             df['productionlocation'],
                             prop='col',
                             test='chi-square')

print(table)  # Prints crosstab
print()
print(results)  # Prints statistics tab

# STACKED BAR CHART
# Note the swapped argument order compared with the researchpy call above:
# here production location is on the rows so that each bar is one location.
ct = pd.crosstab(df['productionlocation'], df['sfxgenre'],
                 normalize='index')  # Create PD crosstab as basis for plot
ct.plot.bar(stacked=True)  # Generate plot
plt.legend(loc='lower left', bbox_to_anchor=(0.0, 1.01),
           frameon=False)  # Place the frameless legend just above the axes
plt.xticks(rotation="horizontal")  # Horizontal x-axis labels
plt.xlabel('Production location')  # Adding label on x-axis
plt.ylabel('Proportion SFX')  # Adding label on y-axis
plt.savefig('stackedbar.pdf')  # Saving figure
plt.clf()  # Clearing figure
예제 #9
0
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt
from scipy import stats

# Script: chi-square crosstab of 'view_cat' by 'lyric_video', followed by a
# stacked bar chart of the row-normalised proportions.

# Load the comma-separated dataset from its URL.
df = pd.read_csv(
    'http://www.digitalanalytics.id.au/static/files/youtube_vevo_clean.csv',
    delimiter=',')

# Set max of rows to show, in/decrease to needs.
# The fully qualified option name is required: the bare pattern 'max_rows'
# matches more than one option on recent pandas versions and raises
# OptionError.
pd.set_option('display.max_rows', 9999)

# CROSSTABULATION
# prop='col' requests column proportions; test='chi-square' makes the call
# return the table together with a results frame.
table, results = rp.crosstab(df['view_cat'],
                             df['lyric_video'],
                             prop='col',
                             test='chi-square')

print(table)  # Prints crosstab
print()
print(results)  # Prints statistics tab

# STACKED BAR CHART
ct = pd.crosstab(df['lyric_video'], df['view_cat'],
                 normalize='index')  # Create PD crosstab as basis for plot
ct.plot.bar(stacked=True)  # Generate plot
plt.legend(loc='lower left', bbox_to_anchor=(0.0, 1.01),
           frameon=False)  # Place the frameless legend just above the axes
plt.xticks(rotation="horizontal")  # Horizontal x-axis labels
plt.xlabel('Lyric video')  # Adding label on x-axis
plt.ylabel('Viewer categories')  # Adding label on y-axis
예제 #10
0
    # print(crosstab)
    # print(res)

    dfNormailTrail = df[df['noiseNumber'] != 'special']
    dfSpecialTrail = df[df['noiseNumber'] == 'special']

    statDF = pd.DataFrame()
    statDF['firstIntentionConsistFinalGoalNormal'] = dfNormailTrail.groupby(
        'name')["firstIntentionConsistFinalGoal"].mean()
    statDF['firstIntentionConsistFinalGoalSpecail'] = dfSpecialTrail.groupby(
        'name')["firstIntentionConsistFinalGoal"].mean()

    # chi-square
    resultDf = dfSpecialTrail
    crosstab, res = researchpy.crosstab(
        resultDf['participantsType'],
        resultDf['firstIntentionConsistFinalGoal'],
        test="chi-square")

    print(crosstab)
    print(res)
    # χ2 (2) = 14.93, P < 0.001, VCramer = 0.50

    # fisher
    #     resultDf = resultDf[resultDf['participantsType'] != "machine"]
    #     crosstab, res = researchpy.crosstab(resultDf['participantsType'], resultDf['firstIntentionConsistFinalGoal'], test='fisher')
    #     print(crosstab)
    #     print(res)

    #
    df['trialType'] = [
        'Critical Disruption' if trial == "special" else 'Random Disruptions'
예제 #11
0
    participants = ['human', 'RL']

    dataPaths = [os.path.join(resultsPath, participant) for participant in participants]
    dfList = [pd.concat(map(pd.read_csv, glob.glob(os.path.join(dataPath, '*.csv'))), sort=False) for dataPath in dataPaths]

    df = pd.concat(dfList)
    df['participantsType'] = ['machine' if 'max' in name else 'human' for name in df['name']]
    df['isDecisionStepInZone'] = df.apply(lambda x: isDecisionStepInZone(eval(x['trajectory']), eval(x['target1']), eval(x['target2']), x['decisionSteps']), axis=1)

    dfExpTrail = df[(df['targetDiff'] == 0) & (df['conditionName'] == 'expCondition') & (df['decisionSteps'] != 10) & (df['decisionSteps'] != 6)]

    dfExpTrail['hasAvoidPoint'] = dfExpTrail.apply(lambda x: isTrajHasAvoidPoints(eval(x['trajectory']), eval(x['aimAction']), eval(x['playerGrid']), eval(x['target1']), eval(x['target2']), x['decisionSteps'], x['conditionName']), axis=1)

    resultDf = df[df['participantsType'] == 'human']
    crosstab, res = researchpy.crosstab(resultDf['hasAvoidPoint'], resultDf['decisionSteps'], test="chi-square")
    print(crosstab)
    print(res)

    # human vs machine in diff decision steps[2,4,6]
    # decisionStep = 4
    # dfExpTrail = df[(df['decisionSteps'] == decisionStep) & (df['targetDiff'] == 0) & (df['conditionName'] == 'expCondition')]

    # dfExpTrail['hasAvoidPoint'] = dfExpTrail.apply(lambda x: isTrajHasAvoidPoints(eval(x['trajectory']), eval(x['aimAction']), eval(x['playerGrid']), eval(x['target1']), eval(x['target2']), x['decisionSteps'], x['conditionName']), axis=1)

    resultDf = dfExpTrail
    crosstab, res = researchpy.crosstab(resultDf['participantsType'], resultDf['isDecisionStepInZone'], test="chi-square")
    print(crosstab)
    print(res)

    df["firstIntentionConsistFinalGoal"] = df.apply(lambda x: calculateFirstIntentionConsistency(eval(x['goal'])), axis=1)
예제 #12
0
# %%
# Chi-square analysis: per-row frequency counts for the experiment and
# control results; only positions 10-12 of each result are kept.
chi_exp = experiment_results.apply(share.count_frequencies, axis=1)[10:13]
chi_control = control_results.apply(share.count_frequencies, axis=1)[10:13]

# Aggregate the kept rows into one frequency vector per group.
chi_exp_ag = chi_exp.sum(axis=0)
chi_control_ag = chi_control.sum(axis=0)

# Two-column contingency table ('exp' vs. 'con') fed to the chi-square test.
chi = pd.concat([chi_exp_ag, chi_control_ag], axis=1, keys=['exp', 'con'])
chi2, p, dof, expected = chi2_contingency(chi)

# %%
# Draw a bar chart for one variable (the aggregated experiment frequencies).
data = chi_exp_ag.to_dict()
names = list(data.keys())
values = list(data.values())

fig, axs = plt.subplots(figsize=(5, 3))
axs.bar(names, values)

#%%c
# Check the chi-square between perception1 and perception2.
# NOTE: this rebinds `expected` from the chi2_contingency call above.
crosstab, res, expected = researchpy.crosstab(
    experiment_results.loc['perception1'],
    experiment_results.loc['perception2'],
    test='chi-square',
    expected_freqs=True)

# %%
# Replace 'campaign' with its natural log; infinities produced by the log
# are replaced with 0.
transformed_data = transformed_data.drop(['campaign'],axis = 1)
transformed_data['campaign'] = np.log(cleaned_data['campaign']).replace([np.inf, -np.inf], 0)
transformed_data['campaign'].skew()


# Print the skewness of every numeric column after the transformation.
for i in numeric_col:
    print (i,':',transformed_data[i].skew())

# Bivariate analysis

## Categorical variables
import researchpy as rp
category=['job','education','marital','default','housing','loan','contact','month','poutcome']
# Chi-square test of each categorical feature against the target column 'y'.
for c in category :
    table, results = rp.crosstab(data[c],data['y'], test= 'chi-square')
    print(results)
    print('='*45)

# Crosstab bar plots for the categorical features
for c in category :
    table = pd.crosstab(data[c],data['y'])
    table.plot(kind='bar')

# Numeric variables: correlation heatmap
corelation = data.corr()
# NOTE(review): plt.subplots presumably returns a (figure, axes) pair, so
# `ax` would hold that pair rather than an axes object -- confirm.
ax=plt.subplots(figsize=(9,7))
sns.heatmap(corelation,annot = True)


### Multivariate analysis
예제 #14
0
def _get_prop_vs_prop_dss_score(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None,
                                metric=None,
                                as_percentage=None,
                                is_categorical=None,
                                alternative=None):
    """Compare a treatment's proposer metric against the "t10a" reference.

    ``metric`` is applied to the proposer frames of ``treatment`` and of
    "t10a". The two value sets are compared with a Fisher crosstab
    (categorical) or an independent t-test (otherwise), a conclusion sentence
    and the raw test output are printed, and a summary dict is returned.

    Args:
        treatment: treatment identifier passed to ``get_prop_resp``.
        con, dfs, use_percentage, use_labels: accepted for interface
            compatibility; not read here.
        metric: callable applied to a proposer frame; its ``__name__`` is printed.
        as_percentage: format the two means as percentages when truthy.
        is_categorical: selects the crosstab path when truthy.
        alternative: None / 'two-sided', 'greater' or 'less'; selects which
            t-test p-value is reported.

    Returns:
        dict with the keys "Proposer + DSS", "T10" and one test-label key.

    Raises:
        ValueError: for an unrecognised ``alternative`` on the t-test path.
    """
    df_prop, _df_resp = get_prop_resp(treatment)
    df_prop_t10, _df_resp_t10 = get_prop_resp("t10a")

    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)

    metric_t10_values = metric(df_prop_t10)
    metric_value_t10 = metrics.get_mean(metric_t10_values)

    # From here on the names hold the raw data rather than the metric output.
    metric_values = metrics.get_data(metric_values)
    metric_t10_values = metrics.get_data(metric_t10_values)

    dof = 0
    diff = None

    print(metric.__name__)
    if is_categorical:
        table, res = rp.crosstab(pd.Series(metric_values),
                                 pd.Series(metric_t10_values),
                                 test='fisher')
        # Positions 0, 1 and 4 of the results column are used as statistic,
        # p-value and effect size.
        # presumably odds ratio / two-sided p / phi -- TODO confirm
        s = res.results[0]
        p = res.results[1]
        r = res.results[4]

        # NOTE(review): the label still says "g-test" although the test run
        # above is 'fisher'; the key is kept unchanged for existing callers.
        test_label = "(g-test chi2)"

        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(metric_t10_values),
                                       np.std(metric_t10_values),
                                       np.mean(metric_values),
                                       np.std(metric_values),
                                       s,
                                       p,
                                       dof,
                                       diff=diff,
                                       label1="t10a.dss",
                                       label2=treatment + ".dss"))
        print(
            pd.crosstab(pd.Series(metric_t10_values),
                        pd.Series(metric_values)))
    else:
        table, res = rp.ttest(pd.Series(metric_t10_values),
                              pd.Series(metric_values),
                              paired=False)
        # The results column is read by position: 0 difference, 1 degrees of
        # freedom, 2 statistic, 3-5 p-values, 9 effect size.
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        # The p-value chosen here used to be overwritten by the two-sided
        # value right afterwards, so `alternative` had no effect even though
        # the label below reports it.
        if alternative == "greater":
            p = res.results[4]
        elif alternative == "less":
            p = res.results[5]
        elif alternative in (None, 'two-sided'):
            p = res.results[3]
        else:
            raise ValueError(f"unsupported alternative: {alternative!r}")
        r = res.results[9]

        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(metric_t10_values),
                                   np.std(metric_t10_values),
                                   np.mean(metric_values),
                                   np.std(metric_values),
                                   s,
                                   p,
                                   dof,
                                   diff=diff,
                                   label1="t10a.dss",
                                   label2=treatment + ".dss"))
        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("TABLE: ", table)
    print("TEST: ", res)

    if as_percentage:
        res = {
            "Proposer + DSS": f'{100 * metric_value:.2f} %',
            "T10": f'{100 * metric_value_t10:.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{metric_value:.2f}',
            "T10": f'{metric_value_t10:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
예제 #15
0
# Contingency table of Cover_Type (rows) by Wilderness_Area1 (columns).
ctype_type = pd.crosstab(index=train_cat_df["Cover_Type"], 
                            columns=train_cat_df["Wilderness_Area1"])
# Bare expression: displays the table when run as a notebook cell.
ctype_type


# In[26]:


# Chi-square test computed directly from the contingency table.
stats.chi2_contingency(ctype_type)


# In[27]:


# Same pair of columns through researchpy, with column proportions and a
# chi-square results frame.
table, results = rp.crosstab(train_cat_df["Cover_Type"], train_cat_df["Wilderness_Area1"], prop= 'col', test= 'chi-square')
table


# In[28]:


print(results)


# In[29]:


for col in train_cat_df.columns[:-1]:
    table, results = rp.crosstab(train_cat_df["Cover_Type"], train_cat_df[col], prop= 'col', test= 'chi-square')
    if(results.results[1]<0.05):
def cramer(df):
    """Cross-tabulate df['image'] against df['type'] with a chi-square test.

    NOTE(review): as visible here the function only computes the table and
    results and returns nothing; the snippet may be truncated -- confirm
    against the full source.
    """
    table, results = rp.crosstab(df['image'],
                                 df['type'],
                                 prop='col',
                                 test='chi-square')
예제 #17
0
        else:
            return ("N")


# Derive the "Age_above65_female" flag from the Age and Sex columns, row by row.
df_train["Age_above65_female"] = df_train[["Age", "Sex"]].apply(age_female,
                                                                axis=1)
print("變數_Ageabove65_female\n", df_train)

# Case 03: using the material from yesterday's lesson, check which of the two
# newly created variables is more strongly associated with the target
# variable (Survived_cate).
# Survived_cate versus each of the two variables (Age_above65,
# Age_above65_female) is categorical vs. categorical, so Cramer's V is used.
le = preprocessing.LabelEncoder()
df_train['Age_above65'] = le.fit_transform(df_train['Age_above65'])
above65_table = pd.crosstab(df_train["Age_above65"], df_train['Survived_cate'])
# min(rows, columns) - 1 of the contingency table.
df_above65_table = min(above65_table.shape[0], above65_table.shape[1]) - 1
# Rebinds above65_table to researchpy's table; res_above65 holds the
# chi-square results.
above65_table, res_above65 = researchpy.crosstab(df_train["Age_above65"],
                                                 df_train['Survived_cate'],
                                                 test='chi-square')

# Same procedure for the second derived variable.
df_train['Age_above65_female'] = le.fit_transform(
    df_train['Age_above65_female'])
above65_female_table = pd.crosstab(df_train["Age_above65_female"],
                                   df_train['Survived_cate'])
df_above65_female_table = min(above65_female_table.shape[0],
                              above65_female_table.shape[1]) - 1
above65_female_table, res_above65_female = researchpy.crosstab(
    df_train["Age_above65_female"],
    df_train['Survived_cate'],
    test='chi-square')

res = {}
res["Survived_vs_Age_above65"] = [
예제 #18
0
from itertools import permutations
import researchpy

# Script: chi-square crosstab for ordered pairs of the first 19 columns; the
# value at row 2, column 1 of each results frame is collected into a pivot
# table that is pickled and drawn as a heatmap.
# NOTE(review): column names come from `df` while the data is read from `mm`
# -- confirm both refer to the same frame.
cols = df.columns[:19]
perm = permutations(cols, 2)
outP = {"indeX": list(), "colS": list(), "valueS": list()}
ii = 0
# The first 67 ordered pairs are skipped.
# presumably a resume point from an earlier partial run -- TODO confirm
for hh in list(perm)[67:]:
    col1 = mm[hh[0]]
    col2 = mm[hh[1]]
    # float64 columns are binned into 5 quantile buckets before the crosstab.
    if mm.dtypes[hh[0]].name == "float64":
        col1 = pd.qcut(col1, 5)
    if mm.dtypes[hh[1]].name == "float64":
        col2 = pd.qcut(col2, 5)

    a, b = researchpy.crosstab(col1, col2, test="chi-square")
    outP["valueS"].append(b.iloc[2, 1])
    outP["indeX"].append(hh[0])
    outP["colS"].append(hh[1])
    ii += 1
    print(ii)  # progress counter

rouP = pd.DataFrame(outP)
mmK = rouP.pivot_table(index="indeX", columns="colS", values="valueS")
# Missing cells (including the diagonal, which permutations never yields)
# are filled with 1.
mmK.fillna(1, inplace=True)

mmK.to_pickle("heatmap.pkl")

fig = plt.figure(figsize=(10, 10))
sns.heatmap(mmK, cmap="BuPu")
plt.savefig("heatmap_1.png")
예제 #19
0
def _get_prop_vs_prop_dss_score(treatment, con, dfs=None, use_percentage=None, use_labels=None, metric=None, as_percentage=None, is_categorical=None, alternative=None):
    """Compare the proposer metric without and with DSS inside one treatment.

    ``metric`` is applied to the responder minimum offers paired with the
    proposer ``offer`` and ``offer_dss`` columns. The two paired value sets
    are compared with a McNemar crosstab (categorical) or a paired t-test
    (otherwise), a conclusion sentence and the raw test output are printed,
    and a summary dict is returned.

    Args:
        treatment: treatment identifier passed to ``get_prop_resp``.
        con, dfs, use_percentage, use_labels: accepted for interface
            compatibility; not read here.
        metric: callable ``metric(min_offers, offers)``; it is printed.
        as_percentage: format the two means as percentages when truthy.
        is_categorical: selects the McNemar path when truthy.
        alternative: accepted for interface compatibility; the reported
            values do not depend on it.

    Returns:
        dict with the keys "Proposer", "Proposer + DSS" and one test-label key.
    """
    df_prop, df_resp = get_prop_resp(treatment)

    prop_values = metric(df_resp["min_offer"], df_prop["offer"])
    prop_value = metrics.get_mean(prop_values)

    prop_dss_values = metric(df_resp["min_offer"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    dof = 0
    diff = None

    print(metric)
    if is_categorical:
        # checked using: http://vassarstats.net/propcorr.html
        # The extra pd.crosstab() whose result was immediately overwritten
        # has been dropped.
        table, res = rp.crosstab(prop_values, prop_dss_values, test='mcnemar')
        s, p, r = res.results.values

        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values), np.std(prop_values), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1=treatment, label2=treatment+".dss"))
        test_label = "(mcnemar - chi2)"
    else:
        # A Wilcoxon test used to run here, but its result was overwritten by
        # the t-test values below; the discarded call was removed so it can
        # no longer raise on its own.
        table, res = rp.ttest(pd.Series(prop_values), pd.Series(prop_dss_values), paired=True)
        # The results column is read by position: 0 difference, 1 degrees of
        # freedom, 2 statistic, 3 p-value, 9 effect size.
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]

        test_label = "(ttest dependent)"

        print("Conclusion: ", generate_stat_sentence(np.mean(prop_values), np.std(prop_values), np.mean(prop_dss_values), np.std(prop_dss_values), s, p, dof, diff=diff, label1=treatment, label2=treatment+".dss"))

    print("TABLE:", table)
    print("RES:",  res)
    if as_percentage:
        res = {
            "Proposer": f'{100 * prop_value:.2f} %',
            "Proposer + DSS": f'{100 * prop_dss_value:.2f} %',
        }
    else:
        res = {
            "Proposer": f'{prop_value:.2f}',
            "Proposer + DSS": f'{prop_dss_value:.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
예제 #20
0
 def mcnemar(self, df):
     """
     Run the McNemar test on the 'perm' and 't_test' columns of df and
     return the results object produced by rp.crosstab
     """
     perm_col = df['perm']
     t_test_col = df['t_test']
     # Only the second element (the test results) is handed back.
     _crosstab, outcome = rp.crosstab(perm_col, t_test_col, test='mcnemar')
     return outcome
예제 #21
0

# Q1: continuous (Age) vs. categorical (Survived_cate) -> one-way ANOVA and
# eta squared computed from the first two sums of squares.
dv = "Age"
between = "Survived_cate"
aov = pg.anova(dv=dv, between=between, data=df_train, detailed=True)
etaSq = aov.SS[0] / (aov.SS[0] + aov.SS[1])
print("Q1: 透過數值法計算 Age 和 Survived 是否有相關性? A:連續與離散")
print("Eta Squared (η2)結果:%.3f 相關性 %s" % (etaSq, valiate_etaSq(etaSq)))
# print("Cramer's Values 結果 ", res.loc[2, 'results'], valiate_etaSq(res.loc[2, 'results']))
# Q2: categorical (Sex) vs. categorical (Survived_cate) -> Cramer's V.
dt = 'Survived_cate'
between = 'Sex'
# Step 1: organise the two categorical variables in a contingency table.
contTable = pd.crosstab(df_train[between], df_train[dt])
# min(rows, columns) - 1 of the contingency table; passed to judgment_CramerV.
df = min(contTable.shape[0], contTable.shape[1]) - 1
crosstab, res = researchpy.crosstab(df_train[between],
                                    df_train[dt],
                                    test='chi-square')
print()
print("Q2:透過數值法計算 Sex 和 Survived 是否有相關性? A:離散與離散")
# Row 2 of the 'results' column is reported as Cramer's V.
print("Cramer's 相關性%.3f 結果 %s" %
      (res.loc[2, 'results'], judgment_CramerV(df, res.loc[2, 'results'])))
print()


# 連續與連續 Pearson
def judgment_PearsonV(corr):
    if corr < .1:
        qual = "無線性相關"
    elif corr < .4:
        qual = "低度線性相關"
    elif corr < .7:
예제 #22
0
McNemar's Test
"""
# Load every sheet of the workbook into a dict keyed by sheet (sport) name.
xls = pd.ExcelFile("formcnemar18.xlsx")
sports = xls.sheet_names
df = {sport: xls.parse(sport) for sport in sports}
# The ranking columns are taken from the "NHL" sheet: the last 15 columns
# except the final one.
rankings = df["NHL"].columns.tolist()[-15:-1]

# mcnemar by league
# For each sport, a (ranking x ranking x 2) array holds [chi-square, p-value]
# for every pair of distinct rankings; untouched cells (the diagonal) stay 1.
mcnemar_league = {}
for sport in sports:
    mcnemar_league[sport] = np.ones([len(rankings), len(rankings), 2])
    for i, r1 in enumerate(rankings):
        for j, r2 in enumerate(rankings):
            if r1 != r2:
                _, res = rp.crosstab(df[sport][r1],
                                     df[sport][r2],
                                     test="mcnemar")
                # The third result value is not used here.
                chisq, p, _ = res.results.values
                mcnemar_league[sport][i, j] = np.array([chisq, p])

# write to excel
writer = pd.ExcelWriter('mcnemar_league.xlsx', engine='xlsxwriter')
# define sheet names: one chi-square sheet and one p-value sheet per sport
chisq_sheets = [sport + "_chisq" for sport in sports]
p_sheets = [sport + "_p-value" for sport in sports]

# loop for each sport
for sport, chisq_sheet, p_sheet in zip(sports, chisq_sheets, p_sheets):
    # chi_sq
    chisq_df = pd.DataFrame(mcnemar_league[sport][:, :, 0],
                            index=rankings,
예제 #23
0
def _get_prop_vs_prop_dss_score(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None,
                                metric=None,
                                as_percentage=None,
                                is_categorical=None,
                                alternative=None):
    """Compare the treatment's DSS-assisted proposer metric with the "t20a" AI offers.

    ``metric`` is applied to ``min_offer_dss`` / ``offer_dss`` of the
    treatment and to ``min_offer_dss`` / ``ai_offer`` of "t20a". The two value
    sets are compared with a McNemar crosstab (categorical) or an independent
    t-test (otherwise), a conclusion sentence is printed, and a summary dict
    is returned.

    Args:
        treatment: treatment identifier passed to ``get_prop_resp``.
        con, dfs, use_percentage, use_labels: accepted for interface
            compatibility; not read here.
        metric: callable ``metric(min_offers, offers)``.
        as_percentage: format the values as percentages when truthy.
        is_categorical: selects the McNemar path when truthy.
        alternative: only echoed in the non-categorical test label; the
            reported p-value is always ``res.results[3]``.

    Returns:
        dict with the keys "Proposer + DSS", "T20 Auto DSS",
        "prop:dss - auto prop" and one test-label key.
    """
    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_t20, df_resp_t20 = get_prop_resp("t20a")

    prop_dss_values = metric(df_resp["min_offer_dss"], df_prop["offer_dss"])
    prop_dss_value = metrics.get_mean(prop_dss_values)

    auto_dss_values = metric(df_resp_t20["min_offer_dss"],
                             df_prop_t20["ai_offer"])
    auto_dss_value = metrics.get_mean(auto_dss_values)

    dof = 0
    diff = None

    if is_categorical:
        # checked using: http://vassarstats.net/propcorr.html
        table, res = rp.crosstab(prop_dss_values,
                                 auto_dss_values,
                                 test='mcnemar')
        s, p, r = res.results.values

        test_label = "(mcnemar) H0: equal, Ha: two-sided"

        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(prop_dss_values),
                                       np.std(prop_dss_values),
                                       np.mean(auto_dss_values),
                                       np.std(auto_dss_values),
                                       s,
                                       p,
                                       dof,
                                       diff=diff,
                                       label1=treatment + ".dss",
                                       label2="t20.dss"))
    else:
        table, res = rp.ttest(pd.Series(prop_dss_values),
                              pd.Series(auto_dss_values),
                              paired=False)

        # NOTE(review): the label says "wilcoxon" although the statistics
        # below come from an independent t-test; the key is kept unchanged
        # for existing callers -- confirm and rename together with them.
        test_label = f"(wilcoxon) H0: equal, Ha: {alternative or 'two-sided'}"
        # The results column is read by position: 0 difference, 1 degrees of
        # freedom, 2 statistic, 3 p-value, 9 effect size.
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        p = res.results[3]
        r = res.results[9]

        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(prop_dss_values),
                                   np.std(prop_dss_values),
                                   np.mean(auto_dss_values),
                                   np.std(auto_dss_values),
                                   s,
                                   p,
                                   dof,
                                   diff=diff,
                                   label1=treatment + ".dss",
                                   label2="t20.dss"))

    if as_percentage:
        res = {
            "Proposer + DSS":
            f'{100 * prop_dss_value:.2f} %',
            "T20 Auto DSS":
            f'{100 * auto_dss_value:.2f} %',
            "prop:dss - auto prop":
            f'{100 * (prop_dss_value - auto_dss_value):.2f} %',
        }
    else:
        res = {
            "Proposer + DSS": f'{prop_dss_value:.2f}',
            "T20 Auto DSS": f'{auto_dss_value:.2f}',
            # No " %" suffix here: in this branch the difference is not
            # scaled to a percentage, matching the two values above.
            "prop:dss - auto prop":
            f'{(prop_dss_value - auto_dss_value):.2f}',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
예제 #24
0
def get_info_accuracy(treatment, con, dfs=None, use_percentage=None, use_labels=None):
    """Compare the feedback ratings of ``treatment`` with its reference treatment.

    The reference is ``"t12a"`` for t13/t13a, ``"t10b"`` for t11a/t11b and the
    treatment itself otherwise. When the module-level ``SELECTION`` equals
    ``"prop"`` the proposers' ``feedback_accuracy`` column is used, otherwise
    the responders' ``feedback_fairness`` column. Every rating is looked up in
    ``AI_FEEDBACK_ACCURACY_SCALAS_REV`` (values without an entry are kept
    as-is) and the two samples are cross-tabulated with ``rp.crosstab`` using
    ``test='g-test'``.

    Args:
        treatment: Treatment identifier handed to ``get_prop_resp``.
        con: Unused; accepted for signature compatibility.
        dfs: Unused; accepted for signature compatibility.
        use_percentage: Unused; accepted for signature compatibility.
        use_labels: Unused; accepted for signature compatibility.

    Returns:
        dict: the ``metrics.get_mean`` of the reference and of the treatment
        ratings, plus one entry holding the formatted test statistic, p-value
        and effect size.
    """
    if treatment in ("t13a", "t13"):
        ref = "t12a"
    elif treatment in ("t11a", "t11b"):
        ref = "t10b"
    else:
        ref = treatment

    df_prop, df_resp = get_prop_resp(treatment)
    df_prop_ref, df_resp_ref = get_prop_resp(ref)

    if SELECTION == "prop":
        values = df_prop["feedback_accuracy"]
        values_ref = df_prop_ref["feedback_accuracy"]
    else:
        values = df_resp["feedback_fairness"]
        values_ref = df_resp_ref["feedback_fairness"]

    # Translate the ratings through the reverse scale; unknown values pass through.
    values_ref = values_ref.apply(lambda x: AI_FEEDBACK_ACCURACY_SCALAS_REV.get(x, x))
    values = values.apply(lambda x: AI_FEEDBACK_ACCURACY_SCALAS_REV.get(x, x))

    print("MEDIAN: ", values.median(), values_ref.median())

    # The crosstab test yields no mean difference / degrees of freedom, so
    # neutral placeholders are passed to the sentence generator.
    dof = 0
    diff = 0
    table, res = rp.crosstab(pd.Series(values), pd.Series(values_ref), test='g-test')
    s, p, r = res.results.values

    # Printed for inspection only; it does not feed into the returned dict.
    tmp_res = stats.mannwhitneyu(values, values_ref, use_continuity=False)
    print("TMP values: ", tmp_res)

    # Label the reference sample with the reference actually used (it is not
    # always t12).
    print("Conclusion: ", generate_stat_sentence(np.mean(values_ref), np.std(values_ref), np.mean(values), np.std(values), s, p, dof, diff=diff, label1=ref + ".dss", label2=treatment + ".dss"))

    print("Table:", table)
    print("Res:", res)

    # NOTE(review): the keys below still read "min_offer", "T12"/"T13" and
    # "(ttest independent)" although the values are feedback ratings compared
    # with a G-test. They are left unchanged because callers may look the
    # entries up by key -- confirm and rename together with the consumers.
    res = {
        "rel. min_offer T12": metrics.get_mean(values_ref),
        "rel. min_offer T13": metrics.get_mean(values),
    }
    test_label = "(ttest independent) H0: equal"
    # Format non-integral numbers with three decimals; leave the rest untouched.
    res = {k: (f"{v:.3f}" if pd.notnull(v) and v != int(v) else v) for k, v in res.items()}
    res["min_offer" + test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
    statDF['ShowCommitmentPercent'] = statDF.apply(
        lambda x: 1 - x['avoidCommitPercent'], axis=1)

    statDF = statDF.reset_index()
    statDF['participantsType'] = [
        'RL Agent' if 'max' in name else 'Human' for name in statDF['name']
    ]

    # statDF['sem'] = df.groupby(['participantsType', 'decisionSteps'])["avoidCommitPercent"].apply(calculateSE)

    statDF = statDF[statDF['participantsType'] == 'Human']
    # statDF = statDF[statDF['participantsType'] == 'RL Agent']

    crosstab, res = researchpy.crosstab(dfExpTrail['hasAvoidPoint'],
                                        dfExpTrail['decisionSteps'],
                                        test="chi-square")

    print(crosstab)

    # Compute the two-way mixed-design ANOVA
    calAnova = 0
    if calAnova:
        import pingouin as pg
        aov = pg.mixed_anova(dv='ShowCommitmentPercent',
                             within='decisionSteps',
                             between='participantsType',
                             subject='name',
                             data=statDF)
        pg.print_table(aov)
예제 #26
0
def _get_prop_vs_prop_dss_score(treatment,
                                con,
                                dfs=None,
                                use_percentage=None,
                                use_labels=None,
                                metric=None,
                                as_percentage=None,
                                is_categorical=None,
                                alternative=None):
    """Compare a proposer metric of ``treatment`` against treatment ``t11a``.

    ``metric`` is applied to the proposer frames of both treatments. The two
    samples are then compared with a chi-square crosstab (categorical
    metrics) or an independent t-test (``rp.ttest`` with ``paired=False``).

    Args:
        treatment: Treatment identifier handed to ``get_prop_resp``.
        con: Unused; accepted for signature compatibility.
        dfs: Unused; accepted for signature compatibility.
        use_percentage: Unused; accepted for signature compatibility.
        use_labels: Unused; accepted for signature compatibility.
        metric: Callable applied to a proposer data frame. Required despite
            the ``None`` default, because its ``__name__`` is printed.
        as_percentage: When truthy, the means and their difference are
            multiplied by 100 before formatting.
        is_categorical: When truthy, use the chi-square crosstab instead of
            the t-test.
        alternative: ``None``/``"two-sided"``, ``"greater"`` or ``"less"``.
            Selects which t-test p-value is reported; ignored for categorical
            metrics.

    Returns:
        dict: formatted means of both treatments, their difference and one
        entry (keyed by the test label) with the statistic, p-value and
        effect size.

    Raises:
        ValueError: if the t-test is requested with an unknown ``alternative``.
    """
    df_prop, _ = get_prop_resp(treatment)
    df_prop_ref, _ = get_prop_resp("t11a")

    print(metric.__name__)

    metric_values = metric(df_prop)
    metric_value = metrics.get_mean(metric_values)

    metric_ref_values = metric(df_prop_ref)
    metric_value_ref = metrics.get_mean(metric_ref_values)

    metric_values = metrics.get_data(metric_values)
    metric_ref_values = metrics.get_data(metric_ref_values)

    dof = 0
    diff = None
    if is_categorical:
        table, res = rp.crosstab(pd.Series(metric_ref_values),
                                 pd.Series(metric_values),
                                 test='chi-square')
        s, p, r = res.results.values

        print(
            "Conclusion: ",
            generate_cat_stat_sentence(np.mean(metric_ref_values),
                                       np.std(metric_ref_values),
                                       np.mean(metric_values),
                                       np.std(metric_values),
                                       s,
                                       p,
                                       dof,
                                       diff=diff,
                                       label1="t11a.dss",
                                       label2=treatment + ".dss"))
        test_label = "(pearson chi2)"
    else:
        # Row of the researchpy result table holding the p-value for each
        # alternative.
        # NOTE(review): rows 4/5 are presumably the "Difference < 0" /
        # "Difference > 0" one-sided p-values of (reference - treatment), so
        # "greater" means treatment > reference -- confirm against the
        # installed researchpy version.
        p_value_rows = {None: 3, "two-sided": 3, "greater": 4, "less": 5}
        if alternative not in p_value_rows:
            raise ValueError(
                f"alternative must be None, 'two-sided', 'greater' or 'less', got {alternative!r}")

        table, res = rp.ttest(pd.Series(metric_ref_values),
                              pd.Series(metric_values),
                              paired=False)
        diff = res.results[0]
        dof = res.results[1]
        s = res.results[2]
        # Report the p-value that matches the alternative named in test_label.
        p = res.results[p_value_rows[alternative]]
        r = res.results[9]

        print(
            "Conclusion: ",
            generate_stat_sentence(np.mean(metric_ref_values),
                                   np.std(metric_ref_values),
                                   np.mean(metric_values),
                                   np.std(metric_values),
                                   s,
                                   p,
                                   dof,
                                   diff=diff,
                                   label1="t11a.dss",
                                   label2=treatment + ".dss"))

        test_label = f"(ttest independent) H0: {'equal' if alternative in {None, 'two-sided'} else alternative}"
    print("RESUME: ", res)
    print("TABLE: ", table)

    if as_percentage:
        res = {
            "Proposer + DSS":
            f'{100 * metric_value:.2f} %',
            "T11A ":
            f'{100 * metric_value_ref:.2f} %',
            "prop:dss - auto prop":
            f'{100 * (metric_value - metric_value_ref):.2f} %',
        }
    else:
        # NOTE(review): the difference keeps a " %" suffix even though nothing
        # is scaled to percent here, and the reference key differs from the
        # percentage branch ("T11A" vs "T11A "). Both are left unchanged to
        # keep the emitted table stable -- confirm whether they are intended.
        res = {
            "Proposer + DSS": f'{metric_value:.2f}',
            "T11A": f'{metric_value_ref:.2f}',
            "prop:dss - auto prop":
            f'{(metric_value - metric_value_ref):.2f} %',
        }
    if is_categorical:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, phi: {r:.3f})"
    else:
        res[test_label] = f"{s:.3f} (p: {p:.3f}, r: {r:.3f})"
    return res
예제 #27
0
plt.ylabel('PDF(IMDB_Class Rating)')
# Count the class ratings once and reuse the result for the plot and the display.
class_rating_counts = imdb_db.class_rating.value_counts()
plt_params_x = class_rating_counts.index
plt_params_y = class_rating_counts.values
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sb.barplot(x=plt_params_x, y=plt_params_y)
class_rating_counts

# In[35]:

# sb.barplot(imdb_db.class_rating.values, hue='color', data=imdb_db)

# In[36]:

import researchpy as rp
from scipy import stats
table, results, expected = rp.crosstab(imdb_db['color'],
                                       imdb_db['class_rating'],
                                       test='chi-square',
                                       expected_freqs=True)
table, results
# Chi-square test of independence between the 'color' and 'class_rating' columns.
# The null hypothesis is independence, so a p-value <= 0.05 indicates that the two
# are associated (not independent). In that case we keep the attribute 'color' and move ahead.

# In[37]:

imdb_db.columns

# In[38]:

# import researchpy as rp
# from scipy import stats
# table, results, expected = rp.crosstab(imdb_db['aspect_ratio'], imdb_db['class_rating'], test= 'chi-square', expected_freqs=True)
# table , results
예제 #28
0
        qual = 'Small'
    elif etaSq < .14:
        qual = 'Medium'
    else:
        qual = 'Large'
    return (qual)


print(" Age 和 Survived 是否有相關性?", judgment_etaSq(etaSq))

# Case 02: numerically check whether Sex and Survived_cate are associated.
contTable = pd.crosstab(df_train['Sex'], df_train['Survived_cate'])
print(contTable)
# Smaller dimension of the contingency table, minus one.
df = min(contTable.shape) - 1
crosstab, res = researchpy.crosstab(
    df_train['Sex'],
    df_train['Survived_cate'],
    test='chi-square',
)
# Row 2 of the 'results' column is printed as the Cramer's V value.
print("Cramer's value is", res.loc[2, 'results'])


## 寫一個副程式判斷相關性的強度
def judgment_CramerV(df, V):
    if df == 1:
        if V < 0.10:
            qual = 'negligible'
        elif V < 0.30:
            qual = 'small'
        elif V < 0.50:
            qual = 'medium'
        else:
            qual = 'large'