def twoway_anova(dv, iv1, iv2, dataset):
    """Build a text report for a two-way ANOVA with interaction.

    Fits the OLS model ``dv ~ iv1 + iv2 + iv1:iv2``, computes a type-II ANOVA
    table and appends, for the first three table rows, an effect-size sentence
    and a significance verdict at the 0.05 level.  Tukey HSD tables for
    ``iv1`` and ``iv2`` close the report.

    Parameters
    ----------
    dv : str
        Column name of the dependent variable.
    iv1, iv2 : str
        Column names of the two factors.
    dataset : pandas.DataFrame
        Data containing the three columns.

    Returns
    -------
    str
        The assembled report.
    """
    formula = '{0} ~ {1} + {2} + {1}:{2}'.format(dv, iv1, iv2)
    regression = ols(formula, data=dataset).fit()
    anova_table = sm.stats.anova_lm(regression, typ=2)

    # The table is indexed by term name, so rows must be picked by position
    # with .iloc; a bare [0] is a label lookup that only works through a
    # deprecated pandas fallback.  Positions 0-2 are read as iv1, iv2 and the
    # interaction, position 3 as the residual row.
    p_values = anova_table['PR(>F)']
    sum_sq = anova_table['sum_sq']
    residual_ss = sum_sq.iloc[3]
    labels = [iv1, iv2, '{0} and {1}'.format(iv1, iv2)]

    parts = [str(anova_table)]

    for position, label in enumerate(labels):
        eta_squared = sum_sq.iloc[position] / (sum_sq.iloc[position] + residual_ss)
        parts.append(
            "\nWe can explain {0:.3f}% of {1} difference due to difference in {2}\n".format(
                eta_squared * 100, dv, label))
    parts.append("\n")

    for position, label in enumerate(labels):
        if p_values.iloc[position] >= 0.05:
            parts.append(
                "\nThere is no significant difference in {0} between different categories of {1}".format(
                    dv, label))
        else:
            parts.append(
                "\nThere is significant difference in {0} between different categories of {1}".format(
                    dv, label))

    parts.append("\nThere are at least two groups different. which two? :\n\n")
    parts.append("\nPost-Hoc Test:-----------------------\n")
    tukey_1 = pairwise_tukeyhsd(endog=dataset[dv], groups=dataset[iv1], alpha=0.05)
    tukey_2 = pairwise_tukeyhsd(endog=dataset[dv], groups=dataset[iv2], alpha=0.05)
    parts.append(str(tukey_1.summary()))
    parts.append("\n-------------------------------------\n")
    parts.append(str(tukey_2.summary()))
    return "".join(parts)
def compare_sentiment_means(data, field, covar=None, standardization="min-max"):
    """Compare the means of the ``field`` sentiment columns with ANOVA and Tukey HSD.

    Columns whose name contains ``field + '_'`` but not ``'_err'`` are
    standardized, melted to long format and tested.  Descriptive statistics,
    the ANOVA table and the Tukey summary are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input data; must contain a ``'<field>_recessie'`` column.  It is never
        modified.
    field : str
        Prefix selecting the sentiment columns.
    covar : str, optional
        Name of a covariate column; when given it is crossed with the
        variable in the model and in the post-hoc grouping.
    standardization : str
        ``"min-max"`` rescales each selected column to [-1, 1], ``"z-score"``
        z-scores them, any other value leaves the data unscaled.

    Returns
    -------
    pandas.DataFrame
        The standardized frame that was tested.
    """
    from statsmodels.stats import anova
    from statsmodels.formula.api import ols
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    from scipy.stats.mstats import zscore

    colnames = [name for name in data.columns
                if field + '_' in name and '_err' not in name]

    if standardization == "min-max":
        col_min = data[colnames].min()
        col_max = data[colnames].max()
        data_std = ((data[colnames] - col_min) / (col_max - col_min)) * 2 - 1
    elif standardization == "z-score":
        data_std = data.copy()
        for col in colnames:
            data_std[col] = zscore(data[col])
    else:
        # Copy: the columns added below must not leak into the caller's frame.
        data_std = data.copy()

    if 'ID' in data.columns:
        # for normal data
        data_std['ID'] = data['ID']
    else:
        # for aggregated data, create placeholder IDs
        data_std['ID'] = list(range(len(data_std)))
    data_std['%s_recessie' % field] = data['%s_recessie' % field]
    if covar:
        data_std[covar] = data[covar]

    if not covar:
        print(data_std.describe())
        melted = data_std.melt(id_vars=['ID'])
        formula = "value ~ variable"
    else:
        print(data_std.groupby(covar).describe())
        melted = data_std.melt(id_vars=['ID', covar])
        formula = "value ~ variable*%s" % covar
    melted = melted.dropna()

    aov = anova.anova_single(ols(formula, melted).fit())
    print()
    print("## ANOVA results for differences in the mean of {field} sentiment".format(field=field))
    print()
    print(aov)

    if covar:
        # One subgroup per (variable, covariate level) combination.
        melted['subgroups'] = ['%s_%s' % (col, level)
                               for col, level in zip(melted['variable'], melted[covar])]
        posthoc = pairwise_tukeyhsd(melted['value'], melted['subgroups'])
    else:
        posthoc = pairwise_tukeyhsd(melted['value'], melted['variable'])
    print()
    print("## Tukey HSD post-hoc tests for mean differences in {field} sentiment".format(field=field))
    print()
    print(posthoc.summary())
    return data_std
def pairwise_tukeyhsd5(list1, list2, list3, list4, list5, filename):
    """Summarize five samples, print a Tukey HSD comparison and save the stats.

    From each input the element at index 2 of its transposed array is used
    (the third column for 2-D input).  The five samples are labelled
    '10', '20', '40', '80' and 'inf.' for the test.  Average and standard
    deviation of every sample are printed and written to ``filename`` as
    ``"<n> <average> <std>"`` lines.
    """
    carv_labels = ('10', '20', '40', '80', 'inf.')
    samples = [np.array(table).T[2]
               for table in (list1, list2, list3, list4, list5)]

    # Long format: one [label, value] record per observation.
    list_all = [[label, float(value)]
                for label, sample in zip(carv_labels, samples)
                for value in sample]

    averages = [np.average(sample) for sample in samples]
    deviations = [np.std(sample) for sample in samples]

    rule = "---------------------------------------------------------------"
    print(rule)
    print(" data average std ")
    print(rule)
    for number, (ave, std) in enumerate(zip(averages, deviations), 1):
        print(" list" + str(number) + " " + str(ave) + " " + str(std) + " ")
    print(rule + "\n")

    list_all = np.rec.array(list_all, dtype=[('carv', '|S4'), ('value', float)])
    print(pairwise_tukeyhsd(list_all['value'], list_all['carv']))

    with open(filename, 'w') as write_file:
        for number, (ave, std) in enumerate(zip(averages, deviations), 1):
            write_file.write(str(number) + " " + str(ave) + " " + str(std) + "\n")
def doTukey(data: np.ndarray, multiComp: MultiComparison) -> None:
    """Do a pairwise comparison, and show the confidence intervals

    Prints the Tukey HSD summary and the group names, then plots every
    pairwise mean difference with its confidence half-width and hands the
    output file name to ``showData``.

    Parameters
    ----------
    data : structured array, containing the input data; the fields
        'StressReduction' and 'Treatment' are read
    multiComp : Result of the 'MultiComparison'-test
    """
    # Show the results of the multicomparison test
    print((multiComp.tukeyhsd().summary()))

    # One Tukey run is enough; it feeds both the error bars and the labels.
    res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])

    # Show the group names
    print((multiComp.groupsunique))

    # Generate a print -------------------
    # One x position per compared pair (was hard-coded to three pairs).
    xvals = np.arange(len(res2.meandiffs))
    errors = np.ravel(np.diff(res2.confint) / 2)

    # Plot them
    plt.plot(xvals, res2.meandiffs, 'o')
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')

    # Put on labels
    pair_labels = \
        multiComp.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    pairs = [':'.join(labels) for labels in pair_labels]
    plt.xticks(xvals, pairs)

    # Format the plot: half a unit of margin on both sides of the points
    xlim = -0.5, len(xvals) - 0.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')

    # Save to outfile, and show the data
    outFile = 'multComp.png'
    showData(outFile)
def TukeyHelper(crime_data, crimes, feature, alpha=0.05):
    """Check assumptions and run Tukey HSD on ``nearest_<feature>`` per crime type.

    Rows whose 'TYPE' is in ``crimes`` are selected and the distance column
    is square-rooted.  A histogram per crime type is shown, the Levene test
    p-value is printed, and the Tukey HSD result is printed.

    Parameters
    ----------
    crime_data : pandas.DataFrame
        Must contain 'TYPE' and ``'nearest_' + feature`` columns; it is not
        modified.
    crimes : iterable of str
        Crime types to compare.
    feature : str
        Suffix of the distance column.
    alpha : float
        Family-wise error rate passed to ``pairwise_tukeyhsd``.
    """
    column = 'nearest_' + feature

    # Copy so the transform below never writes into a slice of the caller's frame.
    df = crime_data.loc[crime_data['TYPE'].isin(crimes)].copy()

    # The distance data is all right skewed as most crimes are often somewhat
    # near city features, so we sqrt them to get normal looking data
    df[column] = np.sqrt(df[column])

    # Plot the histogram of the data to confirm that it is mostly normal
    groups = df.groupby('TYPE')
    for crime in crimes:
        plt.hist(groups.get_group(crime)[column], alpha=0.5, label=crime)
    plt.legend(loc=1)
    plt.show()

    # Run Levene test on each crime distance distribution.  The column is
    # already square-rooted above, so it is used as-is: taking the root again
    # would test different data than the Tukey test below sees.
    # https://stackoverflow.com/questions/26202930/pandas-how-to-apply-scipy-stats-test-on-a-groupby-object
    values_per_group = [col for col_name, col in groups[column]]
    print('P-value for Levene test:', levene(*values_per_group).pvalue)

    # Finally run the Tukey test, with the above data confirming / denying
    # the test's validity
    posthoc = pairwise_tukeyhsd(df[column], df['TYPE'], alpha=alpha)
    print(posthoc)
def tukey_hsd(df, val_col, group_col):
    """Run Tukey HSD on ``val_col`` grouped by ``group_col`` and tabulate it.

    Returns a square DataFrame indexed and labelled by group name.  Each
    off-diagonal cell holds the value found in column index 3 of the Tukey
    summary row for that pair; the diagonal stays 0.
    NOTE(review): which statistic sits at index 3 depends on the installed
    statsmodels version - confirm before interpreting the matrix.
    """
    # Reshape the data into the flat value/label arrays the test expects.
    # The groupby is evaluated once instead of once per key.
    row_labels_by_group = df.groupby(group_col).groups
    val_data = []
    group_data = []
    for key, row_labels in row_labels_by_group.items():
        d = df.loc[row_labels, val_col]
        val_data.append(d)
        group_data.append([key] * len(d))
    result = sm.pairwise_tukeyhsd(np.concatenate(val_data),
                                  np.concatenate(group_data))

    # Convert the result to a DataFrame.
    # np.str was removed from NumPy; the builtin str is the same dtype.
    groups = np.array(result.groupsunique, dtype=str)
    groups_len = len(groups)
    vs = pd.DataFrame(np.zeros((groups_len, groups_len)))
    for a in result.summary()[1:]:
        a0 = str(a[0])
        a1 = str(a[1])
        a0i = np.where(groups == a0)[0][0]
        a1i = np.where(groups == a1)[0][0]
        # Direct positional writes: chained ``vs[col].loc[row] = ...`` may
        # assign into a temporary and is a no-op under pandas copy-on-write.
        vs.iat[a1i, a0i] = a[3].data
        vs.iat[a0i, a1i] = a[3].data
    vs.index, vs.columns = groups, groups
    return vs
def multiple_test(data, X, Y, alpha=0.05):
    """Run a Tukey HSD multiple comparison and package the table for display.

    :param data: DataFrame holding the columns named by ``X[0]`` and ``Y[0]``
    :param X: sequence whose first item names the grouping column
    :param Y: sequence whose first item names the value column
    :param alpha: significance level forwarded to ``pairwise_tukeyhsd``
    :return: dict with the translated header ("col"), the table rows
        ("data"), a title and a remark
    """
    log.info('---------------------multiple test------------------')

    confidence_pct = (1 - alpha) * 100
    tukey = pairwise_tukeyhsd(data[Y[0]], data[X[0]], alpha=alpha)
    table = tukey.summary().data
    header = table[0]
    rows = table[1:]

    # The last cell of every data row (the reject flag) is returned as text.
    for row in rows:
        row[-1] = str(row[-1])

    translated = {
        "group1": "组1",
        "group2": "组2",
        "meandiff": "平均值差值2-1",
        "p-adj": "显著性",
        "lower": "{}%置信区间下限".format(confidence_pct),
        "upper": "{}%置信区间上限".format(confidence_pct),
        "reject": "拒绝原假设",
    }
    result = {}
    result["col"] = [translated[name] for name in header]
    result["data"] = rows
    result["title"] = "多重比较"
    result["remarks"] = "注:多重比较方法基于Tukey HSD。拒绝原假设,False表示不拒绝原假设,True表示拒绝原假设。"
    return result
def tukey_pairwise_test(self, feat, group):
    """
    Perform a Tukey pairwise test on an image colour feature

    Parameters:
    -----------
    feat: str
        A string that represents a colour feature; must be a member of
        COLOUR_FEATS
    group: str
        A string that specifies which rock type tier is used in the pairwise
        test; If group="Type", individual such as HEM will be used in test;
        If group="CombinedType", tier I rock type such as ORE will be used
        in test

    Returns:
    --------
    stat_tbl: pd.DataFrame
        A table that stores the result of the Tukey pairwise test

    Raises:
    -------
    TypeError
        If feat is not a string
    ValueError
        If feat is not in COLOUR_FEATS
    """
    if not isinstance(feat, str):
        raise TypeError("feat should be of string type")
    if feat not in COLOUR_FEATS:
        # Adjacent literals instead of a backslash continuation inside the
        # string, which embedded the continuation and indentation in the text.
        raise ValueError(
            "feat should be chosen from 'SkewnessBlue', 'KurtosisBlue', "
            "'MeanPixelBlue', 'SkewnessGreen', 'KurtosisGreen', "
            "'MeanPixelGreen', 'SkewnessRed', 'KurtosisRed', 'MeanPixelRed'"
        )
    tukey = pairwise_tukeyhsd(
        endog=self.rtbl[feat],  # Data
        groups=self.rtbl[group],  # Groups
        alpha=0.05)  # Significance level
    summary = tukey.summary()
    # First summary row is the header, the rest are the pairwise results.
    stat_tbl = pd.DataFrame(summary.data[1:], columns=summary.data[0])
    return stat_tbl
def oneway_anova(df, x, y, W, H, use_hsd=True, plot=True):
    """One-way ANOVA of column ``y`` across the groups of column ``x``.

    Prints the per-group means, the Levene verdict, the ANOVA p-value and,
    optionally, the Tukey HSD summary.

    Parameters
    ----------
    df : pandas.DataFrame
    x : str
        Grouping column.
    y : str
        Value column.
    W, H : number
        Figure width and height, used only when ``plot`` is true.
    use_hsd : bool
        Print the Tukey HSD post-hoc summary.
    plot : bool
        Draw a violin plot of ``y`` by ``x``.

    Returns
    -------
    pandas.DataFrame
        Mean of ``y`` per group of ``x``.
    """
    # mean_compare_table
    mean_compare_table = df.groupby(x, as_index=False)[[y]].mean()
    print(mean_compare_table)

    if plot:
        plt.figure(figsize=(W, H))
        # x and y are passed by keyword; recent seaborn releases no longer
        # accept them positionally.
        sns.violinplot(x=x, y=y, data=df)

    # One list of observations per distinct group value
    groups = [df.loc[df[x] == val, y].tolist() for val in set(df[x])]

    # anova: the Levene outcome is only reported, f_oneway is used either way
    levene_test = levene(*groups)
    if levene_test.pvalue >= 0.05:
        print("方差齐")
    else:
        print("方差不齐")
    f_value, p_value = f_oneway(*groups)

    # conclusion
    print(p_value)

    if use_hsd:
        hsd = pairwise_tukeyhsd(endog=df[y], groups=df[x], alpha=0.05)
        print(hsd.summary())
    return mean_compare_table
def TukeyHSD(self, listPops, confidence = 0.95):
    """
    Post-hoc Tukey HSD test.

    Flattens the fluorescence values of every population, labels them with
    the population's position in ``listPops`` and returns a DataFrame with
    one row per population pair: both names, the mean difference and whether
    the null hypothesis is rejected at ``alpha = 1 - confidence``.
    """
    if type(listPops) is not list:
        listPops = [listPops]

    ## Gathering all data required: column 0 = value, column 1 = population id
    chunks = [np.zeros((0, 2))]
    for popid, pop in enumerate(listPops):
        p = np.array(self.getFluorescence(pop))
        count = p.shape[0] * p.shape[1]
        block = np.zeros((count, 2))
        block[:, 0] = np.reshape(p, (count))
        block[:, 1] = popid
        chunks.append(block)
    Paux = np.concatenate(chunks, axis=0)

    ## Evaluating Tukey's HSD
    tukey_res = pairwise_tukeyhsd(Paux[:, 0], Paux[:, 1], alpha=1 - confidence)

    ## Creating a dataframe with the results
    Groups = np.array(list(itertools.combinations(listPops, 2)))
    return pd.DataFrame(data={
        'Group 1': Groups[:, 0],
        'Group 2': Groups[:, 1],
        'Mean diff': tukey_res.meandiffs,
        'Reject H0?': tukey_res.reject,
    })
def oneway_anova(dv, iv, dataset):
    """Build a text report for a one-way ANOVA of ``dv`` by ``iv``.

    Fits the OLS model ``dv ~ iv``, computes a type-II ANOVA table and
    appends an effect-size sentence, a significance verdict at the 0.05
    level and the Tukey HSD table.

    Parameters
    ----------
    dv : str
        Column name of the dependent variable.
    iv : str
        Column name of the factor.
    dataset : pandas.DataFrame
        Data containing both columns.

    Returns
    -------
    str
        The assembled report.
    """
    formula = '{0} ~ {1}'.format(dv, iv)
    regression = ols(formula, data=dataset).fit()
    anova_table = sm.stats.anova_lm(regression, typ=2)

    # The table is indexed by term name, so rows are picked by position with
    # .iloc; a bare [0] is a label lookup that only works through a deprecated
    # pandas fallback.  Position 0 is read as the factor, position 1 as the
    # residual row.
    p = anova_table['PR(>F)'].iloc[0]
    factor_ss = anova_table['sum_sq'].iloc[0]
    residual_ss = anova_table['sum_sq'].iloc[1]
    etaSquared = factor_ss / (factor_ss + residual_ss)

    parts = [str(anova_table)]
    parts.append(
        "\nWe can explain {0:.3f}% of {1} difference due to difference in {2}\n\n".format(
            etaSquared * 100, dv, iv))
    if p >= 0.05:
        parts.append(
            "\nThere is no significant difference in {0} between different categories of {1} ".format(
                dv, iv))
    else:
        parts.append(
            "There is significant difference in {0} between different categories of {1}".format(
                dv, iv))
    parts.append("\nThere are at least two groups different. which two? :\n\n")
    parts.append("\nPost-Hoc Test:-----------------------\n")
    tukey = pairwise_tukeyhsd(endog=dataset[dv], groups=dataset[iv], alpha=0.05)
    parts.append(str(tukey.summary()))
    return "".join(parts)
def get_dist_df(self, svddf):
    """Turn Tukey HSD results between channels into a distance table.

    Runs ``pairwise_tukeyhsd`` on ``svddf`` (without the channel and sigma
    columns) grouped by channel.  Mean differences of pairs whose null
    hypothesis was not rejected are set to 0; per group pair the mean
    absolute difference is multiplied by the first sigma value and returned
    in the DIST column.
    """
    frame_cols = DistMetricCalculator.DataFrameCols
    CONSTS = frame_cols.DistDF
    sigma = svddf[CONSTS.SIGMA][0]

    values = svddf.drop([frame_cols.CHANNEL, CONSTS.SIGMA], axis=1)
    tukey = pairwise_tukeyhsd(values, svddf[frame_cols.CHANNEL])

    # Row 0 of the results table is the header, hence the slice.
    df = pd.DataFrame(
        tukey._results_table.data,
        columns=DistMetricCalculator.DataFrameCols.DistDF.ALL_COLS).iloc[1:, :]
    for group_col in (CONSTS.G1, CONSTS.G2):
        df[group_col] = df[group_col].astype("category")
    reject_text = df[CONSTS.REJECT].astype(str).str.strip(" ")
    df[CONSTS.REJECT] = reject_text == "True"

    # Pairs that are not significantly different contribute no distance.
    not_rejected = df[CONSTS.REJECT] == False
    df.loc[not_rejected, CONSTS.MEAN_DIFF] = 0

    def scaled_mean_abs_diff(pair_rows):
        return np.abs(pair_rows[CONSTS.MEAN_DIFF]).mean() * sigma

    distfinaldf = df.groupby([CONSTS.G1, CONSTS.G2]).apply(scaled_mean_abs_diff)
    return distfinaldf.reset_index(name=CONSTS.DIST)
def test_continuous_feat(self, cluster_dfs, categories):
    """Print ANOVA and Tukey HSD results per category across clusters.

    For every category the matching column of each cluster is joined into
    one wide frame (one column per cluster), melted to long form and tested.
    The wide frame of the last category is stored in ``self.stats_table``.
    """
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    import pandas as pd

    for j in categories:
        for position, cluster_name in enumerate(cluster_dfs.keys()):
            column = pd.DataFrame(cluster_dfs[cluster_name][j])
            column.columns = [cluster_name]
            if position:
                int_df = int_df.join(column)
            else:
                # the first cluster starts the wide frame
                int_df = column

        int_df_unpiv = int_df.melt().dropna()
        int_df_unpiv.columns = ['cluster', 'value']

        mod = ols('value ~ cluster', data=int_df_unpiv).fit()
        aov_table = sm.stats.anova_lm(mod, typ=2)
        print('\n \n', j)
        print(aov_table, '\n')
        tukey = pairwise_tukeyhsd(int_df_unpiv['value'],
                                  int_df_unpiv['cluster'])
        print(tukey)
    self.stats_table = int_df
def test_tukeyRangeTest_pResult(self):
    """p-values from tukey_range_test match those derived via statsmodels."""
    samples = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]]
    results = tukey_range_test(samples[0], samples[1], samples[2])

    pooled = samples[0] + samples[1] + samples[2]
    group_ids = [0] * 5 + [1] * 5 + [2] * 5
    model = pairwise_tukeyhsd(pooled, groups=group_ids)

    # Reference p-values from the studentized range distribution.
    studentized = np.abs(model.meandiffs / model.std_pairs)
    p_vals = psturng(studentized, len(model.groupsunique), model.df_total)

    for index in (0, 1, 2):
        assert pytest.approx(p_vals[index]) == results[index][2]
def ANOVA_1Way(dataframe, cat_cols, distr_cols, savepath=None):
    """One-way ANOVA plus Tukey HSD for every (categorical, numeric) column pair.

    Parameters
    ----------
    dataframe : pandas.DataFrame
    cat_cols : iterable of str
        Categorical columns defining the groups.
    distr_cols : iterable of str
        Numeric columns to compare across groups.
    savepath : str, optional
        When given, the summary table is written there as LaTeX and each
        Tukey table is written next to it as
        ``<distr_col>_<cat_col>_tukeyhsd.tex`` (spaces in the whole path are
        replaced by underscores).

    Returns
    -------
    pandas.DataFrame
        Rows are categorical columns, columns are numeric columns, cells are
        ``"F: <stat>, p: <value>"`` strings.
    """
    corrs = pd.DataFrame()
    for distr_col in distr_cols:
        for cat_col in cat_cols:
            values = dataframe[cat_col].dropna().unique().tolist()
            # DCut is a project helper; presumably it selects the rows where
            # cat_col equals value - confirm against its definition.
            distr = [
                np.array(DCut(dataframe, [cat_col], ['eq'], [value])[distr_col])
                for value in values
            ]
            fstat, pval = f_oneway(*distr)

            f_text = 'F: ' + '{:,.3f}'.format(fstat)
            # Cap the display of very small p-values.  The threshold was
            # written as 1e6, which every p-value satisfies, so the exact
            # value was never shown.
            if pval < 1e-6:
                p_text = r'p: 1e-6'
            else:
                p_text = r'p: ' + '{:.2e}'.format(pval)
            corrs.at[cat_col, distr_col] = ', '.join([f_text, p_text])

            posthoc = multi.pairwise_tukeyhsd(dataframe[distr_col],
                                              dataframe[cat_col],
                                              alpha=.05)
            posthoc = pd.DataFrame(posthoc._results_table.data[1:],
                                   columns=posthoc._results_table.data[0])
            if savepath is not None:
                # os.path.join keeps a bare file name relative; prefixing '/'
                # to an empty dirname pointed at the filesystem root.
                savename = os.path.join(
                    os.path.dirname(savepath),
                    distr_col + '_' + cat_col + '_tukeyhsd.tex')
                posthoc.to_latex(savename.replace(' ', '_'),
                                 index=False,
                                 encoding='utf-8')
    if savepath is not None:
        corrs.to_latex(savepath, encoding='utf-8')
    return corrs
def posthocTukey(data, gene, sampleList):
    """Print and return the Tukey HSD result for one gene.

    The values and group labels come from ``setDataPosthoc``; the test is
    run at alpha = 0.05.
    """
    expression, labels = setDataPosthoc(data, gene, sampleList)
    heading = 'Summary post-hoc test Tukey of ' + gene
    print(heading)
    result = pairwise_tukeyhsd(endog=expression, groups=labels, alpha=0.05)
    print(result)
    print()
    return result
def tukey_hsd(ind, *args):
    """Print the Tukey HSD comparison of the samples in ``args``.

    ``ind[k]`` is the label given to every observation of ``args[k]``.
    """
    data_arr = np.hstack(args)

    # Grow the label array sample by sample, starting from an empty array.
    ind_arr = np.array([])
    for position, sample in enumerate(args):
        run = np.repeat(ind[position], len(sample))
        ind_arr = np.append(ind_arr, run)

    result = pairwise_tukeyhsd(data_arr, ind_arr)
    print(result)
def stats_test_for_trend(self, mode, gain, param):
    """Test for a trend across consecutive 75-sample chunks of one series.

    The series ``self.<mode><gain>[param]`` is split into chunks of 75
    values; Shapiro-Wilk, Kruskal-Wallis and Tukey HSD results over those
    chunks are printed.

    Parameters
    ----------
    mode : str
        One of "lin", "sqrt", "quad".
    gain : int
        10 or 20.
    param : key
        Key selecting the series inside the chosen attribute.

    Raises
    ------
    KeyError
        If ``mode + str(gain)`` is not one of the six known series.
    """
    chunk_size = 75
    series_by_name = {
        "lin10": self.lin10,
        "lin20": self.lin20,
        "sqrt10": self.sqrt10,
        "sqrt20": self.sqrt20,
        "quad10": self.quad10,
        "quad20": self.quad20,
    }
    series = series_by_name[mode + str(gain)][param]

    # Every series is chunked by its own length (sqrt20 used to borrow the
    # length of sqrt10).
    # NOTE(review): the range stop of len - 75 skips the final chunk when the
    # length is an exact multiple of 75; kept as written - confirm intent.
    tgt = [
        series[start:start + chunk_size]
        for start in range(0, len(series) - chunk_size, chunk_size)
    ]

    print("shapiro-wilk result, p-value: ", st.shapiro(tgt)[1])

    data = np.hstack(tgt)
    # One label per chunk, derived from the actual chunk count instead of a
    # hard-coded 9 so data and labels always have the same length.
    group = np.repeat(np.arange(len(tgt)), chunk_size)

    res = st.kruskal(*tgt)
    print("kruskal result, p-value: ", res)
    res = pairwise_tukeyhsd(data, group)
    print(res)
def add_stats(ax, times, names, ylim):
    """Annotate ``ax`` with significance lines between the groups in ``times``.

    Every group is z-scored and tested for normality (KS test).  If all
    groups look normal, a one-way ANOVA is followed by pairwise Tukey HSD
    tests; otherwise Kruskal-Wallis is followed by pairwise Mann-Whitney
    tests.  Each significant pair is drawn with ``plot_sig_line``.

    Parameters
    ----------
    ax : axes object handed to ``plot_sig_line``
    times : sequence of array-likes, one per group
    names : sequence of group names, parallel to ``times``
    ylim : (low, high) or None
        Axis limits used to place the lines; when None the data range is used.

    Returns
    -------
    (ax, dy)
        The axes and the vertical step computed for the lines.
    """
    to_plot = []

    # Find y coordinate to plot at:
    if ylim is None:
        # Per-group extrema so groups of different length work too.
        y_ = max(np.max(tm) for tm in times)
        lowest = min(np.min(tm) for tm in times)
        # The multiplication sign was missing here (".05(...)" called a float).
        dy = .05 * (y_ - lowest)
    else:
        y_ = ylim[1] - (.025 * (ylim[1] - ylim[0]))
        dy = .05 * (ylim[1] - ylim[0])

    # Test all distributions for normality:
    abnorm_cnt = 0
    for tt, tm_arr in enumerate(times):
        print('%s %s' % (names[tt], len(tm_arr)))
        samples = np.array(tm_arr)
        zscore_tm_arr = (samples - np.mean(samples)) / np.std(samples)
        x, p = scipy.stats.kstest(zscore_tm_arr, 'norm')
        if p < 0.05:
            # Not normal (two-sided test)
            abnorm_cnt += 1
            print('non-normal:  %s' % (names[tt],))

    if abnorm_cnt == 0:
        # All normal distributions: ANOVA + Tukey's HSD
        F, p = scipy.stats.f_oneway(*times)
        if p < 0.05:
            print('passed anova!')
            for i, tm_i in enumerate(times):
                for k, tm_k in enumerate(times[i + 1:], i + 1):
                    X = np.hstack((np.array(tm_i), np.array(tm_k)))
                    Y = np.hstack((np.zeros((len(tm_i))), np.ones((len(tm_k)))))
                    res2 = pairwise_tukeyhsd(X, Y)
                    # Two groups give exactly one pair, so reject has one entry.
                    if res2.reject:
                        entry = [i, k, res2.reject, 'norm']
                        to_plot.append(entry)
                        print('sig!  %s %s %s' % (entry, names[i], names[k]))
    else:
        # If any is not normal, use KW + MW:
        H, p = scipy.stats.mstats.kruskalwallis(*times)
        if p < 0.05:
            # Passed group test, continue w/ Mann Whitney test:
            for i, tm_i in enumerate(times):
                for k, tm_k in enumerate(times[i + 1:], i + 1):
                    # NOTE(review): the doubling assumes mannwhitneyu returns
                    # a one-sided p-value - confirm for the installed SciPy.
                    u, p_onesided = scipy.stats.mannwhitneyu(tm_i, tm_k)
                    if (p_onesided * 2.) < 0.05:
                        entry = [i, k, p_onesided * 2., 'nonnorm']
                        to_plot.append(entry)
                        print('sig!  %s' % (entry,))

    for plt_line in to_plot:
        # ax, x1, x2, y, dy, pvalue, kind
        ax = plot_sig_line(ax, plt_line[0], plt_line[1], y_, dy,
                           plt_line[2], plt_line[3])
    return ax, dy
def getOWANOVAmultiComp(data, labels, verbose=False):
    """Return the Tukey HSD p-values for all pairs of groups in ``data``.

    ``data`` is a sequence of samples and ``labels[j]`` names sample ``j``.
    With ``verbose`` the Tukey summary table is printed as well.
    """
    # Repeat every label once per observation of its sample.
    per_sample_labels = []
    for position, sample in enumerate(data):
        per_sample_labels.append([labels[position]] * len(sample))
    tlabels = np.concatenate(per_sample_labels)

    res = pairwise_tukeyhsd(np.concatenate(data), tlabels)
    if verbose:
        print(res.summary())

    studentized = np.abs(res.meandiffs / res.std_pairs)
    return psturng(studentized, len(res.groupsunique), res.df_total)
def sent_ttest(filepath1, filelist):
    """Compare sentiment polarity between news text and up to three other corpora.

    Every CSV row is scored with TextBlob.  The one-way ANOVA result across
    the groups and the Tukey HSD comparison (alpha = 0.001) are printed.

    Parameters
    ----------
    filepath1 : str
        CSV whose first two columns are concatenated into one 'news' sentence.
    filelist : sequence of str
        CSV files for the groups 'academia', 'companies' and 'defense', in
        that order; the first column of each row is the sentence.
    """
    group = ['academia', 'companies', 'defense']
    columns = ['group', 'polarity']

    # Context managers make sure each file is closed again.
    with open(filepath1, 'r') as file1:
        sent = [('news', TextBlob(l[0] + l[1]).sentiment.polarity)
                for l in csv.reader(file1)]

    sentarray = [[], [], []]
    for counter, item in enumerate(filelist):
        with open(item, 'r') as readfile:
            for row in csv.reader(readfile):
                polarity = TextBlob(row[0]).sentiment.polarity
                sentarray[counter].append((group[counter], polarity))

    frames = [pd.DataFrame(sent, columns=columns)]
    frames.extend(pd.DataFrame(rows, columns=columns) for rows in sentarray)

    # run anova test for difference of group of means.  Only the numeric
    # polarity column is tested; the (label, value) tuples that were passed
    # before are not valid samples.
    print(scipy.stats.f_oneway(*[frame['polarity'] for frame in frames]))

    stacked = pd.concat(frames)
    MultiComp = pairwise_tukeyhsd(endog=stacked['polarity'],
                                  groups=stacked['group'],
                                  alpha=0.001)
    print(MultiComp)
def main():
    """Read data.csv, print the Tukey HSD result of its columns and plot it."""
    wide = pd.read_csv('data.csv')
    # melt turns every column into one group of the 'variable' factor
    long_form = pd.melt(wide)
    values = long_form['value']
    labels = long_form['variable']
    tukey = pairwise_tukeyhsd(values, labels)
    print(tukey)
    tukey.plot_simultaneous()
    plt.show()
def createGroups(ds,fieldName):
    """Remap the levels of ``fieldName`` using Tukey HSD results, then dummify.

    The test compares the module-level ``targetField`` column across the
    levels of ``fieldName``.  Its summary table is handed to
    ``getSimillarGroups``, whose result is applied with ``Series.map`` to the
    column in place, and the frame is passed through ``dummifyField``.
    """
    tukey = pairwise_tukeyhsd(ds[targetField], ds[fieldName])
    table = tukey.summary()
    # first row of the table data is the header
    pair_frame = pd.DataFrame(table.data[1:], columns=table.data[0])
    level_map = getSimillarGroups(pair_frame)
    ds[fieldName] = ds[fieldName].map(level_map)
    return dummifyField(ds, fieldName)
def OnewayAnova(datas):
    """Compare 'game_time' across the four job categories.

    Checks normality (Shapiro-Wilk) and equal variances (Levene), saves a
    pie chart of the job distribution to ``static/images/ftest.png`` next to
    this module, and runs the matching test.

    Parameters
    ----------
    datas : anything ``pandas.DataFrame`` accepts
        Must provide 'job' and 'game_time' columns.

    Returns
    -------
    float or int
        The p-value of the last test that was run, or 0 when the normality
        check raised.
    """
    df = pd.DataFrame(datas)
    cdir = os.path.dirname(os.path.realpath(__file__))

    job_names = ['화이트칼라', '블루칼라', '학생', '기타']
    job1, job2, job3, job4 = [df[df['job'] == name]['game_time']
                              for name in job_names]

    # Normality check; best effort, any failure ends the analysis with 0
    try:
        normality_p = [stats.shapiro(sample)[1]
                       for sample in (job1, job2, job3, job4)]
    except Exception as e:
        print('error ', e)
        return 0

    # Visualization
    plt.rc('font', family='malgun gothic')  # font able to render Korean text
    plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign readable
    explode = (0, 0.1, 0, 0)
    fig2 = plt.figure()
    plt.title('직업별 인원 비율', fontsize=14)
    # Labels come from the same grouped result as the wedge sizes, so each
    # wedge gets its own name (unique() returns order of appearance, which
    # need not match the sorted groupby order).
    job_counts = df.groupby('job').size()
    plt.pie(job_counts,
            labels=job_counts.index,
            colors=["pink", "coral", "lightblue", "yellowgreen"],
            autopct='%0.1f%%',
            explode=explode)
    # os.path.join replaces a hand-written backslash path that contained an
    # invalid escape sequence and only worked on Windows.
    fig2.savefig(os.path.join(cdir, 'static', 'images', 'ftest.png'))
    plt.close(fig2)

    # Equal-variance check
    lev = stats.levene(job1, job2, job3, job4).pvalue
    print(lev)

    if all(p_value > 0.05 for p_value in normality_p):
        print('정규성 만족')
        if lev > 0.05:
            _, pv = stats.f_oneway(job1, job2, job3, job4)
            print('등분산성 만족 ', pv)
            turk = pairwise_tukeyhsd(df.game_time, df.job)
            print('사후검정\n', turk)
        else:
            # NOTE(review): the value returned for unequal variances is the
            # trimmed-mean Levene p-value, as before - confirm this is the
            # intended statistic.
            pv = stats.levene(job1, job2, job3, job4,
                              center='trimmed').pvalue
            print('등분산성 불만족')
    else:
        _, pv = stats.kruskal(job1, job2, job3, job4)
        print('정규성 불만족 ', pv)
    return pv
def PH(self, data):
    """Plot Tukey HSD group intervals of every feature column against RESULT.

    All columns except "Div", "HomeTeam", "AwayTeam" and "RESULT" are tested
    one by one, grouped by the "RESULT" column; one figure is shown per
    column, titled with the column name.
    """
    features = data.drop(columns=["Div", "HomeTeam", "AwayTeam", "RESULT"])
    for column in features.columns:
        # ``column`` is a label, so the data is selected by name; it was
        # previously used as an integer position for iloc and columns[].
        posthoc = pairwise_tukeyhsd(data[column], data["RESULT"], alpha=0.05)
        # plot_simultaneous opens its own figure; give it the size directly
        # instead of creating an empty extra figure first.
        posthoc.plot_simultaneous(figsize=(10, 10))
        plt.title("{}".format(column))
        plt.show()
def statistics(scores):
    """Run Tukey HSD across the columns of ``scores`` and save the summary.

    ``scores`` is melted so that every column becomes one group.  The summary
    table is written as text to ``posthoc.txt`` in the working directory.
    """
    melted_scores = pd.melt(scores)
    posthoc = pairwise_tukeyhsd(melted_scores['value'],
                                melted_scores['variable'],
                                alpha=0.05)

    # Save post hoc results in a text file; the context manager closes the
    # handle even if rendering or writing fails.
    with open('posthoc.txt', 'w') as out_file:
        out_file.write(posthoc.summary().as_text())
def tukey_test(num, cat):
    """Display the Tukey HSD summary of ``num`` grouped by ``cat``.

    Both arguments need a ``name`` attribute, which is used in the printed
    heading.  The test runs at alpha = 0.05.
    """
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    result = pairwise_tukeyhsd(endog=num, groups=cat, alpha=0.05)
    table = result.summary()

    heading_start = "Tukey's Test Result between " + num.name
    heading_end = ' & ' + cat.name
    print(heading_start, heading_end, ':')
    display(table)
def tukeyTest(data, groups, alpha=0.05):
    '''Run Tukey HSD on data by groups, print summary and p-values, return the result.'''
    res = pairwise_tukeyhsd(data, groups, alpha)
    print('Summary of test:\n', res)

    # p-values from the studentized range distribution
    studentized = np.abs(res.meandiffs / res.std_pairs)
    group_count = len(res.groupsunique)
    pVal = psturng(studentized, group_count, res.df_total)
    print('p values of all pair-wise tests:\n', pVal)
    return res
def ANOVA(df, target_var_name, group_var_name):
    """Print the type-II ANOVA table and Tukey HSD result of target by group.

    Only the two named columns are used; the target is cast to float and
    rows with missing values are dropped before fitting.
    """
    formula = "~".join([target_var_name, group_var_name])

    # Group column first, then the target.
    work_df = pd.DataFrame({
        group_var_name: df[group_var_name],
        target_var_name: df[target_var_name].astype(float),
    })
    work_df = work_df.dropna(axis=0)

    fitted = ols(formula, data=work_df).fit()
    print(sm.stats.anova_lm(fitted, typ=2))

    tukey = pairwise_tukeyhsd(work_df[target_var_name],
                              work_df[group_var_name])
    print(tukey)
def pairwise_tukey(reduced_dataframe, groups, p_value_threshold=0.05):
    """Run a Tukey HSD test across the columns of a wide dataframe.

    The columns listed in ``groups`` are stacked into long form (group name
    in 'Pop', value in 'Prob') and compared pairwise; the statsmodels result
    object is returned.
    """
    selected = reduced_dataframe[groups]
    stacked = selected.stack()
    long_form = stacked.reset_index()
    long_form = long_form.rename(columns={"level_1": 'Pop', 0: 'Prob'})

    return pairwise_tukeyhsd(
        endog=long_form["Prob"],    # Data
        groups=long_form["Pop"],    # Groups
        alpha=p_value_threshold,    # Significance level
    )
def ANOVA(df):
    """Print the type-II ANOVA table and Tukey HSD result for a two-column frame.

    The first column of ``df`` is the target, the second the grouping
    variable.  The target is cast to float and rows with missing values are
    dropped before fitting.
    """
    target = df.columns[0]
    group_var_name = df.columns[1]
    cmd = target + "~" + group_var_name

    work_df = pd.DataFrame()
    work_df[group_var_name] = df[group_var_name]
    # The target column has to be in the working frame too; it was never
    # copied over, so the model fit and the Tukey call could not find it.
    work_df[target] = df[target].astype(float)
    work_df = work_df.dropna(axis=0)

    mod = ols(cmd, data=work_df).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)
    print(aov_table)

    res2 = pairwise_tukeyhsd(work_df[target], work_df[group_var_name])
    print(res2)
def stats_test(self, gain, param):
    """Print the Tukey HSD comparison of the linear, sqrt and quad series.

    Parameters
    ----------
    gain : int
        10 or 20; selects the ``*10`` or ``*20`` attributes.
    param : key
        Key selecting the series inside each attribute.

    Raises
    ------
    ValueError
        If ``gain`` is neither 10 nor 20.
    """
    if gain == 10:
        samples = (self.lin10[param], self.sqrt10[param], self.quad10[param])
    elif gain == 20:
        samples = (self.lin20[param], self.sqrt20[param], self.quad20[param])
    else:
        # Previously this fell through to an unbound-local error.
        raise ValueError("gain must be 10 or 20, got {0!r}".format(gain))

    data = np.hstack(samples)
    # Label every observation with its own series, using each series' real
    # length rather than assuming all three match the linear one.
    group = np.repeat(["linear", "sqrt", "quad"],
                      [len(sample) for sample in samples])
    res = pairwise_tukeyhsd(data, group)
    print(res)
def doTukey(data, multiComp):
    '''Do a pairwise comparison, and show the confidence intervals

    Prints the Tukey HSD summary and the group names, then plots every
    pairwise mean difference with its confidence half-width and hands the
    output file name to C2_8_mystyle.printout_plain.

    data : structured array with the fields 'StressReduction' and 'Treatment'
    multiComp : result of the MultiComparison test
    '''
    print((multiComp.tukeyhsd().summary()))

    # One Tukey run is enough; it feeds both the error bars and the labels.
    res2 = pairwise_tukeyhsd(data['StressReduction'], data['Treatment'])

    # Show the group names
    print((multiComp.groupsunique))

    # Generate a print -------------------
    # One x position per compared pair (was hard-coded to three pairs).
    xvals = np.arange(len(res2.meandiffs))
    errors = np.ravel(np.diff(res2.confint) / 2)

    # Plot them.  fmt='o' draws markers only; 'o' is not a valid line style,
    # so it cannot be passed as ls.
    plt.plot(xvals, res2.meandiffs, 'o')
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, fmt='o')

    # Put on labels: one "a:b" string per pair instead of a 2-D label array
    pair_labels = multiComp.groupsunique[np.column_stack(res2._multicomp.pairindices)]
    pairs = [':'.join(labels) for labels in pair_labels]
    plt.xticks(xvals, pairs)

    # Format the plot: half a unit of margin on both sides of the points
    xlim = -0.5, len(xvals) - 0.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')

    # Save to outfile, and show the data
    outFile = 'multComp.png'
    C2_8_mystyle.printout_plain(outFile)
def multicomp(data, algos):
    """Return the Tukey HSD result comparing the samples of several algorithms.

    ``data[a]`` holds the observations of algorithm ``a``; each algorithm is
    labelled with its position in ``algos``.  The test runs at alpha = 0.05.
    """
    labels = [position
              for position, algo in enumerate(algos)
              for _ in range(len(data[algo]))]
    observations = [value for algo in algos for value in data[algo]]

    return pairwise_tukeyhsd(np.array(observations), np.array(labels), 0.05)
def get_pairwise_comparison_data(df, independent_variables_names, dependent_variables_names, significance_cutoff=0.05, NUM_GROUPS_CUTOFF=15):
    '''
    Tukey HSD pairwise comparison of one dependent variable across the groups
    of one independent variable.

    df - data frame holding both columns
    independent_variables_names - list of names; only the first is used
    dependent_variables_names - list of names; only the first is used
    significance_cutoff - alpha for the test and threshold for 'Distinct'
    NUM_GROUPS_CUTOFF - above this number of groups nothing is computed

    Returns None when there are too many groups, otherwise a dict with
    'column_headers' and 'rows'.
    '''
    group_column = independent_variables_names[0]
    value_column = dependent_variables_names[0]

    # Only return pairwise comparison data if number of groups < THRESHOLD
    if len(get_unique(df[group_column])) > NUM_GROUPS_CUTOFF:
        return None

    hsd_result = pairwise_tukeyhsd(df[value_column], df[group_column], alpha=significance_cutoff)
    pair_rows = hsd_result.summary().data[1:]

    st_range = np.abs(hsd_result.meandiffs) / hsd_result.std_pairs
    p_values = psturng(st_range, len(hsd_result.groupsunique), hsd_result.df_total)
    # psturng may hand back a bare float instead of an array
    scalar_p = isinstance(p_values, float)

    hsd_headers = [
        'Group 1',
        'Group 2',
        'Group Mean Difference (2 - 1)',
        'Lower Bound',
        'Upper Bound',
        'p-value',
        'Distinct (p < %s)' % significance_cutoff
    ]

    hsd_data = []
    for index, pair_row in enumerate(pair_rows):
        if scalar_p:
            p_value = p_values
        elif index < len(p_values):
            p_value = p_values[index]
        else:
            p_value = None

        if p_value <= significance_cutoff:
            distinct = 'True'
        else:
            distinct = 'False'

        lower, upper = hsd_result.confint[index][0], hsd_result.confint[index][1]
        hsd_data.append([
            pair_row[0],
            pair_row[1],
            hsd_result.meandiffs[index],
            lower,
            upper,
            p_value,
            distinct,
        ])

    return {
        'column_headers': hsd_headers,
        'rows': hsd_data
    }
def test_ANOVA_and_T_HSD(array_of_arrays):
    """Run a one-way ANOVA and, if p < 0.05, a Tukey HSD post-hoc test.

    Returns ``(p, tukey_result)`` when the ANOVA is significant and
    ``(p, False)`` otherwise.  Groups are labelled by their position.
    """
    ANOVA_f, ANOVA_p = scipy.stats.f_oneway(*array_of_arrays)
    if not ANOVA_p < 0.05:
        return ANOVA_p, False

    # Flatten the groups and label each value with its group number.
    values = []
    labels = []
    for grp_num, group in enumerate(array_of_arrays):
        values += list(group)
        labels += [grp_num] * len(group)

    tukey_HSD_result = pairwise_tukeyhsd(np.array(values), np.array(labels))
    return ANOVA_p, tukey_HSD_result
F ~ F dist with k -1, n-k degs of freedom ''' stat, pval = stats.f_oneway(sample1, sample2, sample3, ...) ''' If we reject the null hypothesis, do all the pairwise comparisons to determine which means are different. Do: Bonferroni correction. If the overall alpha we want is 0.05, use alpha* = alpha / (k choose 2) as the significance level for each pairwise test ''' # another option: use statmodels from statsmodels.stats.multicomp import pairwise_tukeyhsd print pairwise_tukeyhsd(Data, Group) ''' Tukey: Create confidence intervals for all differences of means, see if the confidence interval contains 0 or not. It assumes independence of the observations being tested, as well as equal variation across observations (homoscedasticity). Tukey's test is essentially a Student's t-test, except that it corrects for family-wise error-rate. This is multicomparison. The output is like Multiple Comparison of Means - Tukey HSD,FWER=0.05 ================================================ group1 group2 meandiff lower upper reject
# First, do an one-way ANOVA df = pd.DataFrame(dta2) model = ols('StressReduction ~ C(Treatment)',df).fit() anovaResults = anova_lm(model) print anovaResults if anovaResults['PR(>F)'][0] < 0.05: print('One of the groups is different.') #Then, do the multiple testing mod = MultiComparison(dta2['StressReduction'], dta2['Treatment']) print mod.tukeyhsd()[0] # The following code produces the same printout res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment']) #print res2[0] # Show the group names print mod.groupsunique # Generate a print import matplotlib.pyplot as plt plt.plot([0,1,2], res2[1][2], 'o') plt.errorbar([0,1,2], res2[1][2], yerr=np.abs(res2[1][4].T-res2[1][2]), ls='o') xlim = -0.5, 2.5 plt.hlines(0, *xlim) plt.xlim(*xlim) pair_labels = mod.groupsunique[np.column_stack(res2[1][0])] plt.xticks([0,1,2], pair_labels) plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
def test_shortcut_function(self):
    """The pairwise_tukeyhsd wrapper reproduces the reference result."""
    wrapped = pairwise_tukeyhsd(self.endog, self.groups, alpha=self.alpha)
    actual = wrapped[1][4]
    expected = self.res[1][4]
    assert_almost_equal(actual, expected, decimal=14)
def anova(data_dict, language='en'):
    """Run a one-way ANOVA and a Tukey HSD post-hoc test on group means.

    ``data_dict`` maps a group key to an inner mapping whose values are
    sequences of numbers.  Every inner sequence is reduced to its arithmetic
    mean; the means belonging to one group key form that group's sample.

    The function prints the sorted group keys and samples, the F statistic
    and p-value from ``scipy.stats.f_oneway``, then runs
    ``pairwise_tukeyhsd`` (alpha = 0.05) on the same means, plots the
    simultaneous confidence intervals and prints the summary table.

    Group keys are translated through ``lang.get_vocabulary(language)``
    before they are used as group labels in the Tukey test.

    Args:
        data_dict: mapping ``group key -> {sub key -> sequence of numbers}``.
        language: language code handed to ``lang.get_vocabulary``.

    Raises:
        ValueError: if ``data_dict`` is empty.
        ZeroDivisionError: if one of the inner sequences is empty.
    """
    if not data_dict:
        # The original failed here too, but with an opaque unpacking error.
        raise ValueError('anova() needs at least one group in data_dict')

    alpha = 0.05
    means_per_strat = {}
    join_vals = []
    join_group = []
    for key, samples in data_dict.items():
        group_means = [sum(vals) / float(len(vals))
                       for vals in samples.values()]
        means_per_strat[key] = group_means
        join_vals.extend(group_means)
        join_group.extend([key] * len(group_means))

    sorted_ci = sorted(means_per_strat.items(), key=operator.itemgetter(1))
    s_keys, s_values = zip(*sorted_ci)
    # str.format keeps the output identical on Python 2 and Python 3.
    print('s_keys: {0}'.format(s_keys))
    print('s_values: {0}'.format(s_values))

    res = scipy.stats.f_oneway(*s_values)
    statistic = res[0]
    pvalue = res[1]
    print('ANOVA Analysis: F value: {0} P value: {1} a: {2}'.format(
        statistic, pvalue, alpha))

    # Translate the labels BEFORE building the array.  Assigning into an
    # existing NumPy string array silently truncates anything longer than
    # the array's fixed item width, which could mangle or even merge groups.
    words = lang.get_vocabulary(language)
    join_vals = np.asarray(join_vals)
    join_group = np.asarray([words[key] for key in join_group])

    tukey = pairwise_tukeyhsd(endog=join_vals, groups=join_group, alpha=alpha)
    tukey.plot_simultaneous()  # Plot group confidence intervals
    print(tukey.summary())  # See test summary
print('AUC: ' + str(auc))

import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats.mstats import kruskalwallis, friedmanchisquare

# Category columns, in the order that defines the integer group codes
# (0 .. len(Array) - 1) handed to the Tukey test below.
Array = ['NL', 'BPH', 'HGPIN', 'G3', 'G4', 'G5']

# Pool the area values of every category into one Series and drop missing
# values.  pd.concat replaces the chained Series.append calls, which were
# removed in pandas 2.0; the resulting order (column after column) is the same.
multiarea = pd.concat([magi_area[column] for column in Array]).dropna()

# One group code per retained (non-NaN) observation, in the same
# column-after-column order as ``multiarea`` so the two stay aligned.
multilesion = []
for code, column in enumerate(Array):
    multilesion.extend([code] * len(magi_area[column].dropna()))

print(pairwise_tukeyhsd(multiarea, multilesion))

# Kruskal-Wallis tests across all categories, for area and for stain.
print(kruskalwallis(*[magi_area[column].dropna() for column in Array]))
print(kruskalwallis(*[magi_stain[column].dropna() for column in Array]))
except KeyboardInterrupt: pool.close() print('stopping all simulations...') finally: pool.terminate() pool.join() c = dict(Counter(completed).most_common()) for idx, sim in enumerate([s.__name__ for s in sim_list]): print('Test: {0}, Iterations {1}, Heuristic: {2}'.format(idx, c[sim], sim)) for pair in pairs: print('') print(pair) f, p = f_oneway(*[[i[1] for i in container[pair] if i[0] == sim] for sim in [s.__name__ for s in sim_list]]) print('F-stat: {0} at sig {1}: {2}'.format(str(round(f, 3)).ljust(7), str(round(p, 3)).ljust(7), ['NULL','REJECT'][p <= .05])) if p <= .05: dta2 = numpy.rec.array(container[pair], dtype=[('test', '|S100'), ('wins', int)]) print(pairwise_tukeyhsd(dta2['wins'], dta2['test']))
def main():
    """Demonstrate a one-way ANOVA followed by multiple-comparison tests.

    Builds a small stress-reduction data set (three treatments with ten
    rows each), fits a one-way ANOVA, runs Tukey's HSD test, plots the
    pairwise mean differences with their confidence intervals (saved to
    ``MultComp.png`` and shown on screen), and then runs pairwise t-tests
    with the Holm and the Bonferroni correction.

    Returns:
        The ``variance`` attribute of the Tukey HSD result.
    """
    # Note: the statsmodels module is required here.
    from statsmodels.stats.multicomp import (pairwise_tukeyhsd,
                                             MultiComparison)
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    # Set up the data, as a structured array.
    # The first and last field are 32-bit integers; the second field is an
    # 8-byte string. Note that here we can also give names to the individual
    # fields!
    dta2 = np.rec.array([
        (1, 'mental', 2),
        (2, 'mental', 2),
        (3, 'mental', 3),
        (4, 'mental', 4),
        (5, 'mental', 4),
        (6, 'mental', 5),
        (7, 'mental', 3),
        (8, 'mental', 4),
        (9, 'mental', 4),
        (10, 'mental', 4),
        (11, 'physical', 4),
        (12, 'physical', 4),
        (13, 'physical', 3),
        (14, 'physical', 5),
        (15, 'physical', 4),
        (16, 'physical', 1),
        (17, 'physical', 1),
        (18, 'physical', 2),
        (19, 'physical', 3),
        (20, 'physical', 3),
        (21, 'medical', 1),
        (22, 'medical', 2),
        (23, 'medical', 2),
        (24, 'medical', 2),
        (25, 'medical', 3),
        (26, 'medical', 2),
        (27, 'medical', 3),
        (28, 'medical', 1),
        (29, 'medical', 3),
        (30, 'medical', 1)],
        dtype=[('idx', '<i4'),
               ('Treatment', '|S8'),
               ('StressReduction', '<i4')])

    # First, do a one-way ANOVA
    df = pd.DataFrame(dta2)
    model = ols('StressReduction ~ C(Treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)
    # The p-value column is indexed by term name, so the first row has to be
    # picked by position explicitly.
    if anovaResults['PR(>F)'].iloc[0] < 0.05:
        print('One of the groups is different.')

    # Then, do the multiple testing
    mod = MultiComparison(dta2['StressReduction'], dta2['Treatment'])
    print(mod.tukeyhsd().summary())

    # The following code produces the same printout
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])

    # Show the group names
    print(mod.groupsunique)

    # Generate a plot of the pairwise mean differences
    import matplotlib.pyplot as plt

    # One x position per pairwise comparison (three for three groups).
    xvals = np.arange(len(res2.meandiffs))
    plt.plot(xvals, res2.meandiffs, 'o')
    errors = np.ravel(np.diff(res2.confint) / 2)
    # Only the error bars are drawn here; the markers come from plt.plot.
    # ('o' is a marker, not a line style, and is rejected as ``ls``.)
    plt.errorbar(xvals, res2.meandiffs, yerr=errors, linestyle='None')
    xlim = -0.5, len(xvals) - 0.5
    plt.hlines(0, *xlim)
    plt.xlim(*xlim)
    pair_labels = mod.groupsunique[
        np.column_stack(res2._multicomp.pairindices)]
    plt.xticks(xvals, pair_labels)
    plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
              '\n Pairwise Mean Differences')

    # Save to outfile
    outFile = 'MultComp.png'
    plt.savefig(outFile, dpi=200)
    print('Figure written to {0}'.format(outFile))
    plt.show()

    # Instead of the Tukey's test, we can do pairwise t-test
    # NOTE(review): ttest_rel is a paired test; confirm that a paired design
    # (rather than ttest_ind) is intended for these treatment groups.
    # First, with the "Holm" correction
    rtp = mod.allpairtest(stats.ttest_rel, method='Holm')
    print(rtp[0])

    # and then with the Bonferroni correction
    print(mod.allpairtest(stats.ttest_rel, method='b')[0])

    # Done this way, the variance is calculated at each comparison.
    # If you want the joint variance across all samples, you have to
    # use a few tricks:(http://jpktd.blogspot.co.at/2013/03/multiple-comparison-and-tukey-hsd-or_25.html)
    res2 = pairwise_tukeyhsd(dta2['StressReduction'], dta2['Treatment'])
    studentized_mean = res2.meandiffs
    studentized_variance = res2.variance
    t_stat = (studentized_mean / studentized_variance) / np.sqrt(2)
    dof = len(dta2) - len(mod.groupsunique)
    my_pvalues = stats.t.sf(np.abs(t_stat), dof) * 2  # two-sided

    # Now with the Bonferroni correction
    from statsmodels.stats.multitest import multipletests
    res_b = multipletests(my_pvalues, method='b')

    return res2.variance
anova = pairwise_tukeyhsd(endog=df['diffs'], groups=df['qualification_performance'], alpha=0.05) print anova.summary() f = open("../data/anovaResults.txt", "w") f.write(anova.__str__()) ################################################################################ pre_scores = "data/pre_test_responses" post_scores = "data/post_test_responses" learners = "data/class_data/class.json" import numpy as np import os import pandas as pd import scipy.stats as st import scipy as sc import json from statsmodels.stats.multicomp import pairwise_tukeyhsd import statsmodels.api as sm from statsmodels.formula.api import ols from copy import deepcopy pre_json = [] file_list = os.listdir(pre_scores) for f in file_list: of = open(os.path.join(pre_scores, f))
#(a4, p4) = readData(p + 'sda/48-36-24,3-2-2,003-001,15-15,10/', 'sl3_')
#(a5, p5) = readData(p + 'sda-nolog/48-36-24,3-2-2,003-001,15-15,10/', 'snl3')
#(a6, p6) = readData(p + 'mlp/48-36-24,001,15,10/', 'mlp3')
(a7, p7) = readData(p + 'sda/36,3-2-2,003-001,15-15,10/', 'sl1_')
(a8, p8) = readData(p + 'sda-nolog/36,3-2-2,003-001,15-15,10/', 'snl1')
(a9, p9) = readData(p + 'mlp/36,001,15,10/', 'mlp1')
#(a1, p1) = readData(p + '36,001,15,10/', 'lay1')
#(a2, p2) = readData(p + '48-36-24,001,15,10/', 'lay3')
#(a3, p3) = readData(p + '48-36,001,15,10/', 'lay2')

# NOTE(review): a1..a3 / p1..p3 are not assigned in this section; they are
# assumed to be defined earlier in the script.
a = np.vstack((p1, p2, p3, p7, p8, p9))
print('Friedman chi-square test')
print(scipy.stats.friedmanchisquare(a1, a2, a3, a7, a8, a9))

b = np.rec.array(a, dtype=[('val', '<f'), ('feature', '|S4')])
res2 = pairwise_tukeyhsd(b['val'], b['feature'])
mod = MultiComparison(b['val'], b['feature'])
print(res2)

import matplotlib.pyplot as plt

# Tukey HSD yields one mean difference per PAIR of groups, so the x
# positions are derived from the result instead of being hard-coded;
# fixed positions only fit one particular number of groups.
meandiffs = res2[1][2]
xvals = np.arange(len(meandiffs))
plt.plot(xvals, meandiffs, 'o')
# Only the error bars are drawn here; the markers come from plt.plot.
# ('o' is a marker, not a line style, and is rejected as ``ls``.)
plt.errorbar(xvals, meandiffs,
             yerr=np.abs(res2[1][4].T - meandiffs), linestyle='None')
# Leave half a unit of margin on both sides so every comparison is visible.
xlim = -0.5, len(xvals) - 0.5
plt.hlines(0, *xlim)
plt.xlim(*xlim)
pair_labels = mod.groupsunique[np.column_stack(res2[1][0])]
plt.xticks(xvals, pair_labels)
plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
          '\n Pairwise Mean Differences')
plt.show()