def anova_analysis():
    """Run a one-way ANOVA plus a Tukey HSD post-hoc test and write both
    results to Analysis/ANOVA.xlsx, then open the workbook.

    Relies on module-level names: data (DataFrame), var_formula (tk variable
    holding a "dependent~factor" formula), multi, pd, ols, sm, os.
    """
    data_dropped_na = data.dropna()
    # Parse the "dependent~factor" formula once instead of re-reading the
    # widget and re-splitting on every use.
    formula = var_formula.get()
    dependent = formula.split('~')[0]
    factor = formula.split('~')[1]

    # Tukey HSD pairwise comparisons of the dependent across factor levels.
    mc1 = multi.MultiComparison(data_dropped_na[dependent],
                                data_dropped_na[factor])
    result = mc1.tukeyhsd()

    # Parse the fixed-width text summary into a DataFrame (line 2 holds the
    # column headers, data rows start at line 4, last line is the border).
    a_list = result.summary().as_text().split('\n')
    cols = [c for c in a_list[2].split(' ') if c]
    df = pd.DataFrame(columns=cols)
    for i in range(4, len(a_list) - 1):
        df.loc[i - 4] = [item for item in a_list[i].split(' ') if item]

    # Type-2 ANOVA table from an OLS fit of the same formula.
    mod = ols(formula, data=data_dropped_na).fit()
    aov_table = sm.stats.anova_lm(mod, typ=2)

    # os.path.join avoids the hard-coded "\" separator: the original
    # 'Analysis\ANOVA.xlsx' relies on an invalid escape sequence, which is
    # an error on Python 3.12+ and non-portable anyway.
    out_path = os.path.join('Analysis', 'ANOVA.xlsx')
    writer = pd.ExcelWriter(out_path)
    caption = pd.DataFrame(columns=[dependent])
    caption.to_excel(writer, sheet_name='Sheet1')
    aov_table.to_excel(writer, sheet_name='Sheet1', startcol=1)
    df.to_excel(writer, sheet_name='Sheet1', startcol=7)
    # close() flushes and saves; ExcelWriter.save() was removed in pandas 2.0.
    writer.close()
    os.startfile(out_path)
def test_using_anova(model, model_results, homoskedastic, df, dependent_var,
                     *independent_vars, anova_type=2):
    """Build an ANOVA table for the fitted model and follow up with a
    Tukey HSD multiple-comparison test.

    Returns a (report_text, results_dict) pair where the dict holds the
    ANOVA table under 'anova' and the Tukey results under 'multiple'.
    """
    # Use a robust (HC3) covariance estimate only when the residuals are
    # heteroskedastic.
    robust = None if homoskedastic else 'hc3'
    table = augment_anova_table(
        sm.stats.anova_lm(model_results, typ=anova_type, robust=robust))

    # Collapse every factor column into one composite group label per row,
    # then run Tukey's HSD across those composite groups.
    group_labels = df.loc[:, independent_vars].astype(str).agg(','.join, axis=1)
    hsd = multicomp.MultiComparison(df[dependent_var], group_labels).tukeyhsd()

    report = f"ANOVA\n{table}\n\n" + f"Tukey's HSD:\n{hsd}\n\n"
    return report, {'anova': table, 'multiple': hsd}
def bonferroni(item):
    """Pairwise independent t-tests of `item` between Condition groups,
    Bonferroni-corrected, printed as a table.

    Relies on module-level names: data (DataFrame), mc (statsmodels
    multicomp module), stats (scipy.stats).
    """
    subset = data[['Condition', item]].copy()
    # MultiComparison wants string group labels.
    subset['Condition'] = subset['Condition'].map(str)
    comparison = mc.MultiComparison(subset[item], subset['Condition'])
    table, _, _ = comparison.allpairtest(stats.ttest_ind, method="bonf")
    print(table)
def tukeys_test(self, Features=None, Clstrs=None, alpha=0.05):
    """Tukey's range test comparing the means of all pairs of clusters.

    Parameters
    ----------
    Features : 2D_array_like, optional
        Feature columns to test; defaults to every data column except the
        last (assumed to be the cluster assignment).
    Clstrs : array_like, optional
        Cluster labels to include; defaults to all clusters present.
    alpha : float
        FWER at which to calculate the HSD.

    Prints a results summary per feature.
    """
    if Features is None:
        Features = self.__data.columns[:-1].copy()
    if Clstrs is None:
        Clstrs = self.__data["Clusters"].copy()
    # Deduplicate and order the cluster labels for a stable comparison set.
    cluster_ids = Clstrs.dropna().unique().tolist()
    cluster_ids.sort()
    for feature in Features:
        print("\n\n", feature, "\n")
        frame = self.__data[[feature, "Clusters"]].copy()
        frame = frame.dropna()
        keep = frame["Clusters"].isin(cluster_ids)
        comparison = multi.MultiComparison(frame[keep][feature],
                                           frame[keep]["Clusters"])
        print(comparison.tukeyhsd(alpha).summary(), "\n\n")
def tukey(item):
    """Tukey HSD post-hoc test of `item` across Condition groups.

    Relies on module-level names: data (DataFrame with a 'Condition'
    column), mc (statsmodels multicomp module).
    """
    modelData = data[['Condition', item]].copy()
    # Bug fix: the copied frame only has a 'Condition' column (capital C);
    # the original read modelData['condition'], which raises KeyError.
    modelData['Condition'] = modelData['Condition'].map(str)
    comp = mc.MultiComparison(modelData[item], modelData['Condition'])
    post_hoc_res = comp.tukeyhsd()
    # (The original also called summary() once and discarded the result.)
    print(post_hoc_res.summary())
def compare_many(data):
    '''Multiple comparisons: Which one is different? '''
    print('\n MultComp: --------------------------------------')

    # An ANOVA is only a hypothesis test for "are all groups from the same
    # distribution?" -- it does not say WHICH group differs.  Comparing many
    # groups pairwise requires adjusted p-values to control the Type I error
    # rate, which statsmodels' "multicomp" module provides.
    mc = multicomp.MultiComparison(data['weight'], data['group'])

    # Tukey's Honest Significant Difference test; the summary is a table.
    print(mc.tukeyhsd().summary())
    # Show the group names.
    print(mc.groupsunique)

    # Re-run to get the result object used for plotting.
    tukey_result = mc.tukeyhsd()
    simple = False
    if simple:
        # One-liner plot -- does not (yet) look that great.
        tukey_result.plot_simultaneous()
    else:
        # Hand-made plot: mean differences with their confidence intervals.
        xvals = np.arange(3)
        plt.plot(xvals, tukey_result.meandiffs, 'o')
        half_widths = np.ravel(np.diff(tukey_result.confint) / 2)
        plt.errorbar(xvals, tukey_result.meandiffs, yerr=half_widths, fmt='o')

        # Reference line at zero difference, with fixed x-limits.
        xlim = -0.5, 2.5
        plt.hlines(0, *xlim)
        plt.xlim(*xlim)

        # Build "groupA / groupB" tick labels by pairing the group names via
        # the pairwise indices (NOTE: _multicomp.pairindices is a private
        # statsmodels attribute and may change between versions).
        pair_labels = mc.groupsunique[
            np.column_stack(tukey_result._multicomp.pairindices)]
        plt.xticks(xvals, pair_labels)
        plt.title('Multiple Comparison of Means - Tukey HSD, FWER=0.05' +
                  '\n Pairwise Mean Differences')
    plt.show()
def anova_analysis():
    """Run ANOVA + Tukey HSD for each dependent variable in the GUI formula
    and write all results side by side to ../../Analysis/ANOVA.xlsx, then
    open the workbook and report status.

    Relies on module-level names: data, var_formula, print_status, multi,
    pd, ols, sm, os.
    """
    if var_formula.get() == '':
        print_status("Warning: Formula is missing", 'red')
        return
    data_dropped_na = data.dropna()
    # Loop variable renamed c -> avoids shadowing the `col` counter below.
    continuous_columns = [
        c for c in data_dropped_na.columns
        if data_dropped_na[c].dtype != 'object'
    ]
    factor = var_formula.get().split('~')[1]
    # A '.' on the left-hand side means "every continuous column".
    if '.' in var_formula.get():
        dependent_vars = continuous_columns
    else:
        dependent_vars = var_formula.get().split('~')[0].split('+')

    # os.path.join replaces the mixed '/' and '\' separators the original
    # used for os.startfile ('../../Analysis\ANOVA.xlsx' also relies on an
    # invalid escape sequence, an error on Python 3.12+).
    out_path = os.path.join('..', '..', 'Analysis', 'ANOVA.xlsx')
    writer = pd.ExcelWriter(out_path)
    col = 1
    for dependent in dependent_vars:
        # Tukey HSD pairwise comparison of `dependent` across factor levels.
        mc1 = multi.MultiComparison(data_dropped_na[dependent],
                                    data_dropped_na[factor])
        result = mc1.tukeyhsd()
        # Parse the fixed-width text summary into a DataFrame (headers on
        # line 2, data rows from line 4, last line is the border).
        a_list = result.summary().as_text().split('\n')
        cols = [c for c in a_list[2].split(' ') if c]
        df = pd.DataFrame(columns=cols)
        for i in range(4, len(a_list) - 1):
            df.loc[i - 4] = [item for item in a_list[i].split(' ') if item]

        # Type-2 ANOVA table from an OLS fit of "dependent ~ factor".
        mod = ols(dependent + '~' + factor, data=data_dropped_na).fit()
        aov_table = sm.stats.anova_lm(mod, typ=2)

        caption = pd.DataFrame(columns=[dependent])
        caption.to_excel(writer, sheet_name='Sheet1', startrow=2, startcol=col)
        aov_table.to_excel(writer, sheet_name='Sheet1', startcol=col, startrow=3)
        df.to_excel(writer, sheet_name='Sheet1', startrow=3, startcol=col + 7)
        col += 16  # next variable's block starts 16 columns to the right

    # close() saves the workbook; ExcelWriter.save() was removed in pandas 2.0.
    writer.close()
    os.startfile(out_path)
    print_status('Status: Successful analysis', 'black')
def Stat_Test(df_gene, df_sg): comp_groups = raw_input( 'Choose the groups you want to compare. (e.g. good/bad or good/no ...)' ) tests = raw_input( 'Which statistical test you want to do? (e.g. unpaired_t_test or mann_whitney_test or fisher_exact_test or mann_whitney_test&fisher_exact_test ...)' ) group_list = comp_groups.split('/') test_list = tests.split('&') if len(group_list) == 2: group1 = group_list[0] group2 = group_list[1] list_g1 = list(df_sg.loc[df_sg.Group == group1, 'Sample']) list_g2 = list(df_sg.loc[df_sg.Group == group2, 'Sample']) df_gene = df_gene.loc[:, ['Gene'] + list_g1 + list_g2] df_gene[group1 + '_Sum'] = df_gene.loc[:, list_g1].sum(axis=1) df_gene[group2 + '_Sum'] = df_gene.loc[:, list_g2].sum(axis=1) df_gene = df_gene.loc[(df_gene[group1 + '_Sum'] != 0) & (df_gene[group2 + '_Sum'] != 0), :] for t in test_list: if t == 'unpaired_t_test': df_gene['unpaired_t_test'] = df_gene.apply( lambda row: unpaired_t_test(row, list_g1, list_g2), axis=1) elif t == 'mann_whitney_test': df_gene['mann_whitney_test'] = df_gene.apply( lambda row: mann_whitney_test(row, list_g1, list_g2), axis=1) elif t == 'fisher_exact_test': df_gene['fisher_exact_test'] = df_gene.apply( lambda row: fisher_exact_test(row, list_g1, list_g2), axis=1) else: print 'Please choose from unpaired_t_test, mann_whitney_test and fisher_exact_test' sys.exit(1) p_cor_flag = raw_input( 'Do you want to perform mutiple testing correction? (Y|N)') if p_cor_flag == 'Y': p_cor = raw_input( 'Which correction method do you want to choose? (e.g. bonferroni or fdr_bh or bonferroni&fdr_bh)' ) p_cor = p_cor.split('&') p_val = raw_input( 'Which p value do you want to correct? (e.g. unpaired_t_test or mann_whitney_test or fisher_exact_test' ) for cor in p_cor: df_gene[cor] = ssm.MultiComparison(df_gene[p_val], alpha=0.05, method=cor) return df_gene
def tukey(data=None, independent=None, dependent=None):
    """Tukey HSD test of `dependent` (numerical) across the levels of
    `independent` (categorical); displays the summary table and the
    simultaneous confidence-interval plot.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)
    independent, dependent = str(independent), str(dependent)
    # Bail out early if the column types don't match the test's needs.
    if input_check_numerical_categorical(data, independent, dependent):
        return
    comparison = multi.MultiComparison(data[dependent], data[independent])
    outcome = comparison.tukeyhsd()
    display(outcome.summary())
    outcome.plot_simultaneous()
    return
def anova(wine_set):
    """One-way ANOVA of total sulfur dioxide across wine quality marks,
    followed by per-group descriptives and a Tukey HSD post-hoc test.
    """
    prepared_data = add_categ_quality(wine_set)
    # OLS fit with quality_mark as a categorical factor.
    fitted = smf.ols(formula="total_sulfur_dioxide ~ C(quality_mark)",
                     data=prepared_data).fit()
    print(fitted.summary())

    sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]
    grouped = sub.groupby('quality_mark')
    print("\nMeans for total sulfur dioxide by quality marks of wine")
    print(grouped.mean())
    print("\nStandard deviations for total sulfur dioxide by quality marks of wine")
    print(grouped.std(), '\n')

    # Post-hoc: Tukey's Honestly Significant Difference test.
    comparison = multi.MultiComparison(sub['total_sulfur_dioxide'],
                                       sub['quality_mark'])
    print(comparison.tukeyhsd().summary())
def anova(wine_set):
    """Does mean total sulfur dioxide differ between wine quality marks?

    Fits an OLS model with quality_mark as a categorical factor, prints the
    fit summary and per-group mean/std, then runs a Tukey HSD comparison.
    """
    prepared_data = add_categ_quality(wine_set)
    results1 = smf.ols(
        formula="total_sulfur_dioxide ~ C(quality_mark)",
        data=prepared_data).fit()
    print(results1.summary())

    sub = prepared_data[['total_sulfur_dioxide', 'quality_mark']]
    print("\nMeans for total sulfur dioxide by quality marks of wine")
    print(sub.groupby('quality_mark').mean())
    print("\nStandard deviations for total sulfur dioxide by quality marks of wine")
    print(sub.groupby('quality_mark').std(), '\n')

    # Tukey HSD post-hoc comparison between each pair of quality marks.
    posthoc = multi.MultiComparison(sub['total_sulfur_dioxide'],
                                    sub['quality_mark']).tukeyhsd()
    print(posthoc.summary())
def test_using_kruskal(df, dependent_var, *independent_vars, correction_method='bonf'): """ Test for the significance of factors using the non-parametric Kruskal-Wallis test followed by a Mann-Whitney U test with Bonferroni correction """ output = '' results = {} unique_values = df.groupby( list(independent_vars)).size().reset_index().rename( columns={0: 'count'}) test_data = [] for row in unique_values.itertuples(index=False): if len(independent_vars) > 1: selectors = [(df[v] == getattr(row, v)) for v in independent_vars] row_selector = np.logical_and(*selectors[:2]) for idx in range(2, len(independent_vars)): row_selector = np.logical_and(row_selector, selectors[idx]) else: v = independent_vars[0] row_selector = df[v] == getattr(row, v) test_data.append(df.loc[row_selector, dependent_var]) assert len(test_data) == unique_values.shape[0] test_results = spstats.kruskal(*test_data) output += f"Kruskal-Wallis test:\n{test_results.statistic}, {test_results.pvalue}\n\n" results['kruskal'] = test_results # Perform multiple comparisons with Bonferroni correction try: mc = multicomp.MultiComparison( df[dependent_var], df.loc[:, independent_vars].astype(str).agg(','.join, axis=1)) mc_results = mc.allpairtest(spstats.mannwhitneyu, method=correction_method) output += f"Pairwise Mann-Whitney U:\n{mc_results[0]}\n\n" results['multiple'] = mc_results[0] except Exception as e: print("ERROR:", e) return output, results
def anova_test(df):
    """One-way ANOVA of call duration across job categories, followed by a
    Tukey HSD post-hoc test when the ANOVA is significant.

    Relies on module-level names: ss (scipy.stats), multi (statsmodels
    multicomp module).
    """
    import pandas as pd  # local import: used only for to_numeric below

    col1 = 'job'
    col2 = 'duration'
    duration_frame = df[[col1, col2]].copy()
    groups = duration_frame.groupby(col1)
    # Job-groups
    admin = groups.get_group('admin.')['duration']
    bluecollar = groups.get_group('blue-collar')['duration']
    student = groups.get_group('student')['duration']
    housemaid = groups.get_group('housemaid')['duration']
    services = groups.get_group('services')['duration']
    unemployed = groups.get_group('unemployed')['duration']
    entrepreneur = groups.get_group('entrepreneur')['duration']
    selfemployed = groups.get_group('self-employed')['duration']
    retired = groups.get_group('retired')['duration']
    print('Admin:\n')
    print(admin.head())

    F, p = ss.f_oneway(admin, bluecollar, student, housemaid, services,
                       unemployed, entrepreneur, selfemployed, retired)
    print('{} {}\n'.format(F, p))
    if p < 0.05:
        print(
            'Null hypothesis rejected. Statistical difference found. Conduct post-hoc tests.'
        )
    else:
        print('Not much difference found. Can accept null hypothesis.')

    # Conducting a Post-Hoc test.
    # Bug fix: Series.convert_objects(convert_numeric=True) was removed from
    # pandas; pd.to_numeric with errors='coerce' is the supported equivalent
    # (non-numeric entries become NaN, as before).
    duration_frame[col2] = pd.to_numeric(duration_frame[col2], errors='coerce')
    mc = multi.MultiComparison(duration_frame[col2], duration_frame[col1])
    result = mc.tukeyhsd()
    print(result.summary())
def run_Tukey(self, t, name_list, z):
    """Run Tukey's HSD on the grouped data in `t`.

    t : sequence of 1-D-reshapable arrays, one per group
    name_list : group label for each entry of `t`
    z : FWER (alpha) passed to tukeyhsd
    Returns the statsmodels Tukey HSD results object.
    """
    scores = []
    labels = []
    # Flatten each group's values and repeat its label once per value.
    for values, label in zip(t, name_list):
        scores = np.concatenate((scores, np.reshape(values, len(values))))
        labels.extend([label] * len(values))

    frame = pd.DataFrame({'Score': np.asarray(scores),
                          'Group': np.asarray(labels)})
    comparison = ml.MultiComparison(
        pd.to_numeric(frame.Score, errors="ignore"), frame.Group)
    return comparison.tukeyhsd(z)
pokerhand_test=data_test['CLASS'] # put into a pandas dataFrame pokerhand_train=pd.DataFrame(pokerhand_train) pokerhand_test=pd.DataFrame(pokerhand_test) pokerhand_train.reset_index(level=0, inplace=True) # reset index merged_train_all=pd.merge(pokerhand_train, merged_train, on='index') # merge the pokerhand train with merged clusters sub1 = merged_train_all[['CLASS', 'cluster']].dropna() import statsmodels.formula.api as smf import statsmodels.stats.multicomp as multi # respone formula pokermod = smf.ols(formula='CLASS ~ cluster', data=sub1).fit() print (pokermod.summary()) print ('means for Poker hands by cluster') m1= sub1.groupby('cluster').mean() print (m1) print ('standard deviations for Poker hands by cluster') m2= sub1.groupby('cluster').std() print (m2) mc1 = multi.MultiComparison(sub1['CLASS'], sub1['cluster']) res1 = mc1.tukeyhsd() print(res1.summary())
# validate clusters in training data by examining cluster differences in GPA using ANOVA # first have to merge GPA with clustering variables and cluster assignment data gpa_data = data_clean['internetuserate'] # split GPA data into train and test sets gpa_train, gpa_test = train_test_split(gpa_data, test_size=.3, random_state=123) gpa_train1 = pd.DataFrame(gpa_train) gpa_train1.reset_index(level=0, inplace=True) merged_train_all = pd.merge(gpa_train1, merged_train, on='index') sub1 = merged_train_all[['internetuserate', 'cluster']].dropna() import statsmodels.formula.api as smf import statsmodels.stats.multicomp as multi gpamod = smf.ols(formula='internetuserate ~ (cluster)', data=sub1).fit() print(gpamod.summary()) print('means for internetuserate by cluster') m1 = sub1.groupby('cluster').mean() print(m1) print('standard deviations for internetuserate by cluster') m2 = sub1.groupby('cluster').std() print(m2) mc1 = multi.MultiComparison(sub1['internetuserate'], sub1['cluster']) res1 = mc1.tukeyhsd() print(res1.summary())
# Reorder and show the (previously computed) ANOVA table, including the
# effect-size columns added earlier.
cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
aov = aov[cols]
print(aov)
print(" ")
# (Typos "POST-HOST" / "SIGNIFFICANCE" are in user-facing strings and are
# left untouched here.)
print(" POST-HOST TESTING ")
# POST-HOC TESTING
print(" ")
# Tukey's Honestly Significant Difference: which pairs of methods differ?
print("TUKEY HONESTLY SIGNIFFICANCE DIFFERENCE")
import statsmodels.stats.multicomp as mc
comp = mc.MultiComparison(df['Scores'], df['Method'])
post_hoc_res = comp.tukeyhsd()
post_hoc_res.summary()
print(post_hoc_res)
# Plot simultaneous confidence intervals for the group means.
post_hoc_res.plot_simultaneous(ylabel="Method", xlabel="Score Differences")
#BONFERRONI CORRECTION
print(" ")
print("BONFERRONI CORRECTION")
print(" ")
# Pairwise t-tests with Bonferroni-adjusted p-values, for comparison with
# the Tukey results above.
import statsmodels.stats.multicomp as mc
comp = mc.MultiComparison(df['Scores'], df['Method'])
tbl, a1, a2 = comp.allpairtest(stats.ttest_ind, method="bonf")
print(tbl)