def ANOVA_Tukey(variable):
    """Run a one-way ANOVA and a Tukey post-hoc test of ``variable`` by diagnosis.

    Reads the ``"Overall"`` entry of the module-level ``diagnoses`` mapping,
    casts ``variable`` to ``float64`` and compares its values between the
    groups found in the ``"diagnosis"`` column.

    :param variable: name of the column holding the dependent variable.
    :return: ``([F, p], tukey)`` where ``[F, p]`` comes from
        ``stats.f_oneway`` and ``tukey`` is whatever ``pairwise_tukey``
        returns for the same data.
    """
    subset = diagnoses["Overall"][["diagnosis", variable]].astype(
        {variable: 'float64'})

    # Spread the values into one column per diagnosis, then strip the NaN
    # cells of every column before handing the samples to the F-test.
    wide = subset.pivot(columns="diagnosis", values=variable)
    samples = []
    for group in wide:
        samples.append(wide[group].dropna().values)

    f_val, p_val = stats.f_oneway(*samples)
    tukey = pairwise_tukey(data=subset, dv=variable, between="diagnosis")
    return [f_val, p_val], tukey
def tukey_pairwise_ph(tidy_df,
                      hour_col: str = "Hour",
                      dep_var: str = "Value",
                      protocol_col: str = "Protocol"):
    """Run a Tukey post-hoc test between protocols separately for every hour.

    Each hour's value and its result table are printed as they are computed.

    :param tidy_df: long-format DataFrame containing the ``hour_col``,
        ``dep_var`` and ``protocol_col`` columns.
    :param hour_col: column whose unique values define the subsets tested.
    :param dep_var: column holding the dependent variable.
    :param protocol_col: column holding the between-subject factor.
    :return: the per-hour ``pg.pairwise_tukey`` tables joined with
        ``pd.concat``, keyed by hour.
    """
    ph_dict = {}
    for hour in tidy_df[hour_col].unique():
        print(hour)
        # Filter with a boolean mask instead of a string-built ``query``:
        # the interpolated expression always compared against a quoted
        # string, so it broke on non-string hours, on values containing a
        # quote and on column names that are not valid identifiers.
        hour_df = tidy_df[tidy_df[hour_col] == hour]
        ph = pg.pairwise_tukey(dv=dep_var, between=protocol_col, data=hour_df)
        pg.print_table(ph)
        ph_dict[hour] = ph
    return pd.concat(ph_dict)
# Per-marker statistics: an ANOVA plus a Tukey post-hoc across
# ``condition_col``, printed, saved to one folder per marker and collected
# into a single post-hoc table.
marker_ph_dict = {}
for marker_label, marker_df in marker_dict.items():
    print(marker_label)

    # run anova
    curr_anova_marker = pg.anova(
        dv=dep_var,
        between=condition_col,
        data=marker_df,
    )
    pg.print_table(curr_anova_marker)

    # run post-hoc
    curr_ph_marker = pg.pairwise_tukey(
        dv=dep_var,
        between=condition_col,
        data=marker_df,
    )
    pg.print_table(curr_ph_marker)
    marker_ph_dict[marker_label] = curr_ph_marker

    # save the files; makedirs(exist_ok=True) replaces the racy
    # "check exists, then mkdir" pair
    label_test_dir = marker_test_dir / marker_label
    os.makedirs(label_test_dir, exist_ok=True)
    curr_anova_marker.to_csv(label_test_dir / anova_str)
    curr_ph_marker.to_csv(label_test_dir / ph_str)

# One table for all markers; the compared pair ("A", "B") is appended to the
# index behind the levels produced by the concat.
marker_ph_df = pd.concat(marker_ph_dict)
marker_ph_df.set_index(["A", "B"], append=True, inplace=True)
between='Subject Group') tdist_ph dist_Welch.to_csv( '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/total_dist_Welch_s2.csv' ) tdist_ph.to_csv( '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/total_dist_ph_s2.csv' ) opa_anova = pg.anova(data=epm_s2, dv='Time in Zone (%) - Open Arms', between='Subject Group') opa_anova opa_anova.to_csv( '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/opa_anova_s2.csv') opa_ph = pg.pairwise_tukey(data=epm_s2, dv='Time in Zone (%) - Open Arms', between='Subject Group') opa_ph.to_csv( '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/opa_ph_s2.csv') #%% epms2_fig, epms2_ax = plt.subplots(nrows=1, ncols=3, figsize=(7, 4)) sns.boxplot(x='Subject Group', y='Entries in Zone - Center', data=epm_s2, palette=['forestgreen', 'forestgreen', 'royalblue', 'royalblue'], showmeans=True, meanprops={ 'marker': '+', 'markeredgecolor': 'k' }, ax=epms2_ax[0])
def mixed_anova(self, stat_key, verbose=True, group_tukey=True, day_tukey=True):
    '''
    Run a mixed-design ANOVA (group x day) on one summary statistic, with
    optional Tukey post-hoc tests.

    A long-format table with one row per (mouse, day) is built first: KO mice
    are coded ``ko_ctrl == 0`` and control mice ``ko_ctrl == 1``.

    :param stat_key: key handed to ``self.summary_stat_matrices``; also used
        as the name of the dependent-variable column.
    :param verbose: if True, print every result table together with its
        heading; if False nothing is printed.
    :param group_tukey: if True, run a Tukey test of KO vs control on the
        whole table.
    :param day_tukey: if True, run a Tukey test of KO vs control separately
        for each day in ``self.days``.
    :return: dict with key ``'anova'`` and, when requested,
        ``'ko_ctrl_tukey'`` and ``'day_tukey'`` (a list ordered like
        ``self.days``).
    '''
    ko_sum_stat, ctrl_sum_stat = self.summary_stat_matrices(stat_key)

    # Long-format columns; both groups are filled by the same loop.
    # NOTE(review): assumes the matrices are indexed [mouse position, day]
    # with rows in the order of self.ko_mice / self.ctrl_mice -- confirm.
    columns = {'ko_ctrl': [], 'day': [], stat_key: [], 'mouse': []}
    groups = ((0, self.ko_mice, ko_sum_stat),
              (1, self.ctrl_mice, ctrl_sum_stat))
    for group_code, mice, sum_stat in groups:
        for m, mouse in enumerate(mice):
            for day in self.days:
                columns['ko_ctrl'].append(group_code)
                columns['day'].append(day)
                columns[stat_key].append(sum_stat[m, day])
                columns['mouse'].append(mouse)
    df = pd.DataFrame(columns)

    results = {}

    # Inside a method body this name resolves to the module-level
    # ``mixed_anova`` function, not to this method.
    aov = mixed_anova(data=df, dv=stat_key, between='ko_ctrl', within='day',
                      subject='mouse')
    results['anova'] = aov
    if verbose:
        print('Mixed design ANOVA results')
        print(aov)

    if group_tukey:
        ko_ctrl_tukey = pairwise_tukey(data=df, dv=stat_key, between='ko_ctrl')
        results['ko_ctrl_tukey'] = ko_ctrl_tukey
        if verbose:
            print('PostHoc Tukey: KO vs Ctrl')
            print(ko_ctrl_tukey)

    if day_tukey:
        day_stats = []
        if verbose:
            print('PostHoc Tukey on each day')
        for day in self.days:
            day_result = pairwise_tukey(data=df[df['day'] == day],
                                        dv=stat_key, between='ko_ctrl')
            day_stats.append(day_result)
            if verbose:
                print('Day %d' % day)
                print(day_result)
        results['day_tukey'] = day_stats

    return results
#%%
### STATISTICAL TESTS ###
# One-way ANOVA for the 'top' and 'EC50' read-outs: tests the null hypothesis
# that the groups share the same population mean. The samples are treated as
# independent because each comes from a different FCGR3A haplotype and was
# tested in a single condition.
# NOTE(review): the original note referred to "subplots D and D" -- presumably
# a typo for two different panels; confirm the panel letters.
# Writing .xlsx files requires openpyxl (!pip install openpyxl).

# --- 'top' ---
ANOVA_top = anova(dv='top', between='FCGR3A', data=data_adcc)
ANOVA_top.to_excel('../stats/ANOVA_top_figure3.xlsx')

ANOVA_top_posthoc = pairwise_tukey(dv='top', between='FCGR3A', data=data_adcc)
ANOVA_top_posthoc.to_excel('../stats/ANOVA_top_posthoc_figure3.xlsx')

# --- 'EC50' ---
ANOVA_ec50 = anova(dv='EC50', between='FCGR3A', data=data_adcc)
ANOVA_ec50.to_excel('../stats/ANOVA_ec50_figure3.xlsx')

ANOVA_ec50_posthoc = pairwise_tukey(dv='EC50', between='FCGR3A', data=data_adcc)
ANOVA_ec50_posthoc.to_excel('../stats/ANOVA_ec50_posthoc_figure3.xlsx')

#%%
### FUNCTION FOR PLOTTING THE STARS ###
# The output from this command provides us with two things. First, it shows us the result of a t-test for each of the dummy variables, which basically tell us whether each of the conditions separately differs from placebo; it appears that Drug 1 does whereas Drug 2 does not. However, keep in mind that if we wanted to interpret these tests, we would need to correct the p-values to account for the fact that we have done multiple hypothesis tests; we will see an example of how to do this in the next chapter. # # Remember that the hypothesis that we started out wanting to test was whether there was any difference between any of the conditions; we refer to this as an *omnibus* hypothesis test, and it is the test that is provided by the F statistic. The F statistic basically tells us whether our model is better than a simple model that just includes an intercept. In this case we see that the F test is highly significant, consistent with our impression that there did seem to be differences between the groups (which in fact we know there were, because we created the data). # %% ols_model = ols(formula='BPsys~ group', data=df) ols_result = ols_model.fit() aov_table = sm.stats.anova_lm(ols_result) aov_table # %% import pingouin as pg pg.anova(data=df, dv='BPsys',between='group', effsize="np2") # %% pg.pairwise_tukey(data=df, dv='BPsys', between='group') # %% [markdown] # ## Learning objectives # # After reading this chapter, you should be able to: # # * Describe the rationale behind the sign test # * Describe how the t-test can be used to compare a single mean to a hypothesized value # * Compare the means for two paired or unpaired groups using a two-sample t-test # # # ## Appendix # # ### The paired t-test as a linear model #
def test_pandas(self):
    """Test the pandas DataFrame methods.

    Each DataFrame method is called on the module-level fixtures and its
    result is compared with the equivalent ``pg.<function>`` call or with
    hard-coded reference values.
    """
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))
    # Two-way ANOVA: the ss_type=1 and ss_type=2 tables must differ, and the
    # method must agree with the function for ss_type=2
    aov3_ss1 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=1)
    aov3_ss2 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=2)
    aov3_ss2_pg = pg.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                           data=df_aov3, ss_type=2)
    assert not aov3_ss1.equals(aov3_ss2)
    assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))

    # Test the ANCOVA
    aov = df_anc.ancova(dv='Scores', covar='Income',
                        between='Method').round(3)
    assert (aov.equals(
        pg.ancova(data=df_anc, dv='Scores', covar='Income',
                  between='Method').round(3)))

    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert (aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df)))

    # FDR-corrected post hocs with Hedges'g effect size
    ttests = df.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                               padjust='fdr_bh', effsize='hedges')
    assert (ttests.equals(
        pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                          padjust='fdr_bh', effsize='hedges', data=df)))

    # Pairwise Tukey
    tukey = df.pairwise_tukey(dv='Scores', between='Group')
    assert tukey.equals(
        pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert (aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df)))

    # Test pairwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.iloc[:, :5].pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                  [1, 0.392, 0.06, -0.014, -0.149])
    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

    # Test rcorr (correlation matrix with p-values)
    # We compare against Pingouin pairwise_corr function
    corrs = df_corr.rcorr(padjust='holm', decimals=4)
    corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
    assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
    assert (corrs.at['Agreeableness', 'Neuroticism'] ==
            str(corrs2.at[2, 'r']))
    corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
    assert (corrs.at['Neuroticism', 'Agreeableness'] ==
            str(corrs2.at[2, 'p-corr'].round(4)))
    corrs = df_corr.rcorr(upper='n', decimals=5)
    corrs2 = df_corr.pairwise_corr().round(5)
    assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
    assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
    # Method = spearman does not work with Python 3.5 on Travis?
    # Instead it seems to return the Pearson correlation!
    # (the two calls below are only checked for running without error)
    df_corr.rcorr(method='spearman')
    df_corr.rcorr()

    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
names = [
    "No_Vibration",
    "Symmetric_Vibration",
    "Asymmetric_Vibration",
    "Randomized_Asymmetric_Vibration",
]
df1.columns = names
df2.columns = names

import scipy.stats as stats

# One-way ANOVA over the four vibration modes: f_oneway takes one sample per
# group and returns the F statistic and its p-value.
# (Only df2 is analysed here; substitute df1 to analyse the other table.)
fvalue, pvalue = stats.f_oneway(*(df2[name] for name in names))
print(fvalue, pvalue)

# ANOVA table in the R-like layout produced by statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Long format for the formula interface: one row per observation
# (again df2; substitute df1 to analyse the other table).
d_melt = df2.reset_index().melt(id_vars=['index'], value_vars=names)
d_melt.columns = ['index', 'Vibration_Modes', 'value']

# Ordinary Least Squares (OLS) model with the mode as a categorical factor
model = ols('value ~ C(Vibration_Modes)', data=d_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

# Tukey pairwise comparisons between the modes
from pingouin import pairwise_tukey

m_comp = pairwise_tukey(dv='value', between='Vibration_Modes', data=d_melt)
print(m_comp)

# Shapiro-Wilk test on the model residuals (this rebinds ``pvalue``)
w, pvalue = stats.shapiro(model.resid)
print(w, pvalue)
print(key) # tidy data long_df = df.stack().reset_index() long_df.columns = stat_colnames part_df = long_df.query("%s == '%s'" % (time, part)) # do anova part_rm = pg.rm_anova(dv=dep_var, within=day, subject=anim, data=part_df) pg.print_table(part_rm) # do posthoc ph = pg.pairwise_tukey(dv=dep_var, between=day, data=part_df) pg.print_table(ph) ph_part_dict[key] = ph stage_test_dir = part_dir / key anova_file = stage_test_dir / "01_anova.csv" ph_file = stage_test_dir / "02_posthoc.csv" part_rm.to_csv(anova_file) ph.to_csv(ph_file) ph_part_df = pd.concat(ph_part_dict) ph_total_dict[part] = ph_part_df ph_total_df = pd.concat(ph_total_dict) ph_total_df = ph_total_df.reorder_levels([1, 0, 2])
#data_merged.to_csv(r'C:\Users\user\Desktop\FOCUS\behavioral\P_Merged_var.csv', index = None, header=True)

# --- Correct reaction time: does it differ between blocks? ---
aov_corr_rt = anova(data=data_merged, dv='corr_rt', between='blocks')
print(aov_corr_rt)

# Repeated-measures ANOVA of false alarms over blocks, per participant
rep_anov_alarm = pg.rm_anova(
    dv='false_alarm',
    within='blocks',
    subject='participant',
    detailed=True,
    data=data_merged,
)

# Follow-up pairwise comparison (Tukey) for correct reaction time
pairs_corr_rt = pairwise_tukey(data=data_merged, dv='corr_rt',
                               between='blocks')
print(pairs_corr_rt)

# --- False alarms: do they differ between blocks? ---
aov_alarms = anova(data=data_merged, dv='false_alarm', between='blocks')
print(aov_alarms)

# Follow-up pairwise comparison (Tukey) for false alarms
pairs_alarms = pairwise_tukey(data=data_merged, dv='false_alarm',
                              between='blocks')
print(pairs_alarms)
d_melt = pd.melt(data0.reset_index(), id_vars=['index'], value_vars=['No_Up', 'Single_Up', 'Double_Up']) # replace column names d_melt.columns = ['index', 'treatments', 'value'] # Ordinary Least Squares (OLS) model model = ols('value ~ C(treatments)', data=d_melt).fit() anova_table = sm.stats.anova_lm(model, typ=2) anova_table # pairwise comparision significance with HSD https://reneshbedre.github.io/blog/anova.html from pingouin import pairwise_tukey # perform multiple pairwise comparison (Tukey HSD) # for unbalanced (unequal sample size) data, pairwise_tukey uses Tukey-Kramer test print("Pairwise comparison with Tukey HSD") m_comp = pairwise_tukey(data=d_melt, dv='value', between='treatments') print(m_comp) plt.title("Regulation: no change", fontsize=16) plt.tight_layout() pdf.savefig(fig) ##### For the runs with changes ################### sigma_1 = 0.01875 sigma_1_list = sigma_1 * np.array([1 / 2, 2, 2**2, 2**3, 2**4]) mes = ["halved"] + ["times %d" % 2**i for i in (1, 2, 3, 4)] for i in range(len(sigma_1_list)): data1 = pd.read_csv("MI_10000traj_shift30_%d.csv" % i)
# Output folders for the three kinds of statistics of this marker.
count_dir = marker_test_dir / "01_count"
mean_dir = marker_test_dir / "02_mean"
hist_dir = marker_test_dir / "03_hist"
# The loop variable is no longer called ``dir`` (which shadowed the builtin
# for the rest of the scope), and makedirs(exist_ok=True) replaces the racy
# "check exists, then mkdir" pair.
for stat_dir in (count_dir, mean_dir, hist_dir):
    os.makedirs(stat_dir, exist_ok=True)

curr_count = count_data_dict[curr_label]
curr_mean = mean_data_dict[curr_label]
curr_hist = hist_data_dict[curr_label]

# --- count: ANOVA + Tukey post-hoc on the last count column ---
count = count_cols[-1]
count_anova = pg.anova(dv=count, between=condition_col, data=curr_count)
pg.print_table(count_anova)
count_ph = pg.pairwise_tukey(dv=count, between=condition_col,
                             data=curr_count)
pg.print_table(count_ph)
count_anova.to_csv(count_dir / anova_str)
count_ph.to_csv(count_dir / ph_str)
count_stats_dict[curr_label] = count_ph

# --- mean: ANOVA + Tukey post-hoc on the last mean column ---
mean = mean_cols[-1]
mean_anova = pg.anova(dv=mean, between=condition_col, data=curr_mean)
pg.print_table(mean_anova)
mean_ph = pg.pairwise_tukey(dv=mean, between=condition_col, data=curr_mean)
pg.print_table(mean_ph)
mean_anova.to_csv(mean_dir / anova_str)
mean_ph.to_csv(mean_dir / ph_str)
mean_stats_dict[curr_label] = mean_ph
(HWK_errors['Block'] == 'D1_2')].item() d2a = HWK_errors['Error mean'][(HWK_errors['Subject'] == ii) & (HWK_errors['Block'] == 'D2_1')].item() d2b = HWK_errors['Error mean'][(HWK_errors['Subject'] == ii) & (HWK_errors['Block'] == 'D2_2')].item() d3a = HWK_errors['Error mean'][(HWK_errors['Subject'] == ii) & (HWK_errors['Block'] == 'D3_1')].item() index_dict['Acquisition'].append(((d1a - d1b) + (d2a - d2b)) / 2) index_dict['Retrival'].append(((d1b - d2a) + (d2b - d3a)) / 2) kesner_index = pd.DataFrame(index_dict) kesner_index pg.anova(dv='Acquisition', between=['Genotype', 'Sex'], data=kesner_index, export_filename='kesneraov_acq') acq_tk = pg.pairwise_tukey(dv='Acquisition', between='Group', data=kesner_index) pg.anova(dv='Retrival', between=['Genotype', 'Sex'], data=kesner_index, export_filename='kesneraov_ret') acq_tk # Kesner indexes figure #%% kesner_ind = plt.figure(figsize=(9, 5)) plt.subplot(1, 2, 1) acq_ax = sns.barplot(x='Group', y='Acquisition', data=kesner_index, ci=68, capsize=.3, palette=['g', 'g', 'b', 'b']) plt.xticks(range(0, 4), ['Fem_KO', 'Male_KO', 'Fem_WT', 'Male_WT']) acq_ax.annotate('*', xy=(0.5, .93), xytext=(0.5, .91), xycoords='axes fraction', fontsize=18, ha='center', va='bottom', fontweight='bold', arrowprops=dict(arrowstyle='-[, widthB=6, lengthB=.1', lw=2, color='black')) acq_ax.annotate('**', xy=(0.25, .83), xytext=(0.25, .81), xycoords='axes fraction', fontsize=18, ha='center', va='bottom', fontweight='bold', arrowprops=dict(arrowstyle='-[, widthB=2, lengthB=.1', lw=2, color='black'))
hourly_test_dir = save_test_dir / "hour_prop"

# Two-way repeated-measures ANOVA (day x hour) on the proportion data
test_rm = pg.rm_anova2(
    data=long_df,
    dv=dep_var,
    within=[day_col, hour_col],
    subject=anim,
)
pg.print_table(test_rm)

# Post hoc: Tukey comparison between days, run separately for each hour
ph_dict = {}
for hour in hours:
    print(hour)
    hour_df = long_df.query(f"{hour_col!s} == '{hour!s}'")
    ph = pg.pairwise_tukey(data=hour_df, dv=dep_var, between=day_col)
    pg.print_table(ph)
    ph_dict[hour] = ph
hourly_ph_df = pd.concat(ph_dict)

# Save the ANOVA table and the stacked post-hoc tables
hr_anova_file = hourly_test_dir / anova_csv
hr_ps_file = hourly_test_dir / ph_csv
test_rm.to_csv(hr_anova_file)
hourly_ph_df.to_csv(hr_ps_file)

# can't do repeated measures on swe since missing values
# Flatten the index, swap the two middle columns and apply the shared
# statistics column names.
swe_test = delta_mean_masked.reset_index().iloc[:, [0, 2, 1, 3]].copy()
swe_test.columns = stat_colnames
swa_test_dir = save_test_dir / "SWA"
# Simulated samples for groups 2 and 3: group mean plus Gaussian noise
data2 = mean2 + stdev * np.random.randn(N2)
data3 = mean3 + stdev * np.random.randn(N3)

# All observations in one vector, with a matching label per observation
datacolumn = np.hstack([data1, data2, data3])
groups = [
    label
    for label, size in (('1', N1), ('2', N2), ('3', N3))
    for _ in range(size)
]

# Long-format table: one row per observation
df = pd.DataFrame({'TheData': datacolumn, 'Group': groups})
df


# In[ ]:


# One-way ANOVA across the three groups
pg.anova(dv='TheData', between='Group', data=df)


# In[ ]:


# Tukey pairwise comparisons between the groups
pg.pairwise_tukey(dv='TheData', between='Group', data=df)


# In[ ]:


df.boxplot(column='TheData', by='Group')