def rm_one_way_anova(dataset: Dataset, design, combined_data: CombinedData):
    """Fit a repeated-measures ANOVA for the single explained variable.

    Parameters
    ----------
    dataset : Dataset
        Provides the observations (``dataset.data``) and the name of the
        subject-identifier column (``dataset.pid_col_name``).
    design : mapping
        Study design. When it has a ``"within subjects"`` entry, the
        explanatory variable whose name equals that entry is used as the
        within-subject factor.
    combined_data : CombinedData
        Provides the explanatory and explained variables.

    Returns
    -------
    The object returned by ``AnovaRM(...).fit()``. Earlier versions computed
    this result and then discarded it (returning ``None``).

    Notes
    -----
    Only within-subject factors are passed to ``AnovaRM``; the original
    author noted that the ``between=`` argument is apparently not implemented
    in statsmodels, so between-subject factors are not collected here.
    """
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    outcome = ys[0]

    within_subjs = [
        x.metadata[name]
        for x in xs
        if "within subjects" in design
        and design["within subjects"] == x.metadata[name]
    ]

    # Named ``subject_col`` rather than ``id`` to avoid shadowing the builtin.
    subject_col = dataset.pid_col_name
    model = AnovaRM(data,
                    depvar=outcome.metadata[name],
                    subject=subject_col,
                    within=within_subjs)
    return model.fit()
def analyzeData(results2):
    """Print a repeated-measures ANOVA and a pairwise comparison per measure.

    For each of the ``'Accuracy'`` and ``'Reaction Time'`` columns of
    ``results2`` this prints, in order: the measure name, the fitted
    ``AnovaRM`` result (subject column ``'Subject'``, within factor
    ``'Condition'``, duplicates aggregated with ``'mean'``), and the first
    element returned by ``MultiComparison.allpairtest`` using
    ``sci.ttest_rel`` with ``method='bonf'``.

    Parameters
    ----------
    results2 : DataFrame
        Must contain the columns ``'Subject'``, ``'Condition'``,
        ``'Accuracy'`` and ``'Reaction Time'``.
    """
    for measure in ('Accuracy', 'Reaction Time'):
        print(measure)
        anova_fit = AnovaRM(data=results2,
                            depvar=measure,
                            subject='Subject',
                            within=['Condition'],
                            aggregate_func='mean').fit()
        print(anova_fit)

        pairwise = MultiComparison(results2[measure], results2['Condition'])
        post_hoc = pairwise.allpairtest(sci.ttest_rel, method='bonf')
        print(post_hoc[0])
def three_sample_test(sample1, sample2, sample3, test):
    """Compare three samples with the requested omnibus test.

    Parameters
    ----------
    sample1, sample2, sample3 : sequence of numbers
        The three samples. For ``"rm-anova"`` the i-th entries of the three
        samples are treated as belonging to the same subject.
    test : str
        One of:

        * ``"anova"`` -- parametric, between-subjects
          (``scipy.stats.f_oneway``)
        * ``"rm-anova"`` -- parametric, within-subjects (``AnovaRM``)
        * ``"kruskal-wallis"`` -- nonparametric, between-subjects
          (``scipy.stats.kruskal``)
        * ``"friedman"`` -- nonparametric, within-subjects
          (``scipy.stats.friedmanchisquare``)

    Returns
    -------
    tuple
        ``(test_stat, p_val)``.

    Raises
    ------
    ValueError
        If ``test`` is not one of the supported names, or if ``"rm-anova"``
        is requested with samples of different lengths.
    """
    if test == "anova":
        test_stat, p_val = scipy.stats.f_oneway(sample1, sample2, sample3)
    elif test == "rm-anova":
        if not (len(sample1) == len(sample2) == len(sample3)):
            raise ValueError(
                "rm-anova requires three samples of equal length, got "
                f"{len(sample1)}, {len(sample2)} and {len(sample3)}")
        # Long format: one row per (subject, group) observation, emitted in
        # subject order with groups A, B, C for each subject.
        rows = [
            {"response": value, "id": subject, "group": group}
            for subject, triple in enumerate(zip(sample1, sample2, sample3))
            for group, value in zip("ABC", triple)
        ]
        df = pd.DataFrame(rows, columns=["response", "id", "group"])
        res = AnovaRM(df, depvar="response", subject="id",
                      within=["group"]).fit()
        # The table is indexed by factor name, so take the first row by
        # position explicitly rather than with an integer key.
        test_stat = res.anova_table['F Value'].iloc[0]
        p_val = res.anova_table['Pr > F'].iloc[0]
    elif test == "kruskal-wallis":
        test_stat, p_val = scipy.stats.kruskal(sample1, sample2, sample3)
    elif test == "friedman":
        test_stat, p_val = scipy.stats.friedmanchisquare(
            sample1, sample2, sample3)
    else:
        raise ValueError(
            f"unknown test {test!r}; expected 'anova', 'rm-anova', "
            "'kruskal-wallis' or 'friedman'")
    return test_stat, p_val
def continuous_paired_group_repeated_measures_anova(**kwargs):
    """Run a repeated-measures ANOVA over one combined ``condition`` factor.

    Keyword Arguments
    -----------------
    data_frame : DataFrame
        Observations; must contain a ``test_index`` column (used as the
        subject identifier), the dependent variable column and the condition
        column(s). A ``condition`` column is added to this frame in place.
    dependable_variable : str
        Name of the dependent-variable column.
    conditions : str or list of str
        Condition column name(s). Several columns are merged into a single
        factor by joining each group's key values with ``"_"`` (so the values
        must be strings); a single column is used as-is.

    Returns
    -------
    tuple
        Always ``(True, 100)``; the fitted result is only printed.
        TODO: extract the p-value from the fitted result instead.

    Raises
    ------
    ValueError
        If ``conditions`` is an empty list.
    """
    data_frame = kwargs["data_frame"]
    dependable_variable = kwargs["dependable_variable"]
    conditions = kwargs["conditions"]

    if isinstance(conditions, list) and len(conditions) > 1:
        # Make one condition out of multiple; the original author noted that
        # a list of conditions was otherwise not supported by AnovaRM here.
        labels = pd.Series(index=data_frame.index, dtype=object)
        for key, group in data_frame.groupby(conditions):
            labels.loc[group.index] = "_".join(key)
        data_frame["condition"] = labels
    else:
        if isinstance(conditions, list):
            if not conditions:
                raise ValueError("at least one condition column is required")
            single_condition = conditions[0]
        else:
            single_condition = conditions
        # Previously this case left ``condition`` filled with uninitialised
        # ``np.empty`` values; use the actual condition column instead.
        data_frame["condition"] = data_frame[single_condition]

    # NOTE(review): only rows with test_index < 1000 are analysed; the
    # original TODO mentions reducing the subject size -- confirm the cutoff.
    subset = data_frame[data_frame["test_index"] < 1000]
    aovrm = AnovaRM(subset, dependable_variable, 'test_index',
                    within=["condition"], aggregate_func=np.mean)
    res = aovrm.fit()
    print(res)
    return True, 100
def rm(self, data, dep_var, subject, within, aggregate_func=None):
    """Run a repeated-measures ANOVA and return its ANOVA table.

    Parameters
    ----------
    data : DataFrame
        Must contain at least the dependent-variable column, the subject
        column and one factor column.
    dep_var : str
        Name of the dependent-variable column.
    subject : str
        Name of the subject-identifier column.
    within : list of str
        Names of one or more within-subject factor columns.
    aggregate_func : optional
        Forwarded unchanged to ``AnovaRM`` as ``aggregate_func``.

    Returns
    -------
    DataFrame
        The ``anova_table`` attribute of the fitted ``AnovaRM`` result.
    """
    fitted = AnovaRM(data, dep_var, subject, within,
                     aggregate_func=aggregate_func).fit()
    return fitted.anova_table
def anova(diff1, diff2, recall, within_factors):
    """Run a repeated-measures ANOVA on two sets of performance differences.

    Both inputs are passed through ``rearange(..., 'short', ...)``,
    concatenated, and analysed with ``AnovaRM`` using ``'performance'`` as the
    dependent variable, ``'sub_id'`` as the subject column and
    ``within_factors`` as the within-subject factors (duplicates aggregated
    with ``'mean'``). The fitted result is printed.

    Parameters
    ----------
    diff1, diff2
        Inputs accepted by ``rearange``.
    recall : bool
        Forwarded to ``rearange``; also selects the label (``'recall'`` or
        ``'recognition'``) used in the printed header.
    within_factors : list of str
        Within-subject factor column names.

    Returns
    -------
    tuple
        ``(diff1, diff2, diffs_for_anova)`` -- the two rearranged inputs and
        their concatenation.
    """
    r = 'recall' if recall else 'recognition'

    diff1 = rearange(diff1, 'short', within_factors=within_factors,
                     recall=recall)
    diff2 = rearange(diff2, 'short', within_factors=within_factors,
                     recall=recall)
    diffs_for_anova = pd.concat([diff1, diff2])

    # The rounded p-value that used to be computed here was never used (and
    # relied on a deprecated integer-key lookup), so it has been removed.
    res = AnovaRM(diffs_for_anova, 'performance', 'sub_id',
                  within=within_factors, aggregate_func='mean').fit()

    print(F'ANOVA ON DIFFERENCES in memory performance - {r}', res)
    return diff1, diff2, diffs_for_anova
def rm_one_way(xs, y, key, df):
    """Fit a repeated-measures ANOVA and return the result as text.

    Parameters
    ----------
    xs : list of str
        Within-subject factor column names.
    y : str
        Dependent-variable column name.
    key : str
        Subject-identifier column name.
    df : DataFrame
        Observations; duplicates are aggregated with ``'mean'``.

    Returns
    -------
    str
        ``str()`` of the fitted ``AnovaRM`` result.
    """
    fitted = AnovaRM(df, depvar=y, subject=key, within=xs,
                     aggregate_func='mean').fit()
    return str(fitted)
def rm_one_way(xs, y, key, df):
    """Fit a repeated-measures ANOVA and return the result as text.

    Parameters
    ----------
    xs : list of str
        Within-subject factor column names.
    y : str
        Dependent-variable column name.
    key : str
        Subject-identifier column name.
    df : DataFrame
        Observations; duplicates are aggregated with ``'mean'``.

    Returns
    -------
    str
        ``str()`` of the fitted ``AnovaRM`` result.
    """
    # The unused ``between_subjs`` / ``within_subjs`` lists and the leftover
    # debugger comment were removed; they had no effect on the result.
    model = AnovaRM(df, depvar=y, subject=key, within=xs,
                    aggregate_func='mean')
    return str(model.fit())
def test_repeated_measures_aggregation():
    """Aggregating duplicated data with the mean reproduces the plain fit.

    The module-level ``data`` is stacked on top of itself, so every cell holds
    two identical observations; aggregating with ``np.mean`` must give the
    same ANOVA table as fitting ``data`` directly.
    """
    df1 = AnovaRM(data, 'DV', 'id', within=['A', 'B', 'D']).fit()

    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` is the
    # supported way to stack the frames.
    double_data = pd.concat([data, data], axis=0)
    df2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'],
                  aggregate_func=np.mean).fit()

    assert_frame_equal(df1.anova_table, df2.anova_table)
def test_repeated_measures_aggregation_one_subject_duplicated():
    """Duplicating a single subject's rows must not change the ANOVA table.

    The rows of the module-level ``data`` whose ``id`` equals ``'1'`` are
    appended a second time; aggregating with ``np.mean`` must give the same
    table as fitting ``data`` directly.
    """
    df1 = AnovaRM(data, 'DV', 'id', within=['A', 'B', 'D']).fit()

    # ``DataFrame.append`` was removed in pandas 2.0; use ``pd.concat``.
    subject_one = data.loc[data['id'] == '1', :]
    duplicated = pd.concat([data, subject_one]).reset_index()
    df2 = AnovaRM(duplicated, 'DV', 'id', within=['A', 'B', 'D'],
                  aggregate_func=np.mean).fit()

    assert_frame_equal(df1.anova_table, df2.anova_table)
def test_repeated_measures_aggregation():
    """Aggregating duplicated data with the mean reproduces the plain fit.

    Fits the module-level ``data`` once as-is and once stacked on top of
    itself with ``aggregate_func=np.mean``, then checks both ANOVA tables are
    equal.
    """
    factors = ['A', 'B', 'D']
    baseline = AnovaRM(data, 'DV', 'id', within=factors).fit()

    stacked = pd.concat([data, data], axis=0)
    aggregated = AnovaRM(stacked, 'DV', 'id', within=factors,
                         aggregate_func=np.mean).fit()

    assert_frame_equal(baseline.anova_table, aggregated.anova_table)
def AnovaRM_with_post_hoc(data, dep_var, subject, within, only_significant = False):
    """Print a repeated-measures ANOVA, then run paired t-test post-hocs.

    Parameters
    ----------
    data : DataFrame
        Observations.
    dep_var : str
        Dependent-variable column name.
    subject : str
        Subject-identifier column name.
    within
        Within-subject factor(s), passed to both ``AnovaRM`` and
        ``pairwise_ttest_rel``.
    only_significant : bool, optional
        Forwarded to ``pairwise_ttest_rel``.

    Returns
    -------
    None
        The value returned by ``pairwise_ttest_rel`` is not propagated.
    """
    # Omnibus test over the within-subject factor(s).
    fitted = AnovaRM(data, dep_var, subject, within).fit()
    print(fitted)

    # Post-hoc pairwise comparisons.
    pairwise_ttest_rel(data,
                       dep_var,
                       within=within,
                       only_significant=only_significant)
def rm_one_way_anova(dataset: Dataset, predictions, design, combined_data: CombinedData):
    """Run a repeated-measures ANOVA and package the outcome as a TestResult.

    Parameters
    ----------
    dataset : Dataset
        Provides the observations (``dataset.data``) and the subject
        identifier column name (``dataset.pid_col_name``).
    predictions : sequence or falsy
        When non-empty, the first prediction is reported; if that first
        element is itself a list, its first element is used.
    design : mapping
        Study design; ``"between subjects"`` / ``"within subjects"`` entries
        are compared against each explanatory variable's name.
    combined_data : CombinedData
        Provides the explanatory/explained variables and ``alpha``.

    Returns
    -------
    TestResult
        Built from the ANOVA-table row whose label equals the name of the
        last explanatory variable iterated.

    Notes
    -----
    The row lookup and the ``x`` passed to ``TestResult`` both rely on the
    loop variable left over from iterating the explanatory variables. If no
    table row carries that name, the statistics are never bound and building
    the ``TestResult`` fails.
    """
    data = dataset.data
    xs = combined_data.get_explanatory_variables()
    ys = combined_data.get_explained_variables()
    assert (len(ys) == 1)
    y = ys[0]

    has_between = "between subjects" in design
    has_within = "within subjects" in design
    between_subjs = []
    within_subjs = []
    for x in xs:
        if has_between and design["between subjects"] == x.metadata[name]:
            between_subjs.append(x.metadata[name])
        if has_within and design["within subjects"] == x.metadata[name]:
            within_subjs.append(x.metadata[name])

    prediction = None
    if predictions:
        first = predictions[0]
        prediction = first[0] if isinstance(first, list) else first

    # Between-subject factors are collected but not passed on: the original
    # author noted ``between=`` is apparently not implemented in statsmodels.
    fitted = AnovaRM(data,
                     depvar=y.metadata[name],
                     subject=dataset.pid_col_name,
                     within=within_subjs,
                     aggregate_func='mean').fit()
    result_df = fitted.anova_table

    # ``x`` is whatever the loop above left behind (the last variable).
    col_name = x.metadata[name]
    matching = [label for label in result_df.index if label == col_name]
    if matching:
        row_data = result_df.loc[matching[-1]]
        test_statistic = row_data['F Value']
        p_val = row_data['Pr > F']
        dof = (row_data['Num DF'], row_data['Den DF'])

    return TestResult(name=rm_one_way_anova_name,
                      test_statistic=test_statistic,
                      p_value=p_val,
                      prediction=prediction,
                      dof=dof,
                      alpha=combined_data.alpha,
                      table=result_df,
                      x=x,
                      y=y)
def rm_anova(data=None, subject=None, within=None, between=None, dv=None):
    """Fit ``AnovaRM`` and return the ANOVA table as a DataFrame.

    Parameters
    ----------
    data : DataFrame
        Observations.
    subject : str
        Subject-identifier column name.
    within : list of str
        Within-subject factor column names.
    between : list of str
        Between-subject factor column names, forwarded unchanged.
    dv : str
        Dependent-variable column name (passed to ``AnovaRM`` as ``depvar``).

    Returns
    -------
    DataFrame
        The ``anova_table`` of the fitted model.
    """
    model = AnovaRM(data=data,
                    depvar=dv,
                    subject=subject,
                    within=within,
                    between=between)
    return model.fit().anova_table
def getRMAnova(dataSet, labels, verbose=False):
    """Fit a one-factor repeated-measures ANOVA over several conditions.

    Parameters
    ----------
    dataSet : sequence of sequences
        One sequence of measurements per condition. Within each condition the
        position of a measurement (0, 1, 2, ...) is used as its subject id.
    labels : sequence
        ``labels[j]`` is the condition label for ``dataSet[j]``.
    verbose : bool, optional
        When true, print the result summary.

    Returns
    -------
    The fitted ``AnovaRM`` result (dependent variable ``'rt'``, subject
    ``'id'``, within factor ``'cond'``).
    """
    # One label per measurement, condition by condition.
    per_condition_labels = [[labels[j] for _ in x]
                            for j, x in enumerate(dataSet)]
    per_condition_ids = [np.arange(len(x)) for x in dataSet]

    df = pd.DataFrame({
        'id': np.concatenate(per_condition_ids),
        'rt': np.concatenate(dataSet),
        'cond': np.concatenate(per_condition_labels),
    })

    res = AnovaRM(df, 'rt', 'id', within=['cond']).fit()
    if verbose:
        print (res.summary())
    return res
def test_repeated_measures_aggregate_func():
    """Check how ``aggregate_func`` is required, stored and applied.

    With the module-level ``data`` stacked on itself:

    * constructing ``AnovaRM`` without ``aggregate_func`` raises ValueError;
    * models built with ``np.mean`` and ``np.median`` store different
      ``aggregate_func`` attributes;
    * both nevertheless produce equal ANOVA tables.
    """
    # ``DataFrame.append`` was removed in pandas 2.0; use ``pd.concat``.
    double_data = pd.concat([data, data], axis=0)

    assert_raises(ValueError, AnovaRM, double_data, 'DV', 'id',
                  within=['A', 'B', 'D'])

    m1 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'],
                 aggregate_func=np.mean)
    m2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'],
                 aggregate_func=np.median)

    # The two stored aggregation functions must NOT compare equal.
    assert_raises(AssertionError, assert_equal,
                  m1.aggregate_func, m2.aggregate_func)
    assert_frame_equal(m1.fit().anova_table, m2.fit().anova_table)
def test_repeated_measures_aggregate_func_mean():
    """``aggregate_func='mean'`` must be stored the same way as ``np.mean``.

    Both models are built on the module-level ``data`` stacked on itself and
    their ``aggregate_func`` attributes are compared for equality.
    """
    # ``DataFrame.append`` was removed in pandas 2.0; use ``pd.concat``.
    double_data = pd.concat([data, data], axis=0)

    m1 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'],
                 aggregate_func=np.mean)
    m2 = AnovaRM(double_data, 'DV', 'id', within=['A', 'B', 'D'],
                 aggregate_func='mean')

    assert_equal(m1.aggregate_func, m2.aggregate_func)
def rank_multiple_normal_homoscedastic(data, alpha, verbose, order, effect_size, force_mode):
    """
    Analyzes data using repeated measures ANOVA and Tukey HSD.

    ``data`` is stacked into long format (columns ``id``, ``treatment``,
    ``result``), a repeated-measures ANOVA is fitted over ``treatment``, and a
    Tukey HSD comparison supplies the half-widths used for each population's
    ``ci_lower`` / ``ci_upper`` around its column mean.

    :param data: DataFrame with one column per population
    :param alpha: significance level the ANOVA p-value is compared against
    :param verbose: when true, print the ANOVA outcome and post-hoc notice
    :param order: forwarded to ``_create_result_df_skeleton``
    :param effect_size: forwarded to ``_create_result_df_skeleton``
    :param force_mode: forwarded to ``_create_result_df_skeleton``
    :return: ``_ComparisonResult`` holding the rank frame and ANOVA p-value
    """
    long_data = data.stack().reset_index().rename(columns={
        'level_0': 'id',
        'level_1': 'treatment',
        0: 'result'
    })

    anova_table = AnovaRM(long_data, 'result', 'id',
                          within=['treatment']).fit().anova_table
    pval = anova_table['Pr > F'].iat[0]

    if verbose:
        message = (
            "Fail to reject null hypothesis that there is no difference between the distributions (p=%f)"
            if pval >= alpha else
            "Rejecting null hypothesis that there is no difference between the distributions (p=%f)"
        )
        print(message % pval)
        print(
            "Using Tukey HSD post hoc test.",
            "Differences are significant if the confidence intervals of the mean values are not overlapping."
        )

    tukey_res = MultiComparison(long_data['result'],
                                long_data['treatment']).tukeyhsd()
    # The plot has to be created for the half-widths to become available;
    # it is closed right away instead of being shown.
    tukey_res.plot_simultaneous()
    plt.close()

    rankdf, effsize_method, reorder_pos = _create_result_df_skeleton(
        data, None, True, order, effect_size=effect_size,
        force_mode=force_mode)

    for population in rankdf.index:
        center = data.loc[:, population].mean()
        half_width = tukey_res.halfwidths[data.columns.get_loc(population)]
        rankdf.at[population, 'ci_lower'] = center - half_width
        rankdf.at[population, 'ci_upper'] = center + half_width

    return _ComparisonResult(rankdf, pval, None, 'anova', 'tukeyhsd',
                             effsize_method, reorder_pos)
def test_repeated_measures_aggregate_func_mean():
    """``aggregate_func='mean'`` must be stored the same way as ``np.mean``.

    Builds two models on the module-level ``data`` stacked on itself, one per
    spelling of the aggregation function, and compares their stored
    ``aggregate_func`` attributes.
    """
    stacked = pd.concat([data, data], axis=0)
    factors = ['A', 'B', 'D']

    with_callable = AnovaRM(stacked, 'DV', 'id', within=factors,
                            aggregate_func=np.mean)
    with_string = AnovaRM(stacked, 'DV', 'id', within=factors,
                          aggregate_func='mean')

    assert_equal(with_callable.aggregate_func, with_string.aggregate_func)
def anovaRM(self, depvar, subject, within=None, between=None, aggregate_func=None):
    """Repeated measures Anova using least squares regression.

    The full model regression residual sum of squares is used to compare
    with the reduced model for calculating the within-subject effect sum of
    squares.

    Currently, only fully balanced within-subject designs are supported.
    Calculation of between-subject effects and corrections for violation of
    sphericity are not yet implemented.

    Parameters
    ----------
    depvar : str
        The dependent variable in data.
    subject : str
        Specify the subject id.
    within : list[str]
        The within-subject factors.
    between : list[str]
        The between-subject factors; this is not yet implemented.
    aggregate_func : {None, 'mean', callable}
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. None (the default) will
        not perform any aggregation; 'mean' is a shortcut to numpy.mean. An
        exception will be raised if aggregation is required, but no
        aggregation function was specified.

    Returns
    -------
    AnovaResults instance
        The fitted result, which is also printed. (Previously the result was
        only printed and ``None`` was returned, contradicting this section.)

    Notes
    -----
    This implementation currently only supports fully balanced designs. If
    the data contain more than one observation per subject and cell of the
    design, these observations need to be aggregated into a single
    observation before the Anova is calculated, either manually or by
    passing an aggregation function via the aggregate_func keyword argument.
    Note that if the input data set was not balanced before performing the
    aggregation, the implied heteroscedasticity of the data is ignored.

    References
    ----------
    Rutherford, Andrew. Anova and ANCOVA: a GLM approach. John Wiley & Sons,
    2011.
    """
    model = AnovaRM(self.__data, depvar, subject, within, between,
                    aggregate_func)
    res = model.fit()
    print(res)
    return res
def run_anova(self):
    """Run a repeated-measures ANOVA on ``self.df_long`` and print it.

    The model uses ``"Minutes"`` as the dependent variable, ``"ID"`` as the
    subject column and ``"Group"`` as the within-subject factor. When the
    p-value in the first row of the ANOVA table is at most 0.05, a Tukey HSD
    comparison of ``"Minutes"`` by ``"Group"`` is run and printed as well.

    Sets
    ----
    self.aov
        The ``AnovaRM`` model.
    self.aov_results
        The fitted result.
    self.tukey
        The Tukey HSD result, or the string ``"n.s."`` when the post-hoc
        test was not run.
    """
    self.aov = AnovaRM(self.df_long, depvar="Minutes", subject="ID", within=["Group"])
    self.aov_results = self.aov.fit()

    print("\n" + "======================================== MAIN EFFECTS ========================================")
    print("\n", self.aov_results.anova_table)

    self.tukey = "n.s."

    # The table is indexed by factor name, so the first row is taken by
    # position explicitly; an integer key on such a Series is deprecated.
    main_effect_p = self.aov_results.anova_table["Pr > F"].iloc[0]
    if main_effect_p <= 0.05:
        print("")
        tukey_data = MultiComparison(self.df_long["Minutes"], self.df_long["Group"])
        self.tukey = tukey_data.tukeyhsd(alpha=0.05)

        print("============================================ POST HOC ===========================================")
        print("\n", self.tukey.summary())
def test_repeated_measures_aggregate_compare_with_ezANOVA():
    """Aggregated three-factor fit must match the reference ezANOVA table.

    The expected values should reproduce those from R's `ezANOVA`
    (library ez). The module-level ``data`` is stacked on itself and
    aggregated with ``np.mean`` before fitting.
    """
    column_order = ['F Value', 'Num DF', 'Den DF', 'Pr > F']
    effects = pd.Index(['A', 'B', 'D', 'A:B', 'A:D', 'B:D', 'A:B:D'])
    reference = {
        'F Value': [
            8.7650709, 8.4985785, 20.5076546, 0.8457797, 21.7593382,
            6.2416695, 5.4253359
        ],
        'Num DF': [1, 2, 1, 2, 1, 2, 2],
        'Den DF': [7, 14, 7, 14, 7, 14, 14],
        'Pr > F': [
            0.021087505, 0.003833921, 0.002704428, 0.450021759,
            0.002301792, 0.011536846, 0.018010647
        ]
    }
    ez = pd.DataFrame(reference, index=effects)[column_order]

    stacked = pd.concat([data, data], axis=0)
    fitted = AnovaRM(stacked, 'DV', 'id', within=['A', 'B', 'D'],
                     aggregate_func=np.mean).fit()

    assert_frame_equal(ez, fitted.anova_table, check_dtype=False)
def fit(self, data, depvar, subject, within=None, between=None, aggregate_func=None):
    """Estimate the model and compute ANOVA table.

    Parameters
    ----------
    data : DataFrame
    depvar : str
        The dependent variable in `data`
    subject : str
        Specify the subject id
    within : list[str]
        The within-subject factors
    between : list[str]
        The between-subject factors, this is not yet implemented
    aggregate_func : {None, 'mean', callable}
        If the data set contains more than a single observation per subject
        and cell of the specified model, this function will be used to
        aggregate the data before running the Anova. `None` (the default)
        will not perform any aggregation; 'mean' is a shortcut to
        `numpy.mean`. An exception will be raised if aggregation is
        required, but no aggregation function was specified.

    Returns
    -------
    results : AnovaResults instance
        The fitted result, also stored on ``self._results``. (Previously the
        result was only stored and ``None`` was returned, contradicting this
        section.)

    Raises
    ------
    ValueError
        If the data need to be aggregated, but `aggregate_func` was not
        specified.
    """
    anova = AnovaRM(data=data,
                    depvar=depvar,
                    subject=subject,
                    within=within,
                    between=between,
                    aggregate_func=aggregate_func)
    self._results = anova.fit()
    return self._results
def calculate_anova(df):
    """Compute one repeated-measures ANOVA p-value per element of ``df``.

    Each element is passed through ``calculate_avg_across_models``, extended
    with a subject-number column (1..9), turned into a frame with columns
    ``bert``, ``baseline``, ``opennmt`` and ``subject``, melted to long
    format and fitted with ``AnovaRM``.

    Parameters
    ----------
    df : iterable
        Items accepted by ``calculate_avg_across_models``.
        Presumably each averaged item has shape (9, 3) -- TODO confirm.

    Returns
    -------
    list
        The ``"Pr > F"`` entry of the ``"subject"`` row for every item.
    """
    num_subjs = 9
    subject_column = np.arange(1, num_subjs + 1).reshape(num_subjs, 1)

    pvals = []
    for vox in tqdm(df):
        averaged = calculate_avg_across_models(vox)
        wide = pd.DataFrame(
            np.append(averaged, subject_column, 1),
            columns=['bert', 'baseline', 'opennmt', 'subject'])
        sub_vox = wide.melt(id_vars=["subject"],
                            var_name="model",
                            value_name="corr")

        # NOTE(review): "model" is passed as the subject identifier and
        # "subject" as the within factor -- these look swapped; confirm that
        # this is the intended design before relying on the p-values.
        fitted = AnovaRM(sub_vox, "corr", "model", within=["subject"]).fit()
        pvals.append(fitted.summary().tables[0]["Pr > F"]["subject"])
    return pvals
def test_single_factor_repeated_measures_anova():
    """
    Testing single factor repeated measures anova

    Fits factor ``B`` on the first 16 rows of the module-level ``data`` and
    compares the table (columns taken in positional order 1, 2, 0, 3) with
    reference values that reproduce R's `ezANOVA` function from library ez.
    """
    expected = [[1, 7, 22.4, 0.002125452]]

    fitted = AnovaRM(data.iloc[:16, :], 'DV', 'id', within=['B']).fit()
    observed = fitted.anova_table.iloc[:, [1, 2, 0, 3]].values

    assert_array_almost_equal(observed, expected, decimal=5)
def anova(data):
    """Run a one-factor repeated-measures ANOVA on wide-format performance.

    Parameters
    ----------
    data : DataFrame
        Wide format: a ``'sub_id'`` column plus one column per condition.
        It is melted to long format with columns ``'sub_id'``, ``'cond'``
        and ``'performance'`` before fitting.

    Returns
    -------
    list
        ``[[F, p]]`` -- the F value and p-value from the first row of the
        ANOVA table, each rounded to two decimals. The pair is also printed.
    """
    long_data = pd.melt(data,
                        id_vars='sub_id',
                        var_name='cond',
                        value_name='performance')

    table = AnovaRM(long_data, 'performance', 'sub_id',
                    within=['cond']).fit().anova_table

    # The table is indexed by factor name, so take the first row by position
    # explicitly; an integer key on such a Series is deprecated in pandas.
    p = round(table['Pr > F'].iloc[0], 2)
    F = round(table['F Value'].iloc[0], 2)

    print(F, p)
    return [[F, p]]
def test_two_factors_repeated_measures_anova():
    """
    Testing two factors repeated measures anova

    Fits factors ``A`` and ``B`` on the first 48 rows of the module-level
    ``data`` and compares the table (columns taken in positional order
    1, 2, 0, 3) with reference values that reproduce R's `ezANOVA` function
    from library ez.
    """
    expected = [
        [1, 7, 40.14159, 3.905263e-04],
        [2, 14, 29.21739, 1.007549e-05],
        [2, 14, 17.10545, 1.741322e-04],
    ]

    fitted = AnovaRM(data.iloc[:48, :], 'DV', 'id', within=['A', 'B']).fit()
    observed = fitted.anova_table.iloc[:, [1, 2, 0, 3]].values

    assert_array_almost_equal(observed, expected, decimal=5)
def test_three_factors_repeated_measures_anova():
    """
    Testing three factors repeated measures anova

    Fits factors ``A``, ``B`` and ``D`` on the module-level ``data`` and
    compares the table (columns taken in positional order 1, 2, 0, 3) with
    reference values that reproduce R's `ezANOVA` function from library ez.
    """
    expected = [
        [1, 7, 8.7650709, 0.021087505],
        [2, 14, 8.4985785, 0.003833921],
        [1, 7, 20.5076546, 0.002704428],
        [2, 14, 0.8457797, 0.450021759],
        [1, 7, 21.7593382, 0.002301792],
        [2, 14, 6.2416695, 0.011536846],
        [2, 14, 5.4253359, 0.018010647],
    ]

    fitted = AnovaRM(data, 'DV', 'id', within=['A', 'B', 'D']).fit()
    observed = fitted.anova_table.iloc[:, [1, 2, 0, 3]].values

    assert_array_almost_equal(observed, expected, decimal=5)
def calculate_anova(args, all_corrs):
    """Compute a repeated-measures ANOVA p-value for every 3-D position.

    Parameters
    ----------
    args
        Not used by this function.
    all_corrs
        Indexed as ``all_corrs[subject][layer][i][j][k]`` for 9 subjects and
        12 layers; ``all_corrs[0][0].shape`` supplies the three dimensions
        that are iterated.

    Returns
    -------
    numpy.ndarray
        Array of shape ``all_corrs[0][0].shape[:3]`` holding, per position,
        the ``"Pr > F"`` entry of the ``"subject:layer"`` row.
    """
    num_layers = 12
    num_subjs = 9

    dims = all_corrs[0][0].shape
    pvals = np.zeros((dims[0], dims[1], dims[2]))

    print("LEN: " + str(len(all_corrs)))
    print("DIMS: " + str(all_corrs[0][0].shape))

    # Subject/layer labels follow the subject-major order in which the
    # values are gathered below; they are the same for every position.
    subject_ids = np.repeat(list(range(1, num_subjs + 1)), num_layers)
    layer_ids = np.tile(list(range(1, num_layers + 1)), num_subjs)

    for i in tqdm(range(dims[0])):
        for j in range(dims[1]):
            for k in range(dims[2]):
                corrs = [
                    all_corrs[subj][layer][i][j][k]
                    for subj in range(num_subjs)
                    for layer in range(num_layers)
                ]
                df = pd.DataFrame({
                    'voxel': np.ones(len(corrs)),
                    'corr': corrs,
                    'subject': subject_ids,
                    'layer': layer_ids
                })
                # NOTE(review): the constant 'voxel' column is passed as the
                # dependent variable and 'corr' as the subject identifier --
                # this looks unintended; confirm the intended model.
                fitted = AnovaRM(df, 'voxel', 'corr',
                                 within=['subject', 'layer']).fit()
                pvals[i][j][k] = (
                    fitted.summary().tables[0]["Pr > F"]["subject:layer"])
    return pvals
def anova_group(means, recog):
    """Run a one-factor repeated-measures ANOVA on wide-format group means.

    Parameters
    ----------
    means : DataFrame
        Wide format: a ``'sub_id'`` column plus one column per condition.
        It is melted to long format with columns ``'sub_id'``, ``'cond'``
        and ``'performance'`` before fitting.
    recog : bool
        Selects the label used in the printed header: ``'recog'`` when true,
        ``'recall'`` otherwise.

    Returns
    -------
    The p-value from the first row of the ANOVA table, rounded to four
    decimals. The fitted result is also printed.
    """
    t = 'recog' if recog else 'recall'

    # Melt to long format: one row per (subject, condition).
    means = pd.melt(means,
                    id_vars='sub_id',
                    var_name='cond',
                    value_name='performance')

    res = AnovaRM(means, 'performance', 'sub_id', within=['cond']).fit()

    # The table is indexed by factor name, so take the first row by position
    # explicitly; an integer key on such a Series is deprecated in pandas.
    p = round(res.anova_table['Pr > F'].iloc[0], 4)

    print(F'reaction times anova ({t})', res)
    return p
def rlrlRMANOVA(mes):
    """Print repeated-measures ANOVAs for error, rise time and energy.

    ``mes`` is expanded with ``expandEvals``; for each of the ``'error'``,
    ``'rise_time'`` and ``'energy'`` columns a banner is printed, an
    ``'s_id'`` column is (re)assigned as the frame's index values plus one,
    and the fitted ``AnovaRM`` result (subject ``'s_id'``, within factor
    ``'model'``) is printed.

    Parameters
    ----------
    mes
        Input accepted by ``expandEvals``.
    """
    # RL-RL ANOVA RM
    aexps = expandEvals(mes)

    for measure in ('error', 'rise_time', 'energy'):
        # NOTE(review): the banner reads "Error" for all three measures,
        # exactly as before -- confirm whether it should name the measure.
        print('********** RL Controller Error RMANOVA **********')
        aexps['s_id'] = (np.array(aexps.index.values.tolist()) + 1).tolist()
        rma = AnovaRM(aexps, measure, 's_id', within=['model']).fit()
        print(rma)