def friedman_test():
    df_acc = pd.read_csv(csv_file_name_acuracia)
    df_sim = pd.read_csv(csv_file_name_similaridade)
    # rename the dataframes so they can be concatenated
    df_acc = df_acc[['acc_min', 'classifier_type', 'dataset', 'acc_mean']].rename(
        columns={'acc_min': 'parametro', 'acc_mean': 'measurement'})
    df_sim = df_sim[['qtde_classifiers', 'classifier_type', 'dataset', 'acc_mean']].rename(
        columns={'qtde_classifiers': 'parametro', 'acc_mean': 'measurement'})
    df_acc['metodo'] = 'acuracia'
    df_sim['metodo'] = 'similaridade'
    # concatenate the dataframes
    df_all = pd.concat([df_acc, df_sim])
    # merge metodo, classifier_type and parametro into a single column
    df_all = df_all.astype({'parametro': str})
    df_all = df_all.set_index(keys=['metodo', 'classifier_type', 'parametro'])
    df_all.index = df_all.index.map('-'.join)
    df_all.reset_index(inplace=True)
    # each algorithm becomes a column, each row corresponds to a dataset,
    # and the values are the accuracies
    df_all_pivoted = df_all.pivot(index='dataset', columns='index', values='measurement')
    # Friedman test
    statistic, pvalue = friedmanchisquare(*df_all_pivoted.values.tolist())
    print(f'p-value={pvalue}')
    # Nemenyi post-hoc test
    nemenyi = posthoc_nemenyi_friedman(df_all, melted=True, group_col='index',
                                       block_col='dataset', y_col='measurement')
    nemenyi.to_csv('resultado_nemenyi.csv')
def calcstats(file, name, type):
    dat = pd.read_excel(file)
    df1 = dat[["Anger", "Fear", "Joy", "Sadness"]]
    friedman = st.friedmanchisquare(*df1.values)
    print(friedman)
    p_values = hocs.posthoc_nemenyi_friedman(df1.T)
    p_values.to_excel(r"Final1\results_" + type + "_" + name + "_p_values.xlsx")
    ranks = pd.DataFrame(columns=df1.keys())
    for key in df1.keys():
        ranks[key] = df1[key].rank(ascending=False)
    df1["Totals"] = df1.sum(axis=1) / 4
    df1["Ranks"] = ranks.mean(axis=1)
    df1 = df1.sort_values(by=["Ranks"])
    df1 = df1.reset_index()
    R_1 = df1["Ranks"].iloc[0]
    df1["P-values"] = df1.apply(
        lambda row: min(test_pairs(4, len(df1.index), R_1, row["Ranks"]) * (row.name + 1), 1),
        axis=1)
    for index, row in df1.iterrows():
        if index != len(df1.index) - 1:
            df1["P-values"].iloc[index] = max(
                df1["P-values"].iloc[index + 1:].max(),
                df1["P-values"].iloc[index])
    df1 = df1.sort_values(by=["index"])
    df1 = df1.reset_index(drop=True)
    df1.to_excel(r"Final1\results_" + type + "_" + name + ".xlsx")
def friedman_posthoc_tests(experiment_pivot_df):
    """Returns p-value tables for various Friedman post-hoc tests.

    Results should be considered only if the Friedman test rejects the null
    hypothesis.
    """
    posthoc_tests = {}
    posthoc_tests['conover'] = sp.posthoc_conover_friedman(experiment_pivot_df)
    posthoc_tests['nemenyi'] = sp.posthoc_nemenyi_friedman(experiment_pivot_df)
    return posthoc_tests
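# A minimal usage sketch for the helper above, with made-up data: scikit-posthocs
# expects the wide ("pivot") layout here, i.e. rows are blocks (benchmarks/datasets)
# and columns are the compared groups (algorithms). The column/index names below
# are hypothetical.
import pandas as pd
import scikit_posthocs as sp

experiment_pivot_df = pd.DataFrame(
    {'algo_a': [0.81, 0.43, 0.79, 0.66],
     'algo_b': [0.92, 0.55, 0.89, 0.71],
     'algo_c': [0.80, 0.47, 0.80, 0.64]},
    index=['bench1', 'bench2', 'bench3', 'bench4'])

tables = friedman_posthoc_tests(experiment_pivot_df)
print(tables['nemenyi'])  # symmetric DataFrame of pairwise p-values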
def do_statistical_test():
    """ do the friedman and post-hoc tests that include the meta-learning results """
    df_results = pd.read_csv('meta_learning/average_results.csv')
    model_names = df_results.columns[1:]
    t_stat, p_val = friedmanchisquare(*[df_results[i] for i in model_names])
    print('\nfriedman test p-val = %s' % p_val)
    post_hoc_p_vals = posthoc_nemenyi_friedman(df_results.drop(columns='dataset').to_numpy())
    post_hoc_p_vals.columns = model_names
    print('\npost hoc p-vals:\n%s' % post_hoc_p_vals)
    post_hoc_p_vals.to_csv('meta_learning/post_hoc.csv', index=False)
def do_nemenyi_test(ranked_data, plot=False):
    ranks_per_dataset = ranked_data.iloc[:, 1:]
    if plot:
        names = list(ranked_data.columns)[1:]
        avg_ranks = ranks_per_dataset.mean(axis=0)
        cd = Orange.evaluation.compute_CD(
            avg_ranks, ranked_data.shape[0], alpha='0.05', test='nemenyi')
        Orange.evaluation.graph_ranks(avg_ranks, names, cd=cd, width=10, textspace=1.5)
        plt.show()
    return posthoc_nemenyi_friedman(ranks_per_dataset)
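# A small usage sketch (hypothetical data) for do_nemenyi_test: the first column
# identifies the dataset, the remaining columns hold each model's rank on that
# dataset. plot=False avoids the Orange/matplotlib dependency.
import pandas as pd
from scikit_posthocs import posthoc_nemenyi_friedman

ranked_data = pd.DataFrame({
    'dataset': ['iris', 'wine', 'glass', 'ecoli'],
    'model_a': [1, 2, 1, 1],
    'model_b': [2, 1, 3, 2],
    'model_c': [3, 3, 2, 3],
})
print(do_nemenyi_test(ranked_data, plot=False))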
def runFriedmanPython_array(data):
    import scipy.stats as ss
    import scikit_posthocs as sp
    p_statistic, p_value = ss.friedmanchisquare(*data.T)
    # https://scikit-posthocs.readthedocs.io/en/latest/generated/scikit_posthocs.posthoc_nemenyi_friedman/#id2
    # P. Nemenyi (1963) Distribution-free Multiple Comparisons. Ph.D. thesis, Princeton University.
    pc = sp.posthoc_nemenyi_friedman(data)
    return FriedmanResult("", p_value, None, cmp_matrix=pc,
                          binary_cmp_matrix=False, cmp_method="nemenyi")
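# Shape-convention sketch with made-up numbers: rows are blocks (e.g. datasets),
# columns are the compared algorithms, so `*data.T` hands friedmanchisquare one
# 1-D sample per algorithm while the untransposed array goes straight to
# posthoc_nemenyi_friedman (as in the wrapper above).
import numpy as np
import scipy.stats as ss
import scikit_posthocs as sp

data = np.array([[0.81, 0.92, 0.80],
                 [0.43, 0.55, 0.47],
                 [0.79, 0.89, 0.80],
                 [0.66, 0.71, 0.64]])
print(ss.friedmanchisquare(*data.T))
print(sp.posthoc_nemenyi_friedman(data))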
def benchmark_average(benchmark, posthocs=False):
    sums = None
    for prev_line in benchmark:
        line = prev_line[1:]
        if not isinstance(line[0], str):
            if sums is None:
                sums = [[value] for value in line]
            else:
                for values, value in zip(sums, line):
                    values.append(value)
        yield prev_line
    if sums is not None:
        yield ["Average"] + [sum(values) / len(values) for values in sums]
    if sums is not None and posthocs:
        import scikit_posthocs as ph
        print(ph.posthoc_nemenyi_friedman(np.array(sums).T))
def tabulate(self, ensemble=False):
    offset = 2 if ensemble else 1
    column_name = ('Non-parametric (Friedman, Nemenyi) (New)' if ensemble
                   else 'Non-parametric (Friedman, Nemenyi)')
    self.weights = []
    for query_ in self.data:
        if ensemble:
            # Appending Ensemble model as new row to each query
            results = [
                sum(a * b for a, b in zip(self.ensemble, c))
                for c in np.array(self.data[query_].iloc[
                    1:(len(self.methods) + 1), 1:-2].values.tolist()).T
            ]
            new_row = pd.Series(
                dict(zip(self.data[query_].columns,
                         ['Ensemble-model'] + results + [query_, '0'])))
            self.data[query_] = self.data[query_].append(new_row, ignore_index=True)
        f_data = self.data[query_].iloc[1:(len(self.methods) + offset),
                                        1:-1 * offset].values.tolist()
        p = friedmanchisquare(*f_data)[1]
        ph_data = scikit_posthocs.posthoc_nemenyi_friedman(np.array(f_data).T)
        # ph_min = [math.sqrt(-1*np.prod(ph_data[i])) for i in range(len(ph_data[0]))]
        ph_min = [max(ph_data[i]) for i in range(len(ph_data[0]))]
        weights = [n / sum(ph_min) for n in ph_min]
        self.data[query_][column_name] = ['p=%f' % p] + weights
        self.table = self.table.append(self.data[query_])
        self.weights.append(weights)
    if ensemble:
        self.table = self.table[
            ['Queries', 'Model'] +
            ['Article{}/Ranking'.format(i + 1) for i in range(self.k)] +
            ['Non-parametric (Friedman, Nemenyi)'] +
            ['Non-parametric (Friedman, Nemenyi) (New)']].set_index(
                'Queries', append=True).swaplevel(0, 1)
    else:
        self.table = self.table[
            ['Queries', 'Model'] +
            ['Article{}/Ranking'.format(i + 1) for i in range(self.k)] +
            ['Non-parametric (Friedman, Nemenyi)']].set_index(
                'Queries', append=True).swaplevel(0, 1)
    self.ensembles = np.asarray(self.weights).T.tolist()
    return self.print_results()
def friedman_test_acuracia():
    df = pd.read_csv(csv_file_name_acuracia)
    df = df[['acc_min', 'classifier_type', 'dataset', 'acc_mean']]
    # merge classifier_type and the parameter into a single column
    df = df.astype({'acc_min': str})
    df = df.set_index(keys=['classifier_type', 'acc_min'])
    df.index = df.index.map('_'.join)
    df.reset_index(inplace=True)
    # Friedman test
    df_pivoted = df.pivot(index='dataset', columns='index', values='acc_mean')
    statistic, pvalue = friedmanchisquare(*df_pivoted.values.tolist())
    print(f'p-value={pvalue}')
    # Nemenyi post-hoc test
    nemenyi = posthoc_nemenyi_friedman(df, melted=True, group_col='index',
                                       block_col='dataset', y_col='acc_mean')
    nemenyi.to_csv('resultado_nemenyi_acuracia.csv')
def SignificancePlot(self, methods=None, metric='MAE'):
    # -- Method(s)
    if methods is None:
        methods = self.methods
    else:
        if not set(methods) <= set(self.methods):
            raise ValueError("Some method is wrong!")
        else:
            self.methods = methods

    # -- set metric
    self.metric = metric
    self.mag = self.metricSort[metric]

    # -- get data from dataset(s)
    if self.multidataset:
        Y = self.__getData()
    else:
        Y = self.__getDataMono()

    # -- Significance plot, a heatmap of p values
    methodNames = [x.upper() for x in self.methods]
    Ypd = pd.DataFrame(Y, columns=methodNames)
    ph = sp.posthoc_nemenyi_friedman(Ypd)
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    heatmap_args = {
        'cmap': cmap,
        'linewidths': 0.25,
        'linecolor': '0.5',
        'clip_on': False,
        'square': True,
        'cbar_ax_bbox': [0.85, 0.35, 0.04, 0.3]
    }

    plt.figure(figsize=(5, 4))
    sp.sign_plot(ph, cbar=True, **heatmap_args)
    plt.title('p-vals')
    fname = 'SP_' + self.metric + '.pdf'
    plt.savefig(fname)
    plt.show()
def allStats(fs):
    fsa = np.array(fs)
    print(len(fsa))
    # print(fsa.T)
    res = {
        'm_std': [{"mean": np.mean(f), 'std1': np.std(f)} for f in fsa]
        # 'friedman': stats.friedmanchisquare(*fsa),
        # 'nemenyi': sp.posthoc_nemenyi_friedman(fsa.T)
    }
    if len(fsa) == 2:
        res["tvalue"] = stats.ttest_ind(fsa[0], fsa[1], equal_var=False)
        res["uvalue"] = stats.mannwhitneyu(fsa[0], fsa[1], alternative='greater')
    elif len(fsa) > 2:
        res["friedman"] = stats.friedmanchisquare(*fsa)
        res["nemenyi"] = sp.posthoc_nemenyi_friedman(fsa.T)
    return res
# stats analysis of reaction times for recall data
rt_recall = rt_main.rt(df, sess1, ans_imm, ans_del, numRo1, worperro, conds, ids,
                       control, recog=False)
# analysis of reaction times for recog data
rt_recog = rt_main.rt(df, sess1, ans_imm, ans_del, numRo1, worperro, conds, ids,
                      control, recog=True)
friedman(diffs_mean)
nemeny_p = (sp.posthoc_nemenyi_friedman(rt_recog.iloc[:, [0, 1, 2]])).round(3)

# ------------------------------SESSION 2--------------------------------------
print('------------SESSION2----------')
# LONGTERM
# import data from sess2
df2 = all_import_data.import_df2(sess2.datapath, control)
# get long-term results
_, lt1, lt2, ids2_ordered, lt_res_mean, lt_res_std = lt_main.longterm(
    df2, sess1, sols, conds, worperro)
print(friedmanchisquare(split_data[curr_data]['DLIS(False)'],
                        split_data[curr_data]['JW-OS'],
                        split_data[curr_data]['DLCS'],
                        split_data[curr_data]['Dummy'],
                        split_data[curr_data]['RandomFalse'],
                        split_data[curr_data]['Random'],
                        split_data[curr_data]['DLIS(True)']))

# pvalue matrix for pairwise test
# Indicates that for all metrics Dummy and DLIS(True) are not
# significantly different. Neither are the other solvers between
# them.
print('\nPairwise comparison:')
print('\tDLIS(F)\tJW-OS\tDLCS\tDummy\tRandomF\tRandom\tDLIS(T)')
print(posthoc_nemenyi_friedman(split_data[curr_data].values))

# Test whether there is a significant difference between Dummy and DLIS(True).
# todo: this part is flawed
'''for curr_data in split_data:
    if False:
        split_data[curr_data].boxplot(column=['Dummy', 'DLIS(True)'])
        plt.title(curr_data)
        plt.ylim(0, 40)
        plt.show()
    print('\nWilcox test for Dummy against DLIS(True) for ' + curr_data + ' counts: ')
    print('means: randomfalse: ' + str(split_data[curr_data]['RandomFalse'].mean()) +
          '\t dlis(true): ' + str(split_data[curr_data]['DLIS(True)'].mean()))
    one = np.array(split_data[curr_data]['RandomFalse']).flatten()
    two = np.array(split_data[curr_data]['DLIS(True)']).squeeze()
    print(wilcoxon(np.array(split_data[curr_data]['RandomFalse']),
                   np.array(split_data[curr_data]['DLIS(True)'])))'''
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 28 23:54:01 2018

@author: Delgado
"""
import scikit_posthocs as sp
import pandas as pd
import numpy as np

x = np.array([[79.52, 92.06, 79.59],
              [43.38, 54.54, 46.82],
              [79.43, 88.60, 79.57]])
sol = sp.posthoc_nemenyi_friedman(x)
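# Optional follow-up sketch (the column names below are hypothetical): run the
# omnibus Friedman test first, then label the Nemenyi p-value matrix for
# readability by wrapping the same array in a DataFrame.
from scipy.stats import friedmanchisquare

x_df = pd.DataFrame(x, columns=['method_a', 'method_b', 'method_c'])
stat, p = friedmanchisquare(*[x_df[c] for c in x_df.columns])
print(f'Friedman p-value: {p:.4f}')
print(sp.posthoc_nemenyi_friedman(x_df))  # rows/columns carry the method names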
from nonparametric_tests import friedman_aligned_ranks_test as ft
import Orange

data_MAE_df = pd.DataFrame(data_MAE, columns=all_methods)
print('\nFriedman Test MAE:')
# print(ss.friedmanchisquare(*data_MAE.T))
# print(' ')
t, p, ranks_mae, piv_mae = ft(data_MAE[:, 0], data_MAE[:, 1], data_MAE[:, 2],
                              data_MAE[:, 3], data_MAE[:, 4], data_MAE[:, 5],
                              data_MAE[:, 6], data_MAE[:, 7])
avranksMAE = list(np.divide(ranks_mae, n_datasets))
print('statistic: ' + str(t))
print('pvalue: ' + str(p))
print(' ')

pc = sp.posthoc_nemenyi_friedman(data_MAE_df)
cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
heatmap_args = {
    'cmap': cmap,
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
}
plt.figure()
sp.sign_plot(pc, **heatmap_args)
plt.title('Nemenyi Test MAE')

data_CC_df = pd.DataFrame(data_CC, columns=all_methods)
def main():
    PART_NUMBER = 0
    dataset_paths = [
        os.path.join(CLASS_DBS_PATH, dataset_name)
        for dataset_name in sorted(os.listdir(CLASS_DBS_PATH))
    ]
    # [("db_name", read_cvs)]
    raw_dbs = [(os.path.basename(dataset_path), pd.read_csv(dataset_path))
               for dataset_path in dataset_paths]
    # [("db_name", X, y)]
    raw_dbs = [(raw_db[0],
                raw_db[1].loc[:, raw_db[1].columns != raw_db[1].columns[-1]],
                raw_db[1].loc[:, raw_db[1].columns[-1]])
               for raw_db in raw_dbs]
    raw_dbs = sorted(raw_dbs, key=lambda x: len(x[1]))  # sort by db length
    if len(sys.argv) > 1:  # For distributed training of multiple dbs over multiple servers
        num_parts = int(sys.argv[1])
        curr_part = int(sys.argv[2])
        assert curr_part <= num_parts
        assert curr_part >= 1
        PART_NUMBER = curr_part
        print("working on dbs %s" %
              str(list(range(curr_part - 1, len(raw_dbs), num_parts))))
        raw_dbs = [
            raw_dbs[i] for i in range(curr_part - 1, len(raw_dbs), num_parts)
        ]
    preprocessing = DelayedColumnTransformer([
        (np.object, [SimpleImputer(strategy='constant'),
                     OneHotEncoder(handle_unknown='ignore')]),
        (np.number, [SimpleImputer(strategy='mean'), VarianceThreshold(0.0)])
    ])
    eval_metric = balanced_accuracy_score
    kf = StratifiedKFold(n_splits=EVAL_FOLDS, random_state=RANDOM_SEED)
    model = Pipeline(steps=[('model', RBoost() if USE_RBOOST else ELPBoost())])
    comp_model = Pipeline(steps=[('model', lgb.LGBMClassifier())])
    ova_model = OneVsRestClassifier(model)
    ova_comp_model = OneVsRestClassifier(comp_model)
    # {db_name: {our_model: reulsts, compare_model: results}}
    dbs_results = {}
    with open(os.path.join(WORKING_DIR, "bad-dbs.txt"), "w") as f:
        pass
    os.system('mkdir -p {}'.format(MODELS_DIR))
    for db_name, X, y in raw_dbs:
        dbs_results[db_name] = {}
        X, y = db_encode(db_name, X, y)
        N = len(X) * (1 - (1 / EVAL_FOLDS))
        # Our Model Hyper-Params
        model_params = {
            'estimator__model__kappa': [1 / 3, 1 / N, 2 / N, 3 / N],
            'estimator__model__T': [3, 5, 10],
            'estimator__model__reg': [1, 10, 20, 50, 100],
            'estimator__model__silent': [True],
            'estimator__model__verbose': [False]
        }
        fold_num = 1
        # list of results per fold
        folds_results = []
        comp_folds_results = []
        is_binary = len(y.unique()) == 2  # No special case for binary
        try:
            for train_index, test_index in kf.split(X, y):
                print("{}:{}:Fold_{}".format(datetime.now(), db_name, fold_num))
                # --- get fold and preprocess --- #
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]
                invalid_labels = set(y_test.unique()) - set(y_train.unique())
                # Introduce new labels, should occur only for outliers due to StratifiedKFold
                # Main assumption in classification is that all labels are known upfront
                if len(invalid_labels) > 0:
                    X_train = pd.concat([
                        X_train,
                        pd.DataFrame(
                            [[np.nan for _ in range(len(X_train.columns))]
                             for _ in range(len(invalid_labels))],
                            columns=X_train.columns)
                    ], ignore_index=True)
                    y_train = y_train.append(pd.Series(list(invalid_labels)),
                                             ignore_index=True)
                    X_train, y_train = db_encode(db_name, X_train, y_train)
                preprocessing.fit(X_train, y_train)
                X_train = preprocessing.transform(X_train)
                X_test = preprocessing.transform(X_test)
                # --- random search --- #
                cv = RandomizedSearchCV(estimator=ova_model,
                                        param_distributions=model_params,
                                        scoring=make_scorer(eval_metric),
                                        cv=HPT_FOLDS,
                                        n_iter=RANDOM_CV_ITER,
                                        random_state=RANDOM_SEED)
                comp_cv = RandomizedSearchCV(estimator=ova_comp_model,
                                             param_distributions=comp_model_params,
                                             scoring=make_scorer(eval_metric),
                                             cv=HPT_FOLDS,
                                             n_iter=RANDOM_CV_ITER,
                                             random_state=RANDOM_SEED)
                curr_fold_results = {'fold_num': fold_num}
                curr_fold_comp_results = {'fold_num': fold_num}
                # --- measure times - FIT + INFER --- #
                print("Training our model")
                curr_fold_results['train_time'], curr_fold_results['infer_time'] = \
                    get_time_metrics(cv, X_train, y_train, X_test)
                print("Finished training our model")
                print("Training comparison model")
                curr_fold_comp_results['train_time'], curr_fold_comp_results['infer_time'] = \
                    get_time_metrics(comp_cv, X_train, y_train, X_test)
                print("Finished training comparison model")
                # --- save trained models --- #
                model_path = MODELS_DIR + "/model_fold_" + str(fold_num) + "_db_name_" + db_name
                comp_model_path = MODELS_DIR + "/comp_model_fold_" + str(fold_num) + "_db_name_" + db_name
                dill.dump(cv.best_estimator_, open(model_path, 'wb'))
                dill.dump(comp_cv.best_estimator_, open(comp_model_path, 'wb'))
                # --- register best params --- #
                best_comp_params = comp_cv.best_params_
                best_params = cv.best_params_
                curr_fold_comp_results['best_params'] = best_comp_params
                curr_fold_results['best_params'] = best_params
                # --- get predictions for MultiRBoost --- #
                y_test_pred_per_label_scores = cv.predict_proba(X_test)
                y_test_pred = cv.predict(X_test)
                train_labels = cv.best_estimator_.classes_
                comp_train_labels = comp_cv.best_estimator_.classes_  # can be sorted differently
                # --- get predictions for LightGBM --- #
                y_test_pred_comp_per_label_scores = comp_cv.predict_proba(X_test)
                y_test_pred_comp = comp_cv.predict(X_test)
                # --- replace nans with uniform - fixes an error in OneVsRest --- #
                y_test_pred_comp_per_label_scores[np.isnan(y_test_pred_comp_per_label_scores)] = \
                    1.0 / y_test_pred_comp_per_label_scores.shape[1]
                y_test_pred_per_label_scores[np.isnan(y_test_pred_per_label_scores)] = \
                    1.0 / y_test_pred_per_label_scores.shape[1]
                # metrics applicable in multiclass setting ---accuracy, precision--- #
                multiclass_metrics_dict = {0: 'accuracy', 1: 'precision'}
                multiclass_metrics = get_multiclass_metrics(y_test, y_test_pred)
                multiclass_comp_metrics = get_multiclass_metrics(y_test, y_test_pred_comp)
                for metric_pos, metric_name in multiclass_metrics_dict.items():
                    curr_fold_results[metric_name] = multiclass_metrics[metric_pos]
                    curr_fold_comp_results[metric_name] = multiclass_comp_metrics[metric_pos]
                # Metrics only applicable in a binary setting ---fpr, tpr, pr_auc, roc-auc--- #
                binary_metrics_dict = {0: 'fpr', 1: 'tpr', 2: 'pr_auc', 3: 'roc_auc'}
                binary_metrics = get_binary_metrics(y_test, y_test_pred,
                                                    y_test_pred_per_label_scores,
                                                    train_labels)
                binary_comp_metrics = get_binary_metrics(y_test, y_test_pred_comp,
                                                         y_test_pred_comp_per_label_scores,
                                                         comp_train_labels)
                for metric_pos, metric_name in binary_metrics_dict.items():
                    curr_fold_results[metric_name] = binary_metrics[metric_pos]
                    curr_fold_comp_results[metric_name] = binary_comp_metrics[metric_pos]
                # add the current fold results to the results list
                folds_results.append(curr_fold_results)
                comp_folds_results.append(curr_fold_comp_results)
                fold_num += 1
            dbs_results[db_name][OUR_MODEL] = folds_results
            dbs_results[db_name][COMP_MODEL] = comp_folds_results
            write_single_db_results(dbs_results[db_name], db_name)
        except Exception as e:
            print("ERROR!", e)  # catching weird values
            with open(os.path.join(WORKING_DIR, "bad-dbs.txt"), "a") as f:
                dbs_results.pop(db_name)
                f.write("{db_name}: {error}\n".format(db_name=db_name, error=e))
            continue
    print(dbs_results)
    write_all_results(dbs_results, PART_NUMBER)
    print("Done writing results in part %d" % PART_NUMBER)

    # --- Statistical Tests Section --- #
    # --- Friedman Test --- #
    models_measures = np.zeros(shape=(len(dbs_results), 2))
    for model_idx, model_name in enumerate(MODELS_LIST):
        for db_idx, db_name in enumerate(dbs_results):
            models_measures[db_idx][model_idx] = np.average(
                [dbs_results[db_name][model_name][i][STAT_CHOSEN_METRIC]
                 for i in range(EVAL_FOLDS)])
    stats_per_db = [
        models_measures[i, :] for i in range(models_measures.shape[0])
    ]
    p_value = friedmanchisquare(*stats_per_db).pvalue
    print(p_value)
    if p_value <= P_THRESH:
        print("Statistically significant!")
        post_hoc_res = posthoc_nemenyi_friedman(models_measures)
        print("nemenyi post-hoc result: {res}".format(res=post_hoc_res))
    else:
        print("Not statistically significant!")

    # --- Meta Learning Section --- #
    per_dataset_winner = {}
    for db_name in dbs_results:
        our_model_metrics = dbs_results[db_name][OUR_MODEL]
        comp_model_metrics = dbs_results[db_name][COMP_MODEL]
        we_win = summarize_metrics(our_model_metrics) >= summarize_metrics(comp_model_metrics)
        per_dataset_winner[db_name.split('.')[0]] = 1 if we_win else -1
    X_raw = pd.read_csv(META_DBS_PATH, header=0, index_col='dataset')
    X = X_raw.loc[list(per_dataset_winner.keys()), :]
    y = pd.Series([per_dataset_winner[db_name] for db_name in per_dataset_winner])
    db_names = [db_name for db_name in per_dataset_winner]
    loo = LeaveOneOut()
    meta_model_results = {}
    os.system('mkdir -p {plots_dir}/{inner_dir}'.format(plots_dir=PLOTS_DIR,
                                                        inner_dir=IMPORTANCE_DIR))
    os.system('mkdir -p {plots_dir}/{inner_dir}'.format(plots_dir=PLOTS_DIR,
                                                        inner_dir=SHAP_DIR))
    for train_index, test_index in loo.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        curr_dataset = X_test.index[0]
        meta_model = xgb.XGBClassifier(booster='gbtree')
        meta_model.fit(X_train, y_train)
        y_pred = meta_model.predict(X_test)[0]
        meta_model_results[curr_dataset] = y_pred
        generate_importance(meta_model, db_names[test_index[0]])
        generate_shap(meta_model, db_names[test_index[0]], X_test)
    y_pred = pd.Series([meta_model_results[db_name] for db_name in meta_model_results])
    print("Meta Model Accuracy: %f" % accuracy_score(y, y_pred))
    meta_results = pd.DataFrame(data={
        'db_name': X.index,
        'y_true': y,
        'y_pred': y_pred
    }).set_index('db_name')
    meta_results.to_csv(os.path.join('/kaggle/working', 'meta-results.csv'))
    meta_results.describe().to_csv(
        os.path.join('/kaggle/working', 'meta-results-describe.csv'))
    dbs_we_won = list(meta_results[meta_results.y_true == 1].index)
    dbs_we_lost = list(meta_results[meta_results.y_true == -1].index)
    won_dbs_lengths = [
        len(X) for (db_name, X, y) in raw_dbs
        if db_name.split('.')[0] in dbs_we_won
    ]
    lost_dbs_lengths = [
        len(X) for (db_name, X, y) in raw_dbs
        if db_name.split('.')[0] in dbs_we_lost
    ]
    print("Average winning dbs length: %f" % np.mean(won_dbs_lengths))
    print("Average losing dbs length: %f" % np.mean(lost_dbs_lengths))
# will contain one row per dataset and models over columns
average_results = {
    'dataset': dataset_names
}
groups_by_model = df_results.groupby('Algorithm Name')
for model_name in model_names:
    df_model = groups_by_model.get_group(model_name)
    groups_by_dataset = df_model.groupby('Dataset Name')
    model_mean = []
    for dataset_name in dataset_names:
        model_mean.append(
            groups_by_dataset.get_group(dataset_name)[metric].mean())  # average over folds
    average_results[model_name] = model_mean
df_results = pd.DataFrame(average_results)
df_results.to_csv('results/average_results.csv', index=False)

# save ranks of algorithms (1 is best, |models| is worst)
df = df_results.drop(columns='dataset')
ranks = rankdata(df.to_numpy(), method='dense', axis=1)
df = pd.DataFrame(ranks, columns=df.columns)
df['dataset'] = df_results['dataset']
df.to_csv('results/ranks.csv')

# friedman and post hoc tests
t_stat, p_val = friedmanchisquare(*[df_results[i] for i in model_names])
print('\nfriedman test p-val = %s' % p_val)
post_hoc_p_vals = posthoc_nemenyi_friedman(
    df_results.drop(columns='dataset').to_numpy())
post_hoc_p_vals.columns = model_names
print('\npost hoc p-vals:\n%s' % post_hoc_p_vals)
post_hoc_p_vals.to_csv('results/post_hoc.csv', index=False)
import pandas as pd
from scikit_posthocs import posthoc_nemenyi_friedman
from statsmodels.sandbox.stats.multicomp import multipletests

data = pd.read_csv("algo_performance.csv")
p = posthoc_nemenyi_friedman(data.values)
for i in range(0, 5):
    print("For the", i + 1, "algorithm and a=0.05 with Bonferroni method the result is:",
          multipletests(p.values[:, i], method='bonferroni', alpha=0.05)[0])
    print("For the", i + 1, "algorithm and a=0.1 with Bonferroni method the result is:",
          multipletests(p.values[:, i], method='bonferroni', alpha=0.1)[0])
    print("For the", i + 1, "algorithm and a=0.25 with Bonferroni method the result is:",
          multipletests(p.values[:, i], method='bonferroni', alpha=0.25)[0])
    print()
def process_results(results, groupby='Detector', latex_path=None, fig_path=None,
                    bold_best=False, alpha=0.05, one_fig=True, cd_diagram=True):
    '''
    args:
        results (dataframe): table of results.
        groupby (string, list of strings): the name of the column which groups values together
        path (string or None): the location to write the latex table to (if not None)
        bold_best (bool): should the best value in each column be bolded?
    '''
    full_results = results.drop(columns='dataset_name').groupby(groupby)  # , as_index=False)
    results_summary = full_results.agg(lambda x: f'{np.mean(x):.2f} ({np.std(x):.2f})')

    ## Embolden the best values for each column ##
    results_means = full_results.mean()
    if bold_best:
        for col in results_means.columns:
            if col == groupby or col in groupby or col == 'dataset_name':
                continue
            if col in ['Precision', 'F1', 'Recall']:
                idxbf = results_means[col].idxmax()
            elif col in ['Err-rate', 'Memory', 'Runtime', 'Mean Delay']:
                idxbf = results_means[col].idxmin()
            else:
                raise ValueError(f"Is it good if {col} is high or low?")
            results_summary.loc[idxbf, col] = '{bf ' + results_summary.loc[idxbf, col] + '}'

    # Make table latex ready
    results_latex = results_summary.to_latex()  # index=False)

    # replace ll...l align with lr...r align
    (l_start, l_fin) = re.search('l(l+)', results_latex).span()
    l_start += 1
    results_latex = results_latex[:l_start] + 'r' * (l_fin - l_start) + results_latex[l_fin:]

    ## Make names more readable ##
    before_after = {
        'Mean Delay': 'Mean Delay',
        'Memory': 'Memory (bytes)',
        'Runtime': 'Runtime (ms)',
        'Err-rate': 'Err-rate (\%)',
        'PageHinkley': 'PH',
        'FHDDMS.add': 'FHDDMS$_{add}$',
        'MDDM.A.100': 'MDDM$_A$',
        'MDDM.E.100': 'MDDM$_E$',
        'MDDM.G.100': 'MDDM$_G$',
        'NO\_DETECTION': 'Null',
        'NO_DETECTION': 'Null',
        'NAIVE BAYES': 'NB',
        'PERCEPTRON': 'PR',
        'HOEFFDING TREE': 'HT',
        'LEDConceptDrift': 'LED',
        '\{bf ': '{\\fontseries{b}\\selectfont ',
        '\}': '}'
    }
    before_after.update({
        f'HDDM.{x}.test': f'HDDM$_{x}$' for x in 'AW'
    })
    # Replace underscores with spaces in mode names
    if 'Mode' in results.columns:
        before_after.update({
            x.replace('_', '\_'): ' '.join(x.split('_')).title()
            for x in results['Mode'].unique()
        })
    # print(before_after)
    for before, after in before_after.items():
        # if after != '}':
        #     after = after.rjust(len(before))
        results_latex = results_latex.replace(before, after)

    # Write the LaTeX table to disk
    if latex_path:
        with open(os.path.abspath(latex_path), 'w') as f:
            f.write(results_latex)
        print('Writing LaTeX table to', latex_path)

    ## CD DIAGRAMS ##
    if not cd_diagram:
        return results_latex

    if one_fig:
        nfigs = len(results_summary.columns)  # if results not in ['Detector', 'dataset_name'])
        nfigs = nfigs + 1 if nfigs % 2 == 0 else nfigs
        nrows = nfigs // 2
        ncols = 2
        fig_i = 1
        width = 10
        height = 4
        fig = plt.figure(figsize=(width * 2 + 1, height * nrows + 1))
        fig.set_facecolor('white')

    # change names of detectors according to before_after dictionary
    results.loc[:, 'Detector'] = results.Detector.map(before_after).fillna(results['Detector'])

    for col in results_means.columns:
        # print(f'Processing {col}')
        # Figure out if the plot should be reversed or not
        if col == groupby or col in groupby or col == 'dataset_name':
            continue
        if col in ['Precision', 'F1', 'Recall']:
            reverse = True
        elif col in ['Err-rate', 'Memory', 'Runtime', 'Mean Delay']:
            reverse = False
        else:
            raise ValueError(f"Is it good if {col} is high or low?")

        # Convert the column data into matrix form
        dets = results.Detector.unique()
        dsets = results.dataset_name.unique()
        data = []
        for dset in dsets:
            row = []
            for det in dets:
                # print(dset)
                x = list(results[(results['Detector'] == det) &
                                 (results['dataset_name'] == dset)][col])[0]
                row.append(x)
            data.append(row)
        data = np.array(data)
        # Replace all the NaNs in the data with zeros
        data = np.nan_to_num(data)

        # Perform Nemenyi-Friedman test
        nem = sp.posthoc_nemenyi_friedman(data)
        # print('Post-hocs computed.')

        # Put p-values in a form that the cd-diagram code can use
        p_vals = []
        for i, det1 in enumerate(dets):
            for j, det2 in enumerate(dets[i + 1:]):
                p_val = nem[i][j + i + 1]
                p_vals.append((det1, det2, p_val, p_val < alpha))

        # Set span of CD-diagram and compute average values or average rank
        lowv, highv = None, None
        if col in ['Precision', 'Recall', 'F1']:  # , 'Err-rate'
            lowv, highv = 0, 1
            average_vals = results.groupby('Detector').mean()[col]
            if col == 'Err-rate':
                lowv, highv = 0, 100
        else:
            # Compute average rank
            average_vals = pd.DataFrame(columns=['Detector', col])
            average_vals[col] = -average_vals[col]
            for dset in results.dataset_name:
                results_i = results[results['dataset_name'] == dset]
                # print(col)
                # print(results_i[['Detector', col]])
                results_i.loc[:, col] = results_i[col].rank(ascending=True)
                # print(results_i[['Detector', col]])
                average_vals = average_vals.append(results_i[['Detector', col]])
                # break
            # sys.exit()
            # print(col)
            # print(average_vals)
            average_vals = average_vals.groupby('Detector').mean()[col]
            # print(average_vals)
        # print('Average vals computed.')

        # Put the average values in a form that the cd-diagram code can use
        average_vals = average_vals.sort_values()
        if reverse:
            average_vals = average_vals[::-1]
        # minv = average_vals.min()
        # maxv = average_vals.max()

        # Plot the cd diagram
        if one_fig:
            ax = fig.add_subplot(nrows, ncols, fig_i)
            fig_i += 1
        else:
            ax = None
        graph_ranks(
            average_vals.values,
            average_vals.keys(),
            p_vals,
            cd=None,
            reverse=reverse,
            textspace=1,
            labels=False,
            highv=highv,
            lowv=lowv,
            ax=ax,
            width=width,
            height=height,
            # highv=int(maxv + (maxv - minv) * 0.1),
            # lowv=int(minv - (maxv - minv) * 0.1)
        )
        font = {'family': 'sans-serif',
                'color': 'black',
                'weight': 'normal',
                'size': 22,
                }
        ax.set_title(col, fontdict=font, x=0.5, y=0.95)  # 0.9

        if not one_fig:
            # fig_path = os.path.abspath(path) + f"-{col.replace(' ', '_')}.pdf"
            plt.savefig(fig_path, bbox_inches='tight')
        # print(f'Completed plot for {col}')

    if one_fig:
        # fig_path = os.path.abspath(path) + ".pdf"
        # plt.show()
        print('Writing cd diagrams to', fig_path)
        plt.savefig(fig_path, bbox_inches='tight')

    return results_latex
for system_order in ['finite', 'infinite']:
    print(system_order.upper())
    for eval in range(len(function_evals_of_interest)):
        algorithms_at_fes = []
        for algorithm in ['sa', 'acfsa', 'pso', 'aiwpso', 'acor', 'baacor']:
            print(algorithm)
            # Load test costs of a given metaheuristic for a given system,
            # considering some number of objective function evaluations
            base_filename = './results/' + algorithm + '_' + system_order
            test_costs_mat = np.load(base_filename + '_test_costs.npy')
            test_costs_of_interest = test_costs_mat[:, evals_mask]
            costs_fe = test_costs_of_interest[:, eval]
            algorithms_at_fes.append(list(costs_fe))
            print(str(function_evals_of_interest[eval]) + ': \t' + str(np.mean(costs_fe)))
        algorithms_at_fes = np.array(algorithms_at_fes)
        print('\n Statistical significance')
        print(np.shape(algorithms_at_fes))
        print('Friedman p-val = ' +
              str(scipy.stats.friedmanchisquare(*algorithms_at_fes)[1]) + '\n\n')
        nm_posthoc = sp.posthoc_nemenyi_friedman(algorithms_at_fes.T)
        plt.figure()
        sp.sign_plot(nm_posthoc, **heatmap_args)
        plt.show()
        print('\n')
def nemenyi():
    n = len(models)
    size = int(math.factorial(n) / (math.factorial(n - 2) * math.factorial(2)))
    print(size)
    nemenyi_results = {}
    for dataset in data1.keys():
        print(dataset)
        results1 = data1[dataset]
        results5 = data5[dataset]
        results10 = data10[dataset]
        results20 = data20[dataset]
        nemenyi_results[dataset] = np.zeros(shape=(size, 5), dtype=object)
        index = 0
        matrix1 = np.zeros((30, 7), dtype=float)
        matrix5 = np.zeros((30, 7), dtype=float)
        matrix10 = np.zeros((30, 7), dtype=float)
        matrix20 = np.zeros((30, 7), dtype=float)
        res1 = None
        res5 = None
        res10 = None
        res20 = None
        for i, model in enumerate(models):
            matrix1[:, i] = results1[model]
            matrix5[:, i] = results5[model]
            matrix10[:, i] = results10[model]
            matrix20[:, i] = results20[model]
        res1 = sp.posthoc_nemenyi_friedman(matrix1)
        res5 = sp.posthoc_nemenyi_friedman(matrix5)
        res10 = sp.posthoc_nemenyi_friedman(matrix10)
        res20 = sp.posthoc_nemenyi_friedman(matrix20)
        col = 1
        for row in range(res1.shape[0]):
            for m in range(len(models) - col):
                p1, p5, p10, p20 = None, None, None, None
                p1 = round(res1.iloc[row, col + m], 4)
                p5 = round(res5.iloc[row, col + m], 4)
                p10 = round(res10.iloc[row, col + m], 4)
                p20 = round(res20.iloc[row, col + m], 4)
                print("\\hline")
                if (p1 < 0.05):
                    p1 = "\\textit{%f}" % (p1)
                else:
                    p1 = "%f" % (p1)
                if (p5 < 0.05):
                    p5 = "\\textit{%f}" % (p5)
                else:
                    p5 = "%f" % (p5)
                if (p10 < 0.05):
                    p10 = "\\textit{%f}" % (p10)
                else:
                    p10 = "%f" % (p10)
                if (p20 < 0.05):
                    p20 = "\\textit{%f}" % (p20)
                else:
                    p20 = "%f" % (p20)
                comp = "%s vs %s" % (models[row], models[col + m])
                nemenyi_results[dataset][index, 0] = comp
                nemenyi_results[dataset][index, 1] = p1
                nemenyi_results[dataset][index, 2] = p5
                nemenyi_results[dataset][index, 3] = p10
                nemenyi_results[dataset][index, 4] = p20
                index += 1
                print("\\textbf{%s vs %s} & %s & %s & %s & %s \\\\" %
                      (models[row], models[col + m], p1, p5, p10, p20))
            col += 1
    return nemenyi_results
filename_, file_extension = os.path.splitext(filename)
if file_extension == ".xlsx":
    data = pd.read_excel(dirname + '\\' + filename,
                         usecols=['dataset_name', 'algorithm_name', 'roc_auc'])
else:
    data = pd.read_csv(dirname + '\\' + filename,
                       usecols=['dataset_name', 'algorithm_name', 'roc_auc'])
# average auc for each dataset
avg_auc = data.groupby(['dataset_name', 'algorithm_name'], as_index=False).mean()
res_df = res_df.append(avg_auc)

res_df.reset_index(inplace=True, drop=True)
# get all datasets names with results from all four algorithms
dataset_names = res_df.groupby('dataset_name', as_index=False).count()
dataset_names = list(dataset_names[dataset_names['algorithm_name'] == 4]['dataset_name'])
# filter result to contain only datasets that are in dataset_names
res_df = res_df[res_df['dataset_name'].apply(lambda x: x in dataset_names)]
alog_names = list(res_df['algorithm_name'].unique())

# Friedman test
stat, p = stats.friedmanchisquare(
    res_df[res_df['algorithm_name'] == alog_names[0]].sort_values(by='dataset_name')['roc_auc'],
    res_df[res_df['algorithm_name'] == alog_names[1]].sort_values(by='dataset_name')['roc_auc'],
    res_df[res_df['algorithm_name'] == alog_names[2]].sort_values(by='dataset_name')['roc_auc'],
    res_df[res_df['algorithm_name'] == alog_names[3]].sort_values(by='dataset_name')['roc_auc'])

# interpret results
alpha = 0.05
print('Statistics=%.3f, p=%.3f' % (stat, p))
if p < alpha:
    print('null hypothesis rejected')
    # perform post-hoc test
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(sp.posthoc_nemenyi_friedman(res_df, y_col='roc_auc',
                                          block_col='dataset_name',
                                          group_col='algorithm_name', melted=True))
# pip install scikit-posthocs
import pandas as pd
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# Friedman test
# First we build a dataframe with the views 0: fac, 1: fou, 2: kar and row 3 with the sum rule.
# The columns are bg: Gaussian Bayes, bkv: K-nearest-neighbours Bayes and par: Parzen
tbacc = pd.DataFrame.from_dict({
    'view': {0: 0, 1: 1, 2: 2, 3: 3},
    'bg': {0: 0.805, 1: 0.58, 2: 0.735, 3: 0.815},
    'bkv': {0: 0.79, 1: 0.63, 2: 0.775, 3: 0.855},
    'par': {0: 0.835, 1: 0.615, 2: 0.82, 3: 0.835}})
print(tbacc)

# Apply the Friedman test to the accuracies of the three classifiers
result = friedmanchisquare(tbacc["bg"], tbacc["bkv"], tbacc["par"])
print(result)

# Then, if the null hypothesis is rejected, run the Nemenyi post-hoc test
dados = pd.DataFrame.from_dict({
    'blocks': {0: 0, 1: 1, 2: 2, 3: 3, 4: 0, 5: 1, 6: 2, 7: 3, 8: 0, 9: 1, 10: 2, 11: 3},
    'groups': {0: 0, 1: 0, 2: 0, 3: 0, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2, 10: 2, 11: 2},
    'y': {0: 0.805, 1: 0.58, 2: 0.735, 3: 0.815, 4: 0.79, 5: 0.63, 6: 0.775, 7: 0.855,
          8: 0.835, 9: 0.615, 10: 0.820, 11: 0.835}})
print(dados)
sp.posthoc_nemenyi_friedman(dados, y_col='y', block_col='blocks', group_col='groups', melted=True)
    t = round(t, 2)
    p = round(p, 2)
    res.append([t, p])
    return res


friedman_l = []
for i in range(len(all_data)):
    res2 = friedman(all_data[i])
    friedman_l.extend(res2)
friedman_l = pd.DataFrame(friedman_l)

friedman_recog = friedman(rt_recog)
rr_normality.qq_plot(rt_recog, recall=True)
nemeny_recog = sp.posthoc_nemenyi_friedman(rt_recog.iloc[:, [0, 1, 2]])
print('-----------', friedman_recog, nemeny_recog, '-------')


def anova(data):
    data = pd.melt(data, id_vars='sub_id', var_name='cond', value_name='performance')
    # perform anova
    anovarm = AnovaRM(data, 'performance', 'sub_id', within=['cond'])
    res = anovarm.fit()
    # rounded p value
def main(dataset, alpha=.05):
    os.chdir(os.path.dirname(os.path.realpath(__file__)) + '/../')
    directory = os.path.dirname(os.path.realpath(__file__)) + '/' + dataset + '/info/'
    files = glob.glob(directory + '*.json')
    BA_AUCs = {}
    BA_10s = {}
    print(os.getcwd())
    for file in files:
        fs_class = file.split('.')[-2].split('_')[-1]
        with open(file, 'r') as outfile:
            stats = json.load(outfile)
        n_features = np.asarray(stats['classification']['n_features'])
        for key in ['BA', 'svc_BA', 'model_BA']:
            if key not in stats['classification']:
                continue
            BA_key = fs_class + '_' + key
            BA = np.asarray(stats['classification'][key]).T
            BA_AUC = (.5 * (BA[:, 1:] + BA[:, :-1]) *
                      (n_features[1:] - n_features[:-1]) /
                      (n_features[-1] - n_features[0])).sum(axis=-1)
            BA_AUCs[BA_key] = BA_AUC
            BA_10s[BA_key] = BA[:, 0]
            print('method : ', fs_class)
            print('BA', key, ' : ', BA.mean(axis=0), '+-', BA.std(axis=0))
            print('BA_10', key, ' : ', BA_10s[BA_key].mean(axis=0), '+-', BA_10s[BA_key].std(axis=0))
            print('BA_AUC', key, ' : ', BA_AUC.mean(axis=0), '+-', BA_AUC.std(axis=0))

    for t, BA_dict in enumerate([BA_10s, BA_AUCs]):
        print('BA 10 features' if t == 0 else 'BA_AUC')
        keys = list(BA_dict.keys())
        # wilcoxon_matrix = np.zeros((len(keys), len(keys)))
        # for i in range(len(keys) - 1):
        #     BA_i = BA_dict[keys[i]]
        #     for j in range(i+1, len(keys)):
        #         BA_j = BA_dict[keys[j]]
        #         t, p_value = wilcoxon(BA_i, BA_j)
        #         if p_value < alpha:
        #             if BA_i.mean() > BA_j.mean():
        #                 wilcoxon_matrix[i, j] = 1
        #                 wilcoxon_matrix[j, i] = -1
        #             else:
        #                 wilcoxon_matrix[i, j] = -1
        #                 wilcoxon_matrix[j, i] = 1
        #
        # # print(keys)
        # # print(wilcoxon_matrix)
        #
        # min_wilkoxon = wilcoxon_matrix.min(axis=-1)
        # max_wilkoxon = wilcoxon_matrix.max(axis=-1)
        # best_methods = np.where((min_wilkoxon + 1) * max_wilkoxon > 0)[0]
        # print('wilcoxon best methods : ', np.asarray(keys)[best_methods])
        auc = tuple(list(BA_dict.values()))
        _, p_value = friedmanchisquare(*auc)
        print('friedman p_value : ', p_value)
        nemenyi = sp.posthoc_nemenyi_friedman(np.array(auc).T).values
        nemenyi_matrix = np.zeros((len(keys), len(keys)))
        for i in range(len(keys) - 1):
            BA_i = BA_dict[keys[i]]
            for j in range(i + 1, len(keys)):
                BA_j = BA_dict[keys[j]]
                p_value = nemenyi[i, j]
                if p_value < alpha:
                    if BA_i.mean() > BA_j.mean():
                        nemenyi_matrix[i, j] = 1
                        nemenyi_matrix[j, i] = -1
                    else:
                        nemenyi_matrix[i, j] = -1
                        nemenyi_matrix[j, i] = 1
        min_nemenyi = nemenyi_matrix.min(axis=-1)
        max_nemenyi = nemenyi_matrix.max(axis=-1)
        best_methods = np.where((min_nemenyi + 1) * max_nemenyi > 0)[0]
        print('nemenyi best methods : ', np.asarray(keys)[best_methods])