def run_stats(input_df):
    """Run a Kruskal-Wallis H test followed by a Conover post-hoc test.

    Kruskal-Wallis is the non-parametric analogue of a one-way ANOVA. The
    Conover test is then used for pairwise comparisons between the columns.
    NOTE: the post-hoc p-values should only be interpreted when the omnibus
    (Kruskal-Wallis) result is significant.

    Every column except ``'huc8'`` is treated as one sample.

    Args:
        input_df: DataFrame with one sample per column.

    Returns:
        ``(H, p, conover)`` where ``conover`` is the Holm-adjusted p-value
        matrix labelled with the tested column names, or ``None`` when fewer
        than two usable columns remain or the post-hoc test fails.
    """
    # Infinite values (of either sign) are treated as missing. Do this
    # *before* dropping empty columns so an all-inf column is dropped too.
    input_df = input_df.replace([np.inf, -np.inf], np.nan)
    input_df = input_df.dropna(axis=1, how='all')

    # 'huc8' is excluded from the test, so it must also be excluded from
    # the labels applied to the post-hoc matrix.
    value_columns = input_df.columns.drop('huc8', errors='ignore')
    if len(value_columns) < 2:
        # Nothing to compare (covers the "everything is NaN" case as well).
        return None

    # One 1-D array per column, with missing values removed so that the
    # omnibus and post-hoc tests see exactly the same observations.
    data = [input_df[column].dropna().to_numpy() for column in value_columns]

    H, p = stats.kruskal(*data, nan_policy='omit')

    try:
        conover = sp.posthoc_conover(data, p_adjust='holm')
    except Exception as e:
        # Best-effort behaviour kept from the original: report and give up.
        print('Error is: ', e)
        return None

    conover.columns = value_columns
    conover.index = value_columns
    return H, p, conover
def sign_barplot(df, val_col, group_col, test="HSD"):
    """Plot group means next to a significance heatmap of a post-hoc test.

    Args:
        df: Long-format DataFrame holding the observations.
        val_col: Name of the column with the measured values.
        group_col: Name of the column with the group labels.
        test: Post-hoc test to run. ``"HSD"`` uses the file's ``tukey_hsd``
            helper; ``"tukey"``, ``"ttest"``, ``"scheffe"``, ``"dscf"`` and
            ``"conover"`` use the matching ``sp.posthoc_*`` function.

    Raises:
        ValueError: If ``test`` is not one of the supported names.
    """
    sp_tests = ("tukey", "ttest", "scheffe", "dscf", "conover")
    if test == "HSD":
        result_df = tukey_hsd(df, val_col, group_col)
    elif test in sp_tests:
        result_df = getattr(sp, "posthoc_" + test)(df, val_col, group_col)
    else:
        # Previously an unknown name fell through every branch and failed
        # later with an UnboundLocalError on result_df.
        raise ValueError(
            "unknown test {!r}; expected one of {}".format(
                test, ("HSD",) + sp_tests))

    # Plot layout and heatmap styling
    fig, ax = plt.subplots(1, 2, figsize=(10, 6))
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    heatmap_args = {
        'cmap': cmap,
        'linewidths': 0.25,
        'linecolor': '0.5',
        'clip_on': False,
        'square': True
    }

    # Draw the test result (significance matrix)
    sp.sign_plot(result_df, ax=ax[1], **heatmap_args)
    # Draw the data that was tested
    sns.barplot(data=df, x=group_col, y=val_col, capsize=0.1, ax=ax[0])
    plt.show()
def run_kruskall_posthoc(x, y, df):
    """Run a Kruskal-Wallis test and a Holm-adjusted Conover post-hoc test.

    Args:
        x: Name of the grouping column.
        y: Name of the value column.
        df: DataFrame containing both columns.

    Returns:
        The Conover p-value matrix (index reset) with three extra columns:
        ``'Measurement'`` (the value column name), ``'Comparison Group'``
        (the group each row belongs to) and ``'Krusal-Wallis p value'``
        (the omnibus p-value repeated on every row).
    """
    # Keep only the rows needed for the analysis, with outliers removed.
    df = remove_outliers(x, y, df)

    if len(df) < 5:
        print(
            'Warning: Sample Size Smaller Than 5. Kruskall Wallace Test Value Suspect'
        )

    # One list of values per distinct group, in order of first appearance.
    group_names = np.array(df[x].drop_duplicates().values)
    groups = [df[df[x] == name][y].values.tolist() for name in group_names]

    # Omnibus test across all groups.
    k = st.mstats.kruskalwallis(*groups)

    # Pairwise Conover comparisons; the result is a square p-value matrix
    # indexed by group on both axes.
    posthoc = sp.posthoc_conover(df, val_col=y, group_col=x, p_adjust='holm')

    # Annotate the matrix so rows stay identifiable after concatenation.
    posthoc['Measurement'] = [y] * len(posthoc)
    posthoc['Comparison Group'] = posthoc.index.values
    posthoc = posthoc.reset_index()
    posthoc['Krusal-Wallis p value'] = [k[1]] * len(posthoc)
    return posthoc
def do_stats_stuff(m):
    """Run Kruskal-Wallis and Conover tests on labelled samples.

    Args:
        m: Mapping of label -> sequence of numeric values (one sample per
            label).

    Returns:
        The result object returned by ``stats.kruskal``.

    Side effects:
        Prints the Kruskal-Wallis result, the Bonferroni-adjusted Conover
        p-value matrix and, for every label containing
        ``'External To External'``, the values at or above the upper
        fence ``Q3 + 1.5 * IQR``.
    """
    s = stats.kruskal(*m.values())
    print("Kruskal Result: " + str(s))

    # Long format: one (label, value) row per observation.
    lt = [(label, value) for label, values in m.items() for value in values]
    df = pd.DataFrame(lt, columns=['label', 'ratio'])
    f = posthocs.posthoc_conover(df,
                                 sort=True,
                                 p_adjust='bonferroni',
                                 group_col='label',
                                 val_col='ratio')
    # The test run above is Conover's, not Dunn's.
    print("Conover's Result")
    print(str(f))

    for k, v in m.items():
        if 'External To External' in k:
            # Upper fence is built on the third quartile (75th percentile);
            # the previous code used the 25th percentile under the name q3.
            q3 = np.percentile(v, 75, interpolation='midpoint')
            upper_fence = q3 + 1.5 * stats.iqr(v, interpolation='midpoint')
            outlier = [v1 for v1 in v if v1 >= upper_fence]
            if outlier:
                print(outlier)
    return s
def stats_tests(df, metrics, groups, group_by):
    """Run normality, Kruskal-Wallis and Conover tests for each metric.

    Args:
        df: DataFrame holding the metric columns and the ``group_by`` column.
        metrics: Iterable of metric column names to test.
        groups: Mapping of group name -> row labels (used with ``df.loc``).
        group_by: Name of the grouping column passed to the Conover test.

    Returns:
        Dict keyed by metric with entries ``'normality'`` (per-group result
        of ``is_normal``), ``'from_same_distribution'`` (result of the
        file's ``kruskal`` helper) and ``'post_hoc'`` (the Conover matrix,
        or ``None``).
    """
    alpha = 0.05
    results = {}
    for metric in metrics:
        normality = {
            name: is_normal(df.loc[ids, metric].values, alpha)
            for name, ids in groups.items()
        }

        # One array of values per group for the selected metric.
        samples = [df.loc[ids, metric].values for ids in groups.values()]
        # H0: all samples come from the same distribution.
        from_same_distribution = kruskal(samples, alpha)

        conover_result = sp.posthoc_conover(df,
                                            val_col=metric,
                                            group_col=group_by,
                                            p_adjust='holm')
        # Significance flag for every ordered pair of distinct groups.
        pair_is_significant = [
            conover_result.loc[first, second] < alpha
            for first in groups
            for second in groups
            if first != second
        ]

        # NOTE(review): the matrix is kept when at least one pair is NOT
        # significant -- confirm this is the intended condition.
        post_hoc = conover_result if False in pair_is_significant else None
        results[metric] = {
            'normality': normality,
            'from_same_distribution': from_same_distribution,
            'post_hoc': post_hoc,
        }
    return results
def get_significant_pairs(self, df, metric):
    """Return plot positions and p-values of significantly different conditions.

    Runs a Holm-adjusted Conover test on ``metric`` grouped by the
    ``'condition'`` column and keeps every unordered pair of conditions
    whose p-value is at most ``self.alpha``.

    Returns:
        ``(pairs, p_values)`` where each entry of ``pairs`` is a two-item
        list of positions looked up in ``self.label_to_plot_pos``.
    """
    pairwise_comparisons = sp.posthoc_conover(df,
                                              val_col=metric,
                                              group_col='condition',
                                              p_adjust='holm')
    # TO DO: Wilcoxon won't work for mode switches because not truly paired
    # test (conditions have different lengths)

    condition_labels = pairwise_comparisons.keys().to_list()
    pairs = []
    p_values = []
    # Every unordered pair of conditions is a candidate comparison.
    for first, second in list(itertools.combinations(condition_labels, 2)):
        if pairwise_comparisons.loc[first, second] <= self.alpha:
            pairs.append([
                self.label_to_plot_pos[first],
                self.label_to_plot_pos[second]
            ])
            p_values.append(pairwise_comparisons.loc[first, second])
    return pairs, p_values
def get_significant_pairs(self, df, metric, label_to_plot_pos):
    """Return plot positions and p-values for first-vs-second block pairs.

    Builds a ``'trial'`` label from ``condition`` and ``block``, runs a
    Holm-adjusted Conover test on ``metric`` grouped by trial, and keeps the
    fixed first/second-block comparisons whose p-value is at most
    ``self.alpha``.

    Args:
        df: DataFrame with ``'condition'``, ``'block'`` and ``metric``
            columns. NOTE: a ``'trial'`` column is added to it in place.
        metric: Name of the value column to test.
        label_to_plot_pos: Mapping of trial label -> plot position.

    Returns:
        ``(pairs, p_values)`` where each entry of ``pairs`` is a two-item
        list of plot positions.

    Raises:
        KeyError: If one of the fixed trial labels is missing from the test
            result or from ``label_to_plot_pos``.
    """
    # Kept as an in-place column assignment: callers may rely on df gaining
    # the 'trial' column.
    df["trial"] = df["condition"] + " " + df["block"]
    pairwise_comparisons = sp.posthoc_conover(df,
                                              val_col=metric,
                                              group_col='trial',
                                              p_adjust='holm')
    # TO DO: Wilcoxon won't work for mode switches because not truly paired
    # test (conditions have different lengths)

    # Only first-vs-second block of the same condition is compared. (The
    # former "all combinations" list was computed and immediately discarded.)
    combinations = [
        ('Corrective First', 'Corrective Second'),
        ('Filtered First', 'Filtered Second'),
        ('No Assistance First', 'No Assistance Second'),
    ]

    pairs = []
    p_values = []
    for first, second in combinations:
        p_val = pairwise_comparisons.loc[first, second]
        if p_val <= self.alpha:
            pairs.append([label_to_plot_pos[first], label_to_plot_pos[second]])
            p_values.append(p_val)
    return pairs, p_values
def compare_values(values_to_compare):
    """Compare each value column across groups and print the results.

    For every entry of ``values_to_compare`` a Kruskal-Wallis test is run
    across ``groups_to_compare`` (rows of the module-level ``avg_props``
    selected by its ``'group'`` column). The p-values are Holm-adjusted
    across all compared values, then per-group mean/std, the adjusted
    p-value and a Holm-adjusted Conover matrix are printed. Non-integer
    values are additionally passed to ``plot_health``.

    Args:
        values_to_compare: Column keys of ``avg_props``; integer keys are
            translated to a display name through ``translation``.
    """
    pvals = []
    for value in values_to_compare:
        samples = [
            avg_props[avg_props['group'] == group_name][value]
            for group_name in groups_to_compare
        ]
        _, pval = stats.kruskal(*samples)
        pvals.append(pval)

    if not pvals:
        # Nothing to compare, so nothing to adjust or print.
        return

    adj_pvals = multipletests(pvals, alpha=0.05, method='holm')[1]

    for adj_pval, value in zip(adj_pvals, values_to_compare):
        name = translation[value] if isinstance(value, int) else value
        print("Comparing {}".format(name))
        for group_name in groups_to_compare:
            subset = avg_props[avg_props['group'] == group_name]
            # Print the group name directly: the old ``subset['group'][0]``
            # was a label lookup and failed for any subset whose index does
            # not contain the label 0.
            print("{}: {} (+/- {})".format(group_name, subset[value].mean(),
                                           subset[value].std()))
        print("H-test adjusted p-value: {}".format(adj_pval))
        print()

        # option_context restores the previous float format even if the
        # post-hoc test raises.
        with pd.option_context('display.float_format', '{:.3g}'.format):
            print(
                sp.posthoc_conover(avg_props,
                                   val_col=value,
                                   group_col='group',
                                   p_adjust='holm'))
        print()

        if not isinstance(value, int):
            plot_health(avg_props, value)
def kruskal_posthoc_tests(benchmark_snapshot_df):
    """Return p-value tables for several Kruskal-Wallis post-hoc tests.

    The results should only be considered when the Kruskal-Wallis test
    rejects its null hypothesis.

    Args:
        benchmark_snapshot_df: DataFrame with a ``'fuzzer'`` group column and
            an ``'edges_covered'`` value column.

    Returns:
        Dict mapping test name (``'mann_whitney'``, ``'conover'``,
        ``'wilcoxon'``, ``'dunn'``, ``'nemenyi'``) to the p-value table
        returned by the matching ``sp.posthoc_*`` function.
    """
    shared_kwargs = dict(
        a=benchmark_snapshot_df,
        group_col='fuzzer',
        val_col='edges_covered',
        sort=True,
    )
    adjustment = 'holm'
    return {
        'mann_whitney':
            sp.posthoc_mannwhitney(**shared_kwargs, p_adjust=adjustment),
        'conover':
            sp.posthoc_conover(**shared_kwargs, p_adjust=adjustment),
        'wilcoxon':
            sp.posthoc_wilcoxon(**shared_kwargs, p_adjust=adjustment),
        'dunn':
            sp.posthoc_dunn(**shared_kwargs, p_adjust=adjustment),
        # Nemenyi is called without a p-value adjustment argument.
        'nemenyi':
            sp.posthoc_nemenyi(**shared_kwargs),
    }
try: f_value, p_value = stats.kruskal(byGeneticDiversity0FR0, byGeneticDiversity0FR1, byGeneticDiversity0FR2, byGeneticDiversity0FR3, byGeneticDiversity0FR4) if (p_value <= 0.1): numberOfComparisonsp1 += 1 if (p_value <= 0.05): numberOfComparisonsp05 += 1 if (p_value <= 0.01): numberOfComparisonsp01 += 1 print("KW for failures in column " + str(columnName) + " RT " + str(i) + "*** " + str(p_value)) try: pvals = sp.posthoc_conover(ph0, p_adjust='holm') truth = np.logical_and(pvals <= 0.1, pvals >= 0) if (np.any(truth)): print("significant comparison:") print(pvals) except Exception as e: print('Could not compute posthoc conover: ' + str(e)) except: print("") try: f_value, p_value = stats.kruskal(byGeneticDiversity1FR0, byGeneticDiversity1FR1, byGeneticDiversity1FR2, byGeneticDiversity1FR3, byGeneticDiversity1FR4) if (p_value <= 0.1):
cond = (sens_perf['decision'] == decision) & (sens_perf['auction'] == auc) sens_perf.loc[cond, 'rank'] = sens_perf.loc[cond, 'delta'].rank(method='average', ascending=False) # %% ANOVA and posthoc tests anova_perf = {} for decision in decision_types: for auc in auction_types: data = [ delta_dict_perf[auc, decision][x].values for x in delta_perf['names'] ] H, p = ss.kruskal(*data) df = pd.melt(delta_dict_perf[auc, decision], id_vars=[], value_vars=delta_dict_perf[auc, decision].columns) ph = sp.posthoc_conover(df, val_col='value', group_col='variable', p_adjust='holm') anova_perf[auc, decision] = {'anova_p': p, 'posthoc_matrix': ph} # %% Manually correct ranks corrected = pd.read_csv('Results_perf.csv') sens_perf['rank_corrected'] = corrected['rank_corrected'] with open('postprocess_dicts_sens_synth.pkl', 'wb') as f: pickle.dump([corr, sens_perf, delta_dict_perf, anova_perf], f)
# Save the per-sample diversity values and join them with the mapping table.
div_df = pd.DataFrame(div, columns=[metric])
div_df.to_csv("alphadiversity_" + mname + "_" + var + ".txt", sep="\t")
combined_df = (pd.concat([div_df, map_df], axis=1)
               .reset_index()
               .rename(columns={'index': 'sample'})
               .sort_values(by=[var]))

# Kruskal-Wallis test: one list of metric values per category of `var`.
cat_dict = {
    str(level): list(combined_df[combined_df[var] == level][metric])
    for level in getattr(combined_df, var).unique()
}
H, p = ss.kruskal(*cat_dict.values())

# Post-hoc test with Benjamini-Hochberg correction
con = sp.posthoc_conover(combined_df,
                         val_col=metric,
                         group_col=var,
                         p_adjust='fdr_bh')

# Write both test results to a plain-text report.
with open("statistics_" + mname + "_" + var + ".txt", "w") as st:
    st.writelines([
        "Kruskal-Wallis H-test:\n\n",
        "H\t" + str(H) + "\n",
        "p-value\t" + str(p) + "\n\n\n",
        "Conover post-hoc test with Benjamini/Hochberg correction:\n\n",
        con.to_string(),
    ])

# Bar plot of the metric per category, with standard-deviation error bars.
sns.set_style("ticks", {"ytick.major.size": "2.0"})
ax = sns.barplot(data=combined_df,
                 x=var,
                 y=metric,
                 color=col,
                 ci="sd",
                 errwidth=0.6,
                 capsize=0.1)
sns.despine(right=True)
plt.ylabel(label)
plt.savefig(figname, dpi=dpi)
end = time.time()
from denn import *
from scipy.stats import kruskal
import scikit_posthocs as sp
import pylustrator

pylustrator.start()
path = Path('../../data/results/experiment4')

# fitness plots
no_nn = pd.read_csv(path/'no_nn_mof.csv')
nn_normal_rand = pd.read_csv(path/'nn-normal-random_mof.csv')
nn_dist_rand = pd.read_csv(path/'nn-distribution-random_mof.csv')
nn_dropout_rand = pd.read_csv(path/'nn-dropout-random_mof.csv')

labels = ['no_nn', 'nn_normal_rand', 'nn_dist_rand', 'nn_drop_rand']

# One 1-D sample per experiment. A plain list (not np.array) also works
# when the experiments have different numbers of rows.
x = [
    no_nn.mof.to_numpy(),
    nn_normal_rand.mof.to_numpy(),
    nn_dist_rand.mof.to_numpy(),
    nn_dropout_rand.mof.to_numpy(),
]

# Omnibus test on the same `mof` samples that go into the post-hoc test
# (previously the whole DataFrames were passed to kruskal).
stat, p = kruskal(*x)
# val_col / group_col only apply to DataFrame input, so they are omitted.
pc = sp.posthoc_conover(x, p_adjust='holm')
print('Statistics=%.3f, p=%.3f' % (stat, p))
print(pc)

heatmap_args = {
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
}
ax, cbar = sp.sign_plot(pc, **heatmap_args)
# The groups are numbered in the result, so label the axes by experiment.
ax.set_xticklabels(labels)
ax.set_yticklabels(labels)
plt.show()
# Order the methods by their median SDR score on the vocals target.
df_sort_by = agg_df[(agg_df.metric == "SDR") & (agg_df.target == "vocals")]
methods_by_sdr = df_sort_by.score.groupby(
    df_sort_by.method).median().sort_values().index.tolist()

# NOTE(review): this figure is replaced by the second plt.figure() call
# below before anything is drawn on it -- confirm whether it is still needed.
f = plt.figure(figsize=(22, 20))

# resort them by median SDR
# Get sorting keys (sorted by median of SDR:vocals score)
# NOTE(review): the filter below selects metric == "SAR" although the
# comments and variable names refer to SDR -- confirm which metric is meant.
df_voc = agg_df[(agg_df.target == 'vocals') & (agg_df.metric == "SAR")]
targets_by_voc_sdr = df_voc.score.groupby(
    df_voc.method).median().sort_values().index.tolist()

# prepare the pairwise statistics
# No p_adjust is passed here, so the p-values are presumably unadjusted.
pc_voc = sp.posthoc_conover(df_voc,
                            val_col='score',
                            group_col='method',
                            sort=True)
print(pc_voc)

f = plt.figure(figsize=(10, 10))
# Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
cmap = ['1', '#ff2626', '#ffffff', '#fcbdbd', '#ff7272']
heatmap_args = {
    'cmap': cmap,
    'linewidths': 0.25,
    'linecolor': '0.5',
    'clip_on': False,
    'square': True,
    'cbar_ax_bbox': [0.90, 0.35, 0.04, 0.3]
}
# Draw the pairwise significance heatmap on the current figure.
sp.sign_plot(pc_voc, **heatmap_args)
# df = pd.DataFrame(rastringin).T # df.to_excel(excel_writer = "C:/Users/Tulio/Desktop/Mestrado/Busca_e_Otimizacao/search_and_optmization/src/utils/test.xlsx") data = a280 # print(statistics.pstdev(data[2])) # Friedman de grupo print(stats.friedmanchisquare(*data)) # Kruskal-Wallis de grupo print(stats.kruskal(*data)) #Teste de Conover baseado em Kruskal-Wallis pc = sp.posthoc_conover(data) #Caso precise mudar os indices e colunas do DataFrame pc.columns = ['GRASP 2-opt', 'GRASP mBUC', 'HC mBUC'] pc.index = ['GRASP 2-opt', 'GRASP mBUC', 'HC mBUC'] print(pc) #Heatmap do Teste de Conover cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef'] heatmap_args = { 'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True,
data.loc[data['Symptom'] == 2, 'Knowledge']) print('Statistics=%.3f \n p=%.4f' % (stat, p)) # In[37]: x = [ data.loc[data['Symptom'] == 0, 'Knowledge'], data.loc[data['Symptom'] == 1, 'Knowledge'], data.loc[data['Symptom'] == 2, 'Knowledge'] ] # In[38]: #post hoc with Conover test pc = sp.posthoc_conover(x) heatmap_args = { 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3] } sp.sign_plot(pc, **heatmap_args) # In[39]: # Post hoc with mann whitney pc2 = sp.posthoc_mannwhitney(x) heatmap_args = { 'linewidths': 0.25,
def compareBases(mainFolders, folders, tp, rep, lastGen, plotType, saveFile):
    """Load experiment logs, run rank tests on them and plot the distributions.

    For every ``mainFolder``/``folder`` combination the log file of each
    repetition ``1..rep`` is read from
    ``./<mainFolder>/<folder>xL/<i>/log/<file>.txt``. Each line is split on
    ``" - "`` and one column is collected as the measured variable.

    Args:
        mainFolders: Names of the base directories (plotted as ``'Base'``).
        folders: Names of the length directories (plotted as ``'Length'``).
        tp: What to compare: ``'evol'``, ``'nModules'``, ``'brokenConn'`` or
            ``'nConn'``, optionally with a ``'Base'`` suffix to group by
            base only instead of by (length, base).
        rep: Number of repetitions to read per folder.
        lastGen: If true, use only the last line of each log; otherwise use
            every line.
        plotType: ``'box'``, ``'swarm'``, ``'strip'`` or ``'violin'``. Any
            other value leaves the saved figure empty.
        saveFile: Prefix of the output ``.eps`` file name.

    Raises:
        ValueError: If ``tp`` is not one of the supported values.
    """
    # tp without the 'Base' suffix -> (log column, log file name, axis label)
    log_specs = {
        'evol': (1, 'evolution', 'Fitness'),
        'nModules': (2, 'bestFeatures', 'Number of Modules'),
        'brokenConn': (11, 'meanFeatures', 'Number of Broken Connections'),
        'nConn': (19, 'bestFeatures', 'Average Connections per Module'),
    }
    byBase = tp.endswith('Base')
    baseTp = tp[:-len('Base')] if byBase else tp
    if baseTp not in log_specs:
        # Previously an unknown tp left `variable` unbound and failed later.
        raise ValueError('unknown comparison type: {!r}'.format(tp))
    logCol, file, variable = log_specs[baseTp]

    frames = []
    for mainFolder in mainFolders:
        for folder in folders:
            data = []
            for i in range(rep):
                logPath = ('./' + mainFolder + '/' + folder + 'xL/' +
                           str(i + 1) + '/log/' + file + '.txt')
                # Context manager so the handle is closed after each file.
                with open(logPath, newline='') as csv_file:
                    rows = [
                        row[0].split(" - ")
                        for row in csv.reader(csv_file)
                        if row  # skip blank lines
                    ]
                if lastGen:
                    data.append(float(rows[-1][logCol]))
                else:
                    data.extend(float(row[logCol]) for row in rows)
            dfPartial = pd.DataFrame(data, columns=[variable])
            dfPartial['Length'] = folder
            dfPartial['Base'] = mainFolder
            frames.append(dfPartial)
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    dfAll = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Omnibus test across the groups.
    groupKeys = ['Base'] if byBase else ['Length', 'Base']
    samples = [
        group[variable].values for _, group in dfAll.groupby(groupKeys)
    ]
    print(scp_stats.kruskal(*samples))

    # Conover post-hoc heatmap, skipped for 'brokenConn' and the Base variants.
    if not byBase and tp != 'brokenConn':
        postHoc = sp.posthoc_conover(samples)
        heatmap_args = {
            'linewidths': 0.25,
            'linecolor': '0.5',
            'clip_on': False,
            'square': True,
            'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
        }
        sp.sign_plot(postHoc, **heatmap_args)

    plt.figure(figsize=(15, 10))
    plotters = {
        'box': sns.boxplot,
        'swarm': sns.swarmplot,
        'strip': sns.stripplot,
        'violin': sns.violinplot,
    }
    plotter = plotters.get(plotType)
    if plotter is not None:
        if byBase:
            plotter(data=dfAll, x='Base', y=variable, order=mainFolders)
        else:
            plotter(data=dfAll,
                    x='Length',
                    y=variable,
                    order=folders,
                    hue='Base')

    plt.savefig(saveFile + tp + plotType + '.eps', bbox_inches="tight")
    plt.show()
lbls = args.labels # Custom labels. else: lbls = [Path(exp_path).name for exp_path in args.experiments_paths] # Default labels. # Shapiro tests. print('Shapiro tests:') for (df, lbl) in zip(dfs,lbls): shapiro_test = stats.shapiro(df['travel_time']) print(f'\t{lbl}: {shapiro_test}') args = [df['travel_time'] for df in dfs] print(f'\nLevene\'s test: {stats.levene(*args)}') print(f'\nANOVA test: {stats.f_oneway(*args)}') data = [] groups = [] for (df, lbl) in zip(dfs, lbls): data.extend(df['travel_time'].tolist()) groups.extend([lbl for _ in range(len(df['travel_time'].tolist()))]) print('\nTukeyHSD:', pairwise_tukeyhsd(data, groups)) # Non-parametric test. print('\nKruskal (non-parametric) test:', stats.kruskal(*args)) # Post-hoc non-parametric comparisons. data = [df['travel_time'].tolist() for df in dfs] print(sp.posthoc_conover(data, p_adjust = 'holm'))
def vary_thresholds(fn, thresholds, cthres):
    """Plot Kruskal-Wallis and Conover p-values as the misfit threshold varies.

    For each survey year the field data read by ``read_field_data_year`` is
    passed to ``s.misfits``. For every threshold, each sample is assigned
    ``best = (number of misfit values above the threshold) + 1``; samples
    whose misfits are all above the threshold are skipped. The per-year
    ``best`` values are then compared with a Kruskal-Wallis test and a
    Conover post-hoc test (``fdr_tsbky`` adjustment) between consecutive
    survey years.

    Output goes to ``figures/vary_thresholds_gamma=<gamma>.txt`` (test
    results) and ``figures/vary_thresholds_gamma=<gamma>.pdf`` (the plot).

    Args:
        fn: Field data source handed to ``read_field_data_year``.
        thresholds: Iterable of misfit thresholds to evaluate.
        cthres: Threshold marked on the plot as the one actually used.
    """
    years = ["1996", "2001", "2007", "2012"]

    def ind(yr):
        return years.index(yr)

    fig, ax = plt.subplots(1, 1)
    ax.grid(False)
    # color-blind spectrum: http://personal.sron.nl/~pault/colourschemes.pdf
    colors = [
        "#88ccee", "#44aa99", "#999933", "#DDCC77", "#CC6677", "#882255",
        "#AA4499"
    ]

    # Crude per-threshold bookkeeping, keyed by year index.
    hm = {t: {v: [] for v in krange} for t in thresholds}
    total = {t: {ind(y): 0 for y in years} for t in thresholds}
    byyear = {t: {ind(y): [] for y in years} for t in thresholds}
    # Consecutive survey years to compare pairwise.
    compy = [(years[k], years[k + 1]) for k in range(len(years) - 1)]

    def process(tup):
        x, yr, sample = tup
        # Run the misfits calculation
        mf = s.misfits(x, krange, gamma=gamma)
        return x, mf, yr, sample

    cached_process = memory.cache(process)

    for year in tqdm(years):
        fdata = Parallel(n_jobs=num_cores)(
            delayed(cached_process)(tup)
            for tup in read_field_data_year(fn, [year]))
        for x, mf, yr, sample in fdata:
            mf = np.array(mf)
            # Find first drop below threshold
            tqdm.write("Length:{}. Interesting bases:{}. Misfits: {}".format(
                len(x), len([xx for xx in x if 1.0 - 1e-6 > xx > 1e-6]), mf))
            for thres in thresholds:
                best = sum(mf > thres)
                if best >= len(mf):
                    # No misfit value at or below the threshold: skip sample.
                    continue
                best += 1
                if best not in hm[thres]:
                    hm[thres][best] = []
                byyear[thres][ind(year)] += [best]
                hm[thres][best] += [ind(year)]
                total[thres][ind(year)] += 1

    x = {}
    kruskals = {}
    conovers = {(a, b): {} for (a, b) in compy}
    # Context manager so the report is closed even if plotting fails later.
    with open('figures/vary_thresholds_gamma={}.txt'.format(gamma),
              'w') as of:
        for thres in tqdm(thresholds):
            x[thres] = [byyear[thres][ind(y)] for y in years]
            try:
                kr = sts.kruskal(*x[thres])
                kruskals[thres] = kr[1]  # p-value
                of.write("\n{}\n{}\tKruskal-Willis:\n{}\n".format(
                    "*" * 80, thres, kr))
                pc = sp.posthoc_conover(x[thres],
                                        val_col='values',
                                        group_col='groups',
                                        p_adjust='fdr_tsbky')
                for (a, b) in compy:
                    # Groups are numbered from 1 in the post-hoc result.
                    conovers[(a, b)][thres] = pc[ind(a) + 1][ind(b) + 1]
                of.write("{}\tConover:\n{}\n".format(thres, pc))
            except Exception:
                # Best effort: a threshold whose tests cannot be computed is
                # recorded as NaN. (Was a bare ``except:``, which also
                # swallowed KeyboardInterrupt / SystemExit.)
                tqdm.write(
                    'Exception with threshold {}, NaNing'.format(thres))
                of.write("{}\tKruskal-Willis:\nNaN\n".format(thres))
                of.write("{}\tConover:\nNaN\n".format(thres))
                kruskals[thres] = float('nan')
                for (a, b) in compy:
                    conovers[(a, b)][thres] = float('nan')

    plt.ylabel('$q$-value')
    plt.xlabel('MOI Misfit Threshold ($T$)')
    plt.yscale('log')
    plt.xscale('log')
    plt.plot(thresholds, [kruskals[t] for t in thresholds],
             color='orange',
             linewidth=1.3,
             alpha=0.7,
             label='Kruskal-Willis')
    for i, (a, b) in enumerate(compy):
        plt.plot(thresholds, [conovers[a, b][t] for t in thresholds],
                 color=colors[i],
                 alpha=0.7,
                 label='Conover-Imam %s vs. %s' % (a, b))
    plt.legend(prop={'size': 10}, loc='lower right')
    plt.axvline(x=cthres,
                color='k',
                linestyle='--',
                linewidth=0.5,
                label='',
                alpha=0.8)
    plt.text(cthres,
             1e-5,
             'Threshold used',
             horizontalalignment='left',
             size='small',
             color='k',
             alpha=0.8)
    plt.axhline(y=0.05,
                color='b',
                linestyle='--',
                linewidth=0.5,
                label='0.05',
                alpha=0.7)
    plt.text(cthres,
             0.05,
             '$q$=0.05',
             horizontalalignment='right',
             size='small',
             color='b',
             alpha=0.7)

    with PdfPages(
            'figures/vary_thresholds_gamma={}.pdf'.format(gamma)) as pp:
        pp.savefig()
return pd.pivot_table(df, index=x, columns=c, values=y, aggfunc="count") for i in group: # 输出全部的分组信息 fenbu(i) #方差分析及事后检验 f, p = stats.f_oneway(*args) print(f, p) x = [list(args[1]), list(args[2]), list(args[3])] sp.posthoc_conover(x, group_col=x, val_col=statistics, p_adjust='holm') #独立样本t检验 ttest_group1 = df[df['GHQ分2类(1-很好;2-较差)'] == 1]['GHQ总分'] ttest_group2 = df[df['GHQ分2类(1-很好;2-较差)'] == 2]['GHQ总分'] group_mean = df.groupby('GHQ分2类(1-很好;2-较差)') group_mean['GHQ总分.1'].agg("mean") t, p = stats.ttest_ind(ttest_group1, ttest_group2) print(t, p) #事后检验 x = pd.DataFrame({ "k": [1, 2, 4, 5, 6], "j": [1, 3, 5, 7, 66],
nasa_df[nasa_df.experiment_type == "CONTROL"][item], nasa_df[nasa_df.experiment_type == "BUTTON"][item], nasa_df[nasa_df.experiment_type == "TOUCH"][item], center='median' ) if norm_p1 < 0.05 or norm_p2 < 0.05 or norm_p3 < 0.05 or norm_p4 < 0.05: _, anova_p = stats.friedmanchisquare( nasa_df[nasa_df.experiment_type == "BASELINE"][item], nasa_df[nasa_df.experiment_type == "CONTROL"][item], nasa_df[nasa_df.experiment_type == "BUTTON"][item], nasa_df[nasa_df.experiment_type == "TOUCH"][item], ) print("anova(friedman test)", anova_p) if anova_p < 0.05: print(sp.posthoc_conover(nasa_df, val_col=item, group_col="experiment_type")) else: melted_df = pd.melt(nasa_df, id_vars=["name", "experiment_type"], var_name="type", value_name="rate") aov = stats_anova.AnovaRM(melted_df[melted_df.type == item], "rate", "name", ["experiment_type"]) print("reperted anova: ", aov.fit()) multicomp_result = multicomp.MultiComparison(nasa_df[item], nasa_df.experiment_type) print(multicomp_result.tukeyhsd().summary()) melted_df = pd.melt(nasa_df, id_vars=nasa_df.columns.values[:2], var_name="args", value_name="value") # plot = sns.boxplot(x='args', y="value", hue="experiment_type", data=melted_df,showmeans=True, meanline=True, meanprops={"linestyle":"--", "color":"Red"}) axes = sns.barplot(x='args', y="value", hue="experiment_type", data=melted_df) axes.set_ylim([0, 10]) axes.set_ylabel('Workload Rating', fontsize=15) axes.set_xlabel('Scale', fontsize=15)
df_acc.method).median().sort_values().index.tolist() targets_by_voc_sdr_acc = [ x for x in targets_by_voc_sdr if x in targets_by_acc_sdr ] # get the two sortings df_voc['method'] = df_voc['method'].astype('category', categories=targets_by_voc_sdr, ordered=True) df_acc['method'] = df_acc['method'].astype('category', categories=targets_by_acc_sdr, ordered=True) # prepare the pairwise plots pc_voc = sp.posthoc_conover(df_voc, val_col='score', group_col='method') pc_acc = sp.posthoc_conover(df_acc, val_col='score', group_col='method') f = plt.figure(figsize=(10, 10)) # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05 cmap = ['1', '#ff2626', '#ffffff', '#fcbdbd', '#ff7272'] heatmap_args = { 'cmap': cmap, 'linewidths': 0.25, 'linecolor': '0.5', 'clip_on': False, 'square': True, 'cbar_ax_bbox': [0.90, 0.35, 0.04, 0.3] } sp.sign_plot(pc_voc, **heatmap_args)
# Kruskal–Wallis one-way analysis of variance hstat_quart_lsnum, hpval_quart_lsnum = stats.kruskal( *[df_grouped_quart.get_group(i).ls_num for i in range(4)]) hstat_quart_lsgsenum, hpval_quart_lsgsenum = stats.kruskal( *[df_grouped_quart.get_group(i).ls_gse_num for i in range(4)]) hstat_quart_lsprivnum, hpval_quart_lsprivnum = stats.kruskal( *[df_grouped_quart.get_group(i).ls_priv_num for i in range(4)]) hstat_quart_secnum, hpval_quart_secnum = stats.kruskal( *[df_grouped_quart.get_group(i).sec_num for i in range(4)]) '''NOTE: The Kruskal-Wallis test is to try out stuff. At least one group is dominating the other. As another try-out, let's try a conover test. ''' # Conover test, p-adjust???? cpval_quart_lsnum = sp.posthoc_conover(df, val_col='ls_num', group_col='quart_distance') cpval_quart_lsgsenum = sp.posthoc_conover(df, val_col='ls_gse_num', group_col='quart_distance') cpval_quart_lsprivnum = sp.posthoc_conover(df, val_col='ls_priv_num', group_col='quart_distance') cpval_quart_secnum = sp.posthoc_conover(df, val_col='sec_num', group_col='quart_distance') #------------------------------------------------------------ # Decile grouping #df['dec_distance'] = pd.qcut(df.log_min_distance, q = 10, labels = False)
get_precision(smote), get_precision(smote_data_aug)) recall_stat, recall_p_val = kruskal(get_recall(baseline), get_recall(umce), get_recall(smote), get_recall(smote_data_aug)) f1_stat, f1_p_val = kruskal(get_f1(baseline), get_f1(umce), get_f1(smote), get_f1(smote_data_aug)) print("precision: ", precision_stat, precision_p_val) print("recall: ", recall_stat, recall_p_val) print("f1: ", f1_stat, f1_p_val) print("precision: ") posthoc_precison = posthoc_conover([ get_precision(baseline), get_precision(umce), get_precision(smote), get_precision(smote_data_aug) ]) print(posthoc_precison) print("recall: ") posthoc_recall = posthoc_conover([ get_recall(baseline), get_recall(umce), get_recall(smote), get_recall(smote_data_aug) ]) print(posthoc_recall) print("f1: ") posthoc_f1 = posthoc_conover(
def field_longitudinal(fn, thres):
    """Summarise per-year strain counts for field samples and test year effects.

    For each survey year the field data from ``read_field_data_year`` is run
    through ``s.misfits`` and ``s.compute``. Each sample gets
    ``best = (number of misfit values above thres) + 1``; samples whose
    misfits are all above ``thres`` are skipped.

    Files written:
        * ``figures/strains.out`` -- one DOMSTR line per sample plus one
          STRAIN line per strain whose proportion is at least 0.05.
        * ``figures/field_longitudinal_gamma=<gamma>_thres=<thres>.txt`` --
          per-year counts, averages, medians, Mann-Whitney, Kruskal-Wallis
          and Conover results.
        * ``figures/field_longitudinal_gamma=<gamma>_thres=<thres>.pdf`` --
          stacked histogram of ``best`` per survey year.

    Args:
        fn: Field data source handed to ``read_field_data_year``.
        thres: Misfit threshold used to pick ``best`` for each sample.
    """
    pp = PdfPages('figures/field_longitudinal_gamma={}_thres={}.pdf'.format(
        gamma, thres))
    years = ["1996", "2001", "2007", "2012"]

    def ind(yr):
        return years.index(yr)

    fig, ax = plt.subplots(1, 1)
    ax.grid(False)
    # color-blind spectrum: http://personal.sron.nl/~pault/colourschemes.pdf
    colors = [
        "#88ccee", "#44aa99", "#999933", "#DDCC77", "#CC6677", "#882255",
        "#AA4499"
    ]
    # One colour per entry of krange.
    colors = colors[0:len(krange)]
    bars = []
    rows = []
    # Crude bookkeeping:
    #   hm[best]   -> year indices of the samples assigned that value
    #   total[yi]  -> number of retained samples for year index yi
    #   byyear[yi] -> `best` values of the samples for year index yi
    hm = {v: [] for v in krange}
    sm = 0
    total = {ind(y): 0 for y in years}
    byyear = {ind(y): [] for y in years}

    def process_vc(tup):
        x, yr, sample = tup
        # Run the misfits calculation
        mf = s.misfits(x, krange, gamma=gamma)
        sr = {k: s.compute(x, k, gamma=gamma) for k in krange}
        return x, mf, yr, sample, sr

    cached_process = memory.cache(process_vc)
    # NOTE(review): stf and (below) of are opened without a context manager,
    # so they stay open if an exception is raised before close().
    stf = open('figures/strains.out', 'w')
    for year in tqdm(years):
        fdata = Parallel(n_jobs=num_cores)(
            delayed(cached_process)(tup)
            for tup in read_field_data_year(fn, [year]))
        for x, mf, yr, sample, sr in fdata:
            mf = np.array(mf)
            # Find first drop below threshold
            tqdm.write("Length:{}. Interesting bases:{}. Misfits: {}".format(
                len(x), len([xx for xx in x if 1.0 - 1e-6 > xx > 1e-6]), mf))
            best = sum(mf > thres)
            if best >= len(mf):
                # Every misfit value is above the threshold: skip the sample.
                continue
            best += 1
            if best not in hm:
                hm[best] = []
            byyear[ind(year)] += [best]
            hm[best] += [ind(year)]
            total[ind(year)] += 1
            # Print out the strain sequences of at least 5% proportion in a sample
            assert best in sr
            print(sr[best][1], max(sr[best][1]))
            # NOTE(review): the DOMSTR value is written without the 100.0
            # factor used on the STRAIN lines below -- confirm the intended
            # unit.
            stf.write("DOMSTR\t{}\t{:.2f}%\n".format(yr, max(sr[best][1])))
            for i, f in enumerate(sr[best][1]):
                if f >= 0.05:
                    # Strain sequence entries are written as ints, NaN as "N".
                    stf.write("STRAIN\t{}\t{}\t{}\t{:.2f}%\t{}\n".format(
                        sample, yr, i, 100.0 * sr[best][1][i], "".join([
                            "{}".format(int(z)) if not np.isnan(z) else "N"
                            for z in sr[best][0][i]
                        ])))
    stf.close()

    of = open(
        'figures/field_longitudinal_gamma={}_thres={}.txt'.format(
            gamma, thres), 'w')
    # Per-year summary: count per krange value, then averages and median.
    for year in years:
        of.write("Year %s\t" % year + "\t".join([
            "%d=%d" % (v, len(list(filter(lambda y: y == ind(year), hm[v]))))
            for v in krange
        ]))
        of.write("\tAverage (including 5+):\t" + "%2.4f" %
                 (np.mean(byyear[ind(year)])) +
                 "\tAverage (excluding 5):\t" + "%2.4f" %
                 (np.mean([sc for sc in byyear[ind(year)] if sc < 5])))
        of.write("\tMedian:\t" + "%2.4f" % (np.median(byyear[ind(year)])) +
                 "\n")

    # Stacked histogram: one series per krange value, binned by year index.
    m = [hm[v] for v in krange]
    plt.ylabel('% samples')
    plt.xlabel('Survey year')
    plt.xticks(np.arange(len(years)), years)
    plt.ylim([0, 100])
    # Weight each sample by 100 / (samples in its year) so bars are percentages.
    weights = np.array([[100.0 / float(total[int(y)]) for y in hm[v]]
                        for v in krange])
    bins = np.arange(len(years) + 1) - 0.5
    hatch = '/'
    # NOTE(review): "$\geq$" relies on "\g" being left alone as an unknown
    # escape; a raw string would avoid the invalid-escape warning.
    _, _, patches = plt.hist(
        m,
        bins=bins,
        histtype='bar',
        stacked=True,
        weights=weights,
        rwidth=0.5,
        color=colors,
        label=[
            "%s%d strain%s" % ("=" if v != krange[-1] else "$\geq$", v,
                               "s" if v != krange[0] else "") for v in krange
        ])  #, hatch=hatch)
    plt.legend(
        bbox_to_anchor=(1.04, 0.5),
        loc="center left",
        borderaxespad=0,
        prop={'size': 10},
    )
    mm = np.array(m)
    # lk[year][v] -> number of samples of that year assigned the value v.
    lk = {
        year: {
            v: len(list(filter(lambda y: y == ind(year), hm[v])))
            for v in krange
        }
        for year in years
    }
    # Write the percentage in the middle of every non-empty bar segment.
    for j, bc in enumerate(patches):
        for i, p in enumerate(bc):
            #l = np.sum(np.array(byyear[i]) == len(patches)-j-1)
            l = lk[years[i]][krange[j]]
            if l == 0:
                continue
            h1 = p.get_height()
            print("{} {}".format(p, l))
            z = 100.0 * l / float(sum(lk[years[i]].values()))
            ax.text(p.get_x() + p.get_width() / 2.,
                    p.get_y() + h1 / 2.,
                    "%d%%" % int(z),
                    ha="center",
                    va="center",
                    color="black",
                    fontsize=12,
                    fontweight="bold")
    pp.savefig(bbox_inches="tight")
    pp.close()

    for y in years:
        of.write("%s: length %d\n" % (y, len(byyear[ind(y)])))
    of.write("{}\n".format(byyear[ind("1996")]))
    # Pairwise Mann-Whitney U tests for two pairs of survey years.
    of.write("H1\t{}\t1996 vs 2001:\t{}\n".format(
        thres, sts.mannwhitneyu(byyear[ind("1996")], byyear[ind("2001")])))
    of.write("H2\t{}\t2007 vs 2012:\t{}\n".format(
        thres, sts.mannwhitneyu(byyear[ind("2007")], byyear[ind("2012")])))
    # Omnibus and post-hoc tests across all survey years.
    x = [byyear[ind(y)] for y in years]
    #pc = sp.posthoc_conover(x, val_col='values', group_col='groups', p_adjust='holm')
    kr = sts.kruskal(*x)
    of.write("Kruskal-Willis:\n{}\n".format(kr))
    pc = sp.posthoc_conover(x,
                            val_col='values',
                            group_col='groups',
                            p_adjust='fdr_tsbky')
    of.write("Conover:\n{}\n".format(pc))
    # Format: diagonal, non-significant, p<0.001, p<0.01, p<0.05
    cmap = ['1', '#fb6a4a', '#08306b', '#4292c6', '#c6dbef']
    heatmap_args = {
        'cmap': cmap,
        'linewidths': 0.25,
        'linecolor': '0.5',
        'clip_on': False,
        'square': True,
        'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]
    }
    # NOTE(review): this heatmap is drawn after pp.close(), so it is not
    # saved to the PDF -- confirm whether that is intended.
    sp.sign_plot(pc, **heatmap_args)
    of.close()
# In[141]: data = df[(df['Country']=='Slovenia')| (df['Country']=='Denmark')| (df['Country']=='Cyprus')| (df['Country']=='Japan') | (df['Country']=='Switzerland')] df2 = data[['Country','Life expectancy ']] # In[142]: scikit_posthocs.posthoc_conover(a = df2, val_col = 'Life expectancy ', group_col = 'Country') # The Pvalue for countries (Cyprus,Japan) , (Cyprus , Switzerland),(Denmark,Japan),(Denmark,Switzerland),(Japan,Slovenia) and (Slovenia,Switzerland) is less than alpha and thus there is difference in Life expectancies between these countries # ### Test-4 # <b>Test the claim that population depends upon the status and Year</b> # In[143]: df3 = df[['Status', 'Year', 'Population']].dropna() df3 # The null and alternative hypothesis<br>
tracker.write("\nANOVA results: cor %f| p %f\n" % cor) else: cor = stats.kruskal( df['total_score'][(df['conflict'] == "Standoff") & (df['is_kashmir'] == True)], df['total_score'][(df['conflict'] == "Mumbai") & (df['is_kashmir'] == True)], df['total_score'][(df['conflict'] == "Burhan") & (df['is_kashmir'] == True)], df['total_score'][(df['conflict'] == "Non-conflict") & (df['is_kashmir'] == True)]) tracker.write("\nKruskal Wallis results: cor %f| p %f\n" % cor) tracker.write("\nConover Post Hoc Test\n") sp.posthoc_conover(df[df["is_kashmir"] == True], val_col='total_score', group_col='conflict').to_csv(tracker, mode="a") #HYPOTHESIS 3 Pakistan conflict to Pak non conflict tracker.write( "\n\n\n\nHYPOTHESIS 3: Pakistan-related headlines will have more negative sentiment scores on average in conflict periods than Pakistan-related headlines in non-conflict periods\r\n" ) rp.summary_cont(df.groupby(['is_pakistan', 'conflict'])['total_score']).to_csv(tracker, mode="a") levene = stats.levene( df['total_score'][(df['conflict'] == "Standoff") & (df['is_pakistan'] == True)], df['total_score'][(df['conflict'] == "Mumbai")
def posthoc(self, df, x, y):
    """Return Benjamini-Hochberg adjusted Conover p-values for ``y`` by ``x``.

    Rows with a missing value in either column are dropped before the test.

    Args:
        df: DataFrame containing both columns.
        x: Name of the grouping column.
        y: Name of the value column.

    Returns:
        The pairwise p-value matrix returned by ``sp.posthoc_conover``.
    """
    complete_rows = df[[x, y]].dropna()
    return sp.posthoc_conover(complete_rows,
                              val_col=y,
                              group_col=x,
                              p_adjust='fdr_bh')