def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    """Draw a dodged bar chart of counts per subgroup and save it as a PNG.

    One bar is drawn for every (subgroup, category) pair found in the input.
    Subgroups are placed on the x axis in the order returned by the
    module-level ``_sort_subgroup``; bars are filled by category and labelled
    with their count.

    Args:
        subgroups_to_sses_to_n_count: Mapping ``{subgroup: {category: count}}``.
        tname: Name of the category column in the data frame; also used as
            the ``fill`` aesthetic.
        file_name: Name of the PNG file, created under ``OUTPUT_PATH``.
    """
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            if n_count > max_count:
                max_count = n_count

    r_columns_to_data = {
        'subgroup': ro.FactorVector(
            columns_to_data['subgroup'],
            levels=ro.StrVector(
                _sort_subgroup(set(columns_to_data['subgroup'])))),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count']),
    }
    df = ro.DataFrame(r_columns_to_data)

    # Upper y limit: the next multiple of 1000 strictly above the tallest
    # bar.  Floor division is required here; with "/" Python 3 performs true
    # division, "x / 1000 * 1000" is a no-op and the limit is never rounded.
    max_count = int(max_count // 1000 * 1000 + 1000)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path,
                   df))

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    try:
        gp = ggplot2.ggplot(df)
        pp = (
            gp
            + ggplot2.aes_string(x='subgroup', y='count', fill=tname)
            + ggplot2.geom_bar(position="dodge", width=0.8, stat="identity")
            + ggplot2.theme_bw()
            + ggplot2.theme_classic()
            + ggplot2.theme(**{'legend.title': ggplot2.element_blank()})
            + ggplot2.theme(
                **{'legend.text': ggplot2.element_text(size=40)})
            + ggplot2.theme(
                **{'axis.text.x': ggplot2.element_text(size=40, angle=45)})
            + ggplot2.theme(
                **{'axis.text.y': ggplot2.element_text(size=40)})
            + ggplot2.scale_y_continuous(
                expand=ro.IntVector([0, 0]),
                limits=ro.IntVector([0, max_count]))
            + ggplot2.geom_text(
                ggplot2.aes_string(label='count'),
                size=6,
                angle=35,
                hjust=-0.1,
                position=ggplot2.position_dodge(width=0.8),
                vjust=-0.2)
        )
        pp.plot()
        logging.info(str.format("Output step3 file {}", histogram_file_path))
    finally:
        # Always close the PNG device, even when plotting fails, so the file
        # is flushed and later plots do not draw into a stale device.
        grdevices.dev_off()
def bargraph_language(cfg, values):
    """Write one execution-time bar chart PDF per language.

    For each language in ``cfg.languages`` the samples
    ``values[prob][var][lang][0]`` are summarised for every problem/variation
    pair: the bar height is the R ``mean`` of the samples and the error bar
    is half the width of the 99.9% t-test confidence interval (stored in the
    ``SE`` column).  The chart is written to
    ``bargraph-executiontime-lang-<lang>.pdf`` in the working directory.

    Args:
        cfg: Object exposing ``languages``, ``problems`` and ``variations``.
        values: Nested mapping indexed as ``values[prob][var][lang][0]``.
    """
    r = robjects.r

    for lang in cfg.languages:
        times = []
        varss = []
        probs = []
        ses = []

        for prob in cfg.problems:
            for var in cfg.variations:
                # The legend shows the pretty variation names.
                varss.append(pretty_varis[var])
                probs.append(prob)

                data = FloatVector(values[prob][var][lang][0])
                times.append(r['mean'](data)[0])

                # The argument name must be exactly "conf.level": with a
                # leading space R does not match it and silently keeps the
                # default confidence level.
                t_result = r['t.test'](
                    data, **{"conf.level": 0.999}).rx('conf.int')[0]
                ses.append((t_result[1] - t_result[0]) / 2)

        r.pdf('bargraph-executiontime-lang-' + lang + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            df = robjects.DataFrame({
                'Variation': StrVector(varss),
                'Problem': StrVector(probs),
                'Time': FloatVector(times),
                'SE': FloatVector(ses),
            })

            limits = ggplot2.aes(ymax='Time + SE', ymin='Time - SE')
            dodge = ggplot2.position_dodge(width=0.9)
            gp = ggplot2.ggplot(df)

            pp = (
                gp
                + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
                + ggplot2.geom_bar(position='dodge', stat='identity')
                + ggplot2.geom_errorbar(limits, position=dodge, width=0.25)
                + ggplot2_options()
                + ggplot2_colors()
                + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
                + robjects.r('ylab("Execution time (in seconds)")')
            )
            pp.plot()
        finally:
            # Close this language's PDF even if plotting raised.
            r['dev.off']()
def bargraph_language(cfg, values):
    """Write one execution-time bar chart PDF per language.

    For each language in ``cfg.languages`` the samples
    ``values[prob][var][lang][0]`` are summarised for every problem/variation
    pair: the bar height is the R ``mean`` of the samples and the error bar
    is half the width of the 99.9% t-test confidence interval (stored in the
    ``SE`` column).  The chart is written to
    ``bargraph-executiontime-lang-<lang>.pdf`` in the working directory.

    Args:
        cfg: Object exposing ``languages``, ``problems`` and ``variations``.
        values: Nested mapping indexed as ``values[prob][var][lang][0]``.
    """
    r = robjects.r

    for lang in cfg.languages:
        times = []
        varss = []
        probs = []
        ses = []

        for prob in cfg.problems:
            for var in cfg.variations:
                # The legend shows the pretty variation names.
                varss.append(pretty_varis[var])
                probs.append(prob)

                data = FloatVector(values[prob][var][lang][0])
                times.append(r['mean'](data)[0])

                # The argument name must be exactly "conf.level": with a
                # leading space R does not match it and silently keeps the
                # default confidence level.
                t_result = r['t.test'](
                    data, **{"conf.level": 0.999}).rx('conf.int')[0]
                ses.append((t_result[1] - t_result[0]) / 2)

        r.pdf('bargraph-executiontime-lang-' + lang + '.pdf',
              height=pdf_height(), width=pdf_width())
        try:
            df = robjects.DataFrame({
                'Variation': StrVector(varss),
                'Problem': StrVector(probs),
                'Time': FloatVector(times),
                'SE': FloatVector(ses),
            })

            limits = ggplot2.aes(ymax='Time + SE', ymin='Time - SE')
            dodge = ggplot2.position_dodge(width=0.9)
            gp = ggplot2.ggplot(df)

            pp = (
                gp
                + ggplot2.aes_string(x='Problem', y='Time', fill='Variation')
                + ggplot2.geom_bar(position='dodge', stat='identity')
                + ggplot2.geom_errorbar(limits, position=dodge, width=0.25)
                + ggplot2_options()
                + ggplot2_colors()
                + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
                + robjects.r('ylab("Execution time (in seconds)")')
            )
            pp.plot()
        finally:
            # Close this language's PDF even if plotting raised.
            r['dev.off']()
def generate_step3_9_n_count_histogram(place_type_pos_type_to_count,
                                       file_name):
    """Draw a dodged bar chart of counts per position type and save a PNG.

    Every key of the input has the form ``"<place>_<pos>"``; it is split on
    ``'_'`` into the ``place`` column (bar fill) and the ``pos`` column
    (x axis).  Each bar is labelled with its count.

    Args:
        place_type_pos_type_to_count: Mapping ``{"<place>_<pos>": count}``.
            Keys must contain exactly one underscore, otherwise the
            two-value unpacking raises ``ValueError``.
        file_name: Name of the PNG file, created under ``OUTPUT_PATH``.
    """
    columns_to_data = {'place': [], 'pos': [], 'count': []}
    max_count = 0
    for place_pos_type, n_count in place_type_pos_type_to_count.items():
        place_type, pos_type = place_pos_type.split('_')
        columns_to_data['place'].append(place_type)
        columns_to_data['pos'].append(pos_type)
        columns_to_data['count'].append(n_count)
        if n_count > max_count:
            max_count = n_count

    r_columns_to_data = {
        'place': ro.StrVector(columns_to_data['place']),
        'pos': ro.StrVector(columns_to_data['pos']),
        'count': ro.IntVector(columns_to_data['count']),
    }
    df = ro.DataFrame(r_columns_to_data)

    # Upper y limit: the next multiple of the step strictly above the tallest
    # bar (steps of 1000 for large counts, 100 otherwise).  Floor division is
    # required; with "/" Python 3 performs true division and the value is
    # never rounded to a multiple of the step.
    step = 1000 if max_count > 1000 else 100
    max_count = int(max_count // step * step + step)

    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path,
                   df))

    grdevices.png(file=histogram_file_path, width=1024, height=512)
    try:
        gp = ggplot2.ggplot(df)
        pp = (
            gp
            + ggplot2.aes_string(x='pos', y='count', fill='place')
            + ggplot2.geom_bar(position="dodge", stat="identity")
            + ggplot2.theme_bw()
            + ggplot2.theme_classic()
            + ggplot2.theme(
                **{'axis.text.x': ggplot2.element_text(size=35)})
            + ggplot2.theme(
                **{'axis.text.y': ggplot2.element_text(size=35)})
            + ggplot2.scale_y_continuous(
                expand=ro.IntVector([0, 0]),
                limits=ro.IntVector([0, max_count]))
            + ggplot2.geom_text(
                ggplot2.aes_string(label='count'),
                position=ggplot2.position_dodge(width=0.8),
                size=10,
                angle=35,
                hjust=-0.2,
                vjust=-0.5)
        )
        pp.plot()
        logging.info(str.format("Output step3 file {}", histogram_file_path))
    finally:
        # Always close the PNG device, even when plotting fails.
        grdevices.dev_off()
def rest():
    """Exploratory statistics and plots on the quartile summary tables.

    Reads the module-level ``q1_median_q3_rep_wide``,
    ``q1_median_q3_rep_long`` and ``beta_values`` objects, runs several
    significance tests (Mann-Whitney U, Kruskal-Wallis, Dunn post-hoc,
    one-way ANOVA, Tukey HSD) and saves two plots to the same PNG path.
    Nothing is returned; several statements are bare expressions whose value
    is discarded (they look like leftovers from interactive/notebook use).

    NOTE(review): the source of this function was whitespace-collapsed; the
    extent of the ``for`` loop body below is a reconstruction -- confirm.
    """
    df = q1_median_q3_rep_wide
    pops = ["pdc", "dc-cd11b", "dc-cd8a"]
    stats_l = []
    for stat, (popa, popb) in product(["Q1", "median", "Q3"],
                                      product(pops, pops)):
        print(stat, popa, popb)
        # The loop variables are overwritten with fixed values and the test
        # runs on hard-coded sample data, so every iteration appends the
        # same row.  The real per-population queries are commented out below.
        popa = "hsc"
        popb = "pdc"
        stat = "median"
        mw_u, pvalue = scipy.stats.mannwhitneyu(
            [0.8, 0.81, 0.79],
            [0.4, 0.39, 0.41],
            # df.query("Population == @popa")[stat].to_numpy(),
            # df.query("Population == @popb")[stat].to_numpy(),
            use_continuity=True,
            alternative="two-sided",
        )
        pvalue
        stats_l.append([stat, popa, popb, mw_u, pvalue])
    # Name the columns of the collected test results.
    stats_df = pd.DataFrame(stats_l).set_axis(
        ["stat", "popA", "popB", "U", "pvalue"], axis=1)
    # Wide table: one row per population, one column per replicate, holding
    # the "mean" column of the summary table.
    kruskal_format_means = pd.pivot(
        q1_median_q3_rep_wide.query("Population in @pops"),
        index="Population",
        columns="Replicate",
        values="mean",
    )
    import scikit_posthocs
    # Kruskal-Wallis test across the selected populations.
    stat, p_value = scipy.stats.kruskal(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops], )
    # Dunn post-hoc test with Benjamini-Hochberg ('fdr_bh') adjustment.
    dunn_res_df = scikit_posthocs.posthoc_dunn(
        kruskal_format_means.to_numpy(),
        p_adjust='fdr_bh',
        sort=True,
    )
    # One-way ANOVA on the same groups.
    stat, pvalue = scipy.stats.f_oneway(
        *[kruskal_format_means.loc[pop].to_numpy() for pop in pops], )
    # NOTE(review): only the top-level package is imported here; the
    # attribute access statsmodels.stats.multicomp below presumably relies on
    # that submodule having been imported elsewhere -- confirm.
    import statsmodels
    # Long format: the stacked values end up in a column named 0.
    df = kruskal_format_means.stack().reset_index()
    kruskal_format_means
    res = statsmodels.stats.multicomp.pairwise_tukeyhsd(
        df[0], df['Population'].to_numpy(), alpha=0.05)
    res.pvalues
    res.summary()
    # wilcox.test(c(0.8, 0.79, 0.81), c(0.4, 0.39, 0.41), paired=F, exact=F)
    plot_pops = ["pdc", "dc-cd8a", "dc-cd11b"]
    results_dir = "/icgc/dkfzlsdf/analysis/hs_ontogeny/notebook-data/gNs4xcMJscaLLwlt"
    point_plot_quartiles_png = results_dir + "/point-plot-quartiles.png"
    q1_median_q3_rep_wide
    # Within each (Population, stat) group, number the rows 1..n in
    # descending order of value; the number is used as the plotting group.
    ggplot_data = (
        q1_median_q3_rep_long.query("Population in @plot_pops").sort_values(
            "value",
            ascending=False,
        ).groupby(["Population", "stat"]).apply(
            lambda df: df.assign(group_order=np.arange(1, df.shape[0] + 1))))
    # Point plot of the statistics per population, dodged by group_order.
    g = (gg.ggplot(ggplot_data) + gg.aes_string(
        x="Population", y="value", group="group_order", color="stat") +
         gg.geom_point(position=gg.position_dodge(width=0.5), size=1) +
         mh_rpy2_styling.gg_paper_theme + gg.labs(y='Methylation (%)', x=''))
    a = 3
    rpy2_utils.image_png2(g, (ut.cm(6), ut.cm(6)))
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        # additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(6),
    )
    q1_median_q3_rep_wide
    # Box plot drawn from precomputed statistics (stat="identity"), one box
    # per Population+Replicate sample.
    g = (
        gg.ggplot(
            q1_median_q3_rep_wide.query("Population in @plot_pops").assign(
                sample=lambda df: df["Population"].astype(str) + df[
                    "Replicate"].astype(str))) + gg.geom_boxplot(
                        gg.aes_string(
                            x="Population",
                            fill="Population",
                            group="sample",
                            lower="Q1",
                            upper="Q3",
                            middle="median",
                            ymin="min1",
                            ymax="max99",
                            # position=gg.position_dodge(width=0.5),
                        ),
                        stat="identity",
                    )
        # + mh_rpy2_styling.gg_paper_theme
        + gg.theme(axis_text_x=gg.element_text(angle=90, hjust=1)) +
        gg.scale_fill_brewer(guide=False))
    a = 3
    # NOTE(review): this reuses point_plot_quartiles_png, the same path the
    # point plot above was saved to -- confirm the overwrite is intended.
    ut.save_and_display(
        g,
        png_path=point_plot_quartiles_png,
        additional_formats=tuple(),
        height=ut.cm(6),
        width=ut.cm(7),
    )
    # image_png2(g, (ut.cm(12), ut.cm(12)))
    beta_values.loc[:, ("hsc", "1")]
def bargraph_variation():
    """Write coding-time bar charts (raw and normalized) for each variation.

    Reads the module-level ``variations``, ``problems``, ``languages``,
    ``result`` and ``pretty_langs``.  For every variation two PDFs are
    written to the working directory:

    * ``bargraph-codingtime-var-<var>.pdf`` -- the times themselves;
    * ``bargraph-codingtime-var-norm-<var>.pdf`` -- the times divided by the
      smallest non-zero time recorded for the same problem.

    A missing ``result[lang][prob][var]`` entry is reported on stdout and
    counted as 0.  For variations whose name starts with ``'expert'`` the
    time of the corresponding non-expert variation (name with ``'expert'``
    removed) is added when it exists.
    """
    r = robjects.r

    for var in variations:
        values = []
        # Values normalized to the fastest language of each problem.
        nvalues = []
        langs = []
        probs = []

        for prob in problems:
            # Times of all languages for this problem.
            lvalues = []
            for lang in languages:
                langs.append(pretty_langs[lang])
                probs.append(prob)
                try:
                    value = result[lang][prob][var]
                except KeyError:
                    # Function-call form so the module also parses on
                    # Python 3.
                    print("Warning: no value for:")
                    print("%s %s %s" % (lang, prob, var))
                    value = 0  # FIXME to account for missing seq-version of Erlang

                # For the expert times, add expert and non-expert times
                # together.
                if var.startswith('expert'):
                    try:
                        value = value + result[lang][prob][var.replace(
                            'expert', '')]
                    except KeyError:
                        # No non-expert time recorded: keep the expert time
                        # on its own.
                        pass
                lvalues.append(value)

            values.extend(lvalues)

            # Normalize against the smallest recorded (non-zero) time.  When
            # nothing was recorded for this problem every value is 0, so any
            # non-zero divisor keeps them at 0 instead of min() raising on an
            # empty list.
            nonzero = [x for x in lvalues if x != 0]
            lmin = min(nonzero) if nonzero else 1
            # float() avoids truncating integer division on Python 2.
            nvalues.extend([float(x) / lmin for x in lvalues])

        charts = (
            # Plot histogram of actual times.
            ('bargraph-codingtime-var-' + var + '.pdf', values,
             'ylab("Coding time (in minutes)")'),
            # Plot histogram of times normalized with respect to the fastest
            # time for a problem.
            ('bargraph-codingtime-var-norm-' + var + '.pdf', nvalues,
             'ylab("Coding time (normalized to fastest)")'),
        )
        for pdf_name, chart_times, ylab in charts:
            r.pdf(pdf_name, height=pdf_height(), width=pdf_width())
            try:
                df = robjects.DataFrame({
                    'Language': StrVector(langs),
                    'Problem': StrVector(probs),
                    'Time': FloatVector(chart_times),
                })
                gp = ggplot2.ggplot(df)
                pp = (
                    gp
                    + ggplot2.aes_string(x='Problem', y='Time',
                                         fill='Language')
                    + ggplot2.geom_bar(position='dodge', stat='identity')
                    + ggplot2_options()
                    + ggplot2_colors()
                    + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
                    + robjects.r(ylab)
                )
                pp.plot()
            finally:
                # Every PDF device opened above is closed here; previously
                # two devices were opened per variation and only one closed.
                r['dev.off']()
def line_plot(cfg, var, control, change_name, changing, selector,
              base_selector, basis):
    """Plot speedup against thread count, one line per entry of ``changing``.

    An 'ideal' series (speedup equal to the thread count) is always included.
    For every other entry ``c`` the speedup at ``n`` threads is
    ``mean(base) / mean(selector(c)(n))`` and the error bar comes from the R
    ``pairwiseCI`` ratio interval at confidence level 0.999.

    The function draws on the current R graphics device and closes it with
    ``dev.off``; it does not open a device itself.  It uses the module-level
    names ``r``, ``mean``, ``pretty_langs``, ``probs`` and ``langs``.

    Args:
        cfg: Object exposing ``threads`` (iterated more than once).
        var: Not used by this function.
        control: Not used by this function.
        change_name: Column/legend name of the varying factor; when it is
            ``'Language'`` entries are shown through ``pretty_langs``.
        changing: Iterable of the entries to compare.
        selector: ``selector(c)`` returns a callable mapping a thread count
            to the timing samples for ``c``.
        base_selector: ``base_selector(c)`` returns the sequential baseline
            samples for ``c``.
        basis: ``'seq'`` keeps the sequential baseline, ``'p1'`` uses the
            one-thread run, ``'fastest'`` uses whichever has the lower mean.
    """
    for _ in cfg.threads:
        # NOTE(review): probs and langs are not assigned anywhere in this
        # function, so they must exist at module scope -- confirm.
        probs.append('ideal')
        langs.append('ideal')

    # Seed every column with the 'ideal' series: speedup == thread count and
    # a zero-width interval.
    speedups = list(cfg.threads)
    thrds = list(cfg.threads)
    lowers = list(cfg.threads)
    uppers = list(cfg.threads)
    changes = ['ideal'] * len(thrds)

    ci_options = {'var.equal': False, 'conf.level': 0.999}

    for c in changing:
        sel = selector(c)
        # Sequential baseline and the parallel program run with one thread.
        base = FloatVector(base_selector(c))
        base_p1 = FloatVector(sel(1))
        if basis == 'p1' or (basis == 'fastest'
                             and mean(base_p1) < mean(base)):
            base = base_p1

        for n in cfg.threads:
            ntimes = FloatVector(sel(n))

            # Confidence interval of the ratio base / n-thread times.
            type_labels = (['Base'] * r.length(base)[0]
                           + ['N'] * r.length(ntimes)[0])
            pair_df = DataFrame({
                'Times': base + ntimes,
                'Type': StrVector(type_labels),
            })
            ratio_test = r['pairwiseCI'](
                r('Times ~ Type'),
                data=pair_df,
                control='N',
                method='Param.ratio',
                **ci_options)[0][0]
            lowers.append(ratio_test[1][0])
            uppers.append(ratio_test[2][0])

            thread_mean = mean(ntimes)
            speedups.append(mean(base) / thread_mean)
            thrds.append(n)
            changes.append(
                pretty_langs[c] if change_name == 'Language' else c)

    df = DataFrame({
        'Speedup': FloatVector(speedups),
        'Threads': IntVector(thrds),
        change_name: StrVector(changes),
        'Lower': FloatVector(lowers),
        'Upper': FloatVector(uppers),
    })

    # Point shapes: one integer code per legend entry, 'ideal' first.
    if change_name == 'Language':
        legend_names = ['ideal'] + [pretty_langs[c] for c in changing]
    else:
        legend_names = ['ideal'] + list(changing)
    legendVec = IntVector(range(len(legend_names)))
    legendVec.names = StrVector(legend_names)

    plot = ggplot2.ggplot(df)
    limits = ggplot2.aes(ymax='Upper', ymin='Lower')
    dodge = ggplot2.position_dodge(width=0.9)

    x_title = ggplot2.theme_text(family='serif', face='bold', size=15,
                                 vjust=-0.2)
    pp = (
        plot
        + ggplot2.geom_line()
        + ggplot2.geom_point(size=3)
        + ggplot2.aes_string(x='Threads', y='Speedup', group=change_name,
                             color=change_name, shape=change_name)
        + ggplot2.scale_shape_manual(values=legendVec)
        + ggplot2.geom_errorbar(limits, width=0.25)
        + ggplot2_options()
        + ggplot2_colors()
        + ggplot2.opts(**{'axis.title.x': x_title})
        + robjects.r('ylab("Speedup")')
        + robjects.r('xlab("Cores")')
    )
    pp.plot()

    r['dev.off']()
def as_dataframe(cfg, results, basis):
    """Collect timing, memory and speedup data, save it, and plot speedups.

    One row is produced per (variation, language, problem, thread count).
    The rows are assembled into an R data frame that is assigned to the R
    variable ``df`` and saved to ``performance.Rda``.  The frame is then
    reshaped to wide format by variation and a faceted speedup plot is
    written to ``speedup-expertpar-all.pdf``.  Nothing is returned.

    Rows with exactly one of language/problem equal to ``'ideal'`` are
    synthetic: speedup equals the thread count and time, SE and memory are 0.
    For variations whose name contains ``'seq'`` the samples are looked up
    under ``cfg.threads[-1]`` and the speedup is fixed at 1.

    NOTE(review): the plot uses the ``*.expertpar`` columns, so it presumably
    requires ``'expertpar'`` to be among ``cfg.variations`` -- confirm.

    Args:
        cfg: Object exposing ``variations``, ``languages``, ``problems`` and
            an indexable ``threads``.
        results: Nested mapping indexed as
            ``results[thread][prob][var][lang][0]``.
        basis: ``'seq'`` uses the sequential run as speedup baseline,
            ``'p1'`` the one-thread run, ``'fastest'`` whichever has the
            lower mean.
    """
    r = robjects.r

    varis = []
    langs = []
    probs = []
    times = []
    threads = []

    # Speedups, with lower and upper bounds.
    speedups = []
    speedup_lowers = []
    speedup_uppers = []

    ses = []   # half-widths of the time confidence intervals
    mems = []  # memory usage

    langs_ideal = list(cfg.languages)
    langs_ideal.append('ideal')

    probs_ideal = list(cfg.problems)
    probs_ideal.append('ideal')

    for var in cfg.variations:
        is_seq = var.find('seq') >= 0
        for lang in langs_ideal:
            for prob in probs_ideal:
                for thread in cfg.threads:
                    if lang == 'ideal' and prob == 'ideal':
                        continue
                    if lang == 'ideal' or prob == 'ideal':
                        varis.append(var)
                        langs.append(pretty_langs[lang])
                        probs.append(prob)
                        threads.append(thread)
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    varis.append(var)
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    # The row keeps the loop's thread count, but sequential
                    # samples are stored under the last thread count.
                    if is_seq:
                        thread = cfg.threads[-1]

                    vals = FloatVector(results[thread][prob][var][lang][0])
                    avg_time = mean(vals)
                    times.append(avg_time)

                    # Time confidence interval.  The argument name must be
                    # exactly "conf.level": with a leading space R does not
                    # match it and silently keeps the default level.
                    t_result = r['t.test'](
                        vals, **{"conf.level": 0.999}).rx('conf.int')[0]
                    ses.append((t_result[1] - t_result[0]) / 2)

                    # Memory usage: first line of the memory output file.
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mem = mem_file.readline()
                    mems.append(float(mem))

                    # Dummy speedup for the sequential case, which is its own
                    # baseline.
                    if is_seq:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    # Speedup value and confidence interval.
                    seq_vals = results[cfg.threads[-1]][prob][var.replace(
                        'par', 'seq')][lang][0]

                    # Sequential baseline and the run with one thread.
                    base = FloatVector(seq_vals)
                    base_p1 = FloatVector(results[1][prob][var][lang][0])
                    if basis == 'p1' or (basis == 'fastest'
                                         and mean(base_p1) < mean(base)):
                        base = base_p1

                    labels = (['Base'] * r.length(base)[0]
                              + ['N'] * r.length(vals)[0])
                    pair_df = DataFrame({
                        'Times': base + vals,
                        'Type': StrVector(labels),
                    })
                    ratio_test = r['pairwiseCI'](
                        r('Times ~ Type'),
                        data=pair_df,
                        control='N',
                        method='Param.ratio',
                        **{'var.equal': False})[0][0]

                    speedups.append(mean(base) / avg_time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Threads': IntVector(threads),
        'Time': FloatVector(times),
        'SE': FloatVector(ses),
        'Speedup': FloatVector(speedups),
        'SpeedupLower': FloatVector(speedup_lowers),
        'SpeedupUpper': FloatVector(speedup_uppers),
        'Mem': FloatVector(mems),
    })

    r.assign('df', df)
    r('save (df, file="performance.Rda")')

    # Reshape the data so that variation is not a column itself but a suffix
    # of the other columns (time, speedup, ...).  The 'ideal' problem is
    # dropped as it is not wanted in this plot.
    df = r('''
redf = reshape (df, timevar="Variation", idvar = c("Language","Problem","Threads"), direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)
    try:
        change_name = 'Language'

        plot = ggplot2.ggplot(df)
        limits = ggplot2.aes(ymax='SpeedupUpper.expertpar',
                             ymin='SpeedupLower.expertpar')

        pp = (
            plot
            + ggplot2.geom_line()
            + ggplot2.geom_point(size=2.5)
            + robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))')
            + ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                                 group=change_name, color=change_name,
                                 shape=change_name)
            + ggplot2.geom_errorbar(limits, width=0.25)
            + ggplot2.opts(**{
                'axis.title.x': ggplot2.theme_text(
                    family='serif', face='bold', size=10, vjust=-0.2),
                'axis.title.y': ggplot2.theme_text(
                    family='serif', face='bold', size=10, angle=90,
                    vjust=0.2),
                'axis.text.x': ggplot2.theme_text(family='serif', size=10),
                'axis.text.y': ggplot2.theme_text(family='serif', size=10),
                'legend.title': ggplot2.theme_text(
                    family='serif', face='bold', size=10),
                'legend.text': ggplot2.theme_text(family='serif', size=10),
                'strip.text.x': ggplot2.theme_text(family='serif', size=10),
                'aspect.ratio': 1,
            })
            + robjects.r('ylab("Speedup")')
            + robjects.r('xlab("Number of cores")')
            + ggplot2.facet_wrap('Problem', nrow=2)
        )
        pp.plot()
    finally:
        # Close the PDF device even if plotting raised.
        r['dev.off']()
def bargraph_variation(cfg, values):
    """Write execution-time bar charts (raw and normalized) per variation.

    For every variation in ``cfg.variations`` the samples
    ``values[prob][var][lang][0]`` are summarised per problem and language:
    the bar height is the R ``mean`` and the error bar is half the width of
    the 99.9% t-test confidence interval (stored in the ``SE`` column).  Two
    PDFs are written to the working directory:

    * ``bargraph-executiontime-var-<var>.pdf`` -- mean times;
    * ``bargraph-executiontime-var-norm-<var>.pdf`` -- means and intervals
      divided by the smallest mean of the same problem.

    Args:
        cfg: Object exposing ``variations``, ``problems`` and ``languages``.
        values: Nested mapping indexed as ``values[prob][var][lang][0]``.
    """
    r = robjects.r

    for var in cfg.variations:
        avgs = []
        ses = []
        # Values normalized to the fastest language of each problem.
        navgs = []
        nses = []
        langs = []
        probs = []

        for prob in cfg.problems:
            # Results of all languages for this problem.
            lavgs = []
            lses = []
            for lang in cfg.languages:
                data = FloatVector(values[prob][var][lang][0])
                langs.append(pretty_langs[lang])
                probs.append(prob)

                # Named "avg" so the module-level mean helper is not
                # shadowed.
                avg = r['mean'](data)[0]
                lavgs.append(avg)

                t_result = r['t.test'](
                    data, **{"conf.level": 0.999}).rx('conf.int')[0]
                lses.append((t_result[1] - t_result[0]) / 2)

            avgs.extend(lavgs)
            ses.extend(lses)

            lmin = min(lavgs)
            navgs.extend([la / lmin for la in lavgs])
            nses.extend([ls / lmin for ls in lses])

        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Time': FloatVector(avgs),
            'SE': FloatVector(ses),
            'NormTime': FloatVector(navgs),
            'NormSE': FloatVector(nses),
            'TimeLabel': StrVector([str(round(t, 1)) + "s" for t in avgs]),
        })

        charts = (
            # Plot histogram of actual times.
            ('bargraph-executiontime-var-' + var + '.pdf', 'Time', 'SE',
             'ylab("Execution time (in seconds)")'),
            # Plot histogram of times normalized with respect to the fastest
            # time for a problem.
            ('bargraph-executiontime-var-norm-' + var + '.pdf', 'NormTime',
             'NormSE', 'ylab("Execution time (normalized to fastest)")'),
        )
        for pdf_name, y_col, se_col, ylab in charts:
            r.pdf(pdf_name, height=pdf_height(), width=pdf_width())
            try:
                limits = ggplot2.aes(ymax=y_col + ' + ' + se_col,
                                     ymin=y_col + ' - ' + se_col)
                dodge = ggplot2.position_dodge(width=0.9)
                gp = ggplot2.ggplot(df)
                pp = (
                    gp
                    + ggplot2.aes_string(x='Problem', y=y_col,
                                         fill='Language')
                    + ggplot2.geom_bar(position='dodge', stat='identity')
                    + ggplot2.geom_errorbar(limits, position=dodge,
                                            width=0.25)
                    + ggplot2_options()
                    + ggplot2_colors()
                    + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
                    + robjects.r(ylab)
                )
                pp.plot()
            finally:
                # Every PDF device opened above is closed here; previously
                # two devices were opened per variation and only one closed.
                r['dev.off']()
def line_plot(cfg, var, control, change_name, changing, selector,
              base_selector, basis):
    """Plot speedup against thread count, one line per entry of ``changing``.

    An 'ideal' series (speedup equal to the thread count) is always included.
    For every other entry ``c`` the speedup at ``n`` threads is
    ``mean(base) / mean(selector(c)(n))`` and the error bar comes from the R
    ``pairwiseCI`` ratio interval at confidence level 0.999.

    The function draws on the current R graphics device and closes it with
    ``dev.off``; it does not open a device itself.  It uses the module-level
    names ``r``, ``mean``, ``pretty_langs``, ``probs`` and ``langs``.

    Args:
        cfg: Object exposing ``threads`` (iterated more than once).
        var: Not used by this function.
        control: Not used by this function.
        change_name: Column/legend name of the varying factor; when it is
            ``'Language'`` entries are shown through ``pretty_langs``.
        changing: Iterable of the entries to compare.
        selector: ``selector(c)`` returns a callable mapping a thread count
            to the timing samples for ``c``.
        base_selector: ``base_selector(c)`` returns the sequential baseline
            samples for ``c``.
        basis: ``'seq'`` keeps the sequential baseline, ``'p1'`` uses the
            one-thread run, ``'fastest'`` uses whichever has the lower mean.
    """
    for _ in cfg.threads:
        # NOTE(review): probs and langs are not assigned anywhere in this
        # function, so they must exist at module scope -- confirm.
        probs.append('ideal')
        langs.append('ideal')

    # Seed every column with the 'ideal' series: speedup == thread count and
    # a zero-width interval.
    speedups = list(cfg.threads)
    thrds = list(cfg.threads)
    lowers = list(cfg.threads)
    uppers = list(cfg.threads)
    changes = ['ideal'] * len(thrds)

    ci_options = {'var.equal': False, 'conf.level': 0.999}

    for c in changing:
        sel = selector(c)
        # Sequential baseline and the parallel program run with one thread.
        base = FloatVector(base_selector(c))
        base_p1 = FloatVector(sel(1))
        if basis == 'p1' or (basis == 'fastest'
                             and mean(base_p1) < mean(base)):
            base = base_p1

        for n in cfg.threads:
            ntimes = FloatVector(sel(n))

            # Confidence interval of the ratio base / n-thread times.
            type_labels = (['Base'] * r.length(base)[0]
                           + ['N'] * r.length(ntimes)[0])
            pair_df = DataFrame({
                'Times': base + ntimes,
                'Type': StrVector(type_labels),
            })
            ratio_test = r['pairwiseCI'](
                r('Times ~ Type'),
                data=pair_df,
                control='N',
                method='Param.ratio',
                **ci_options)[0][0]
            lowers.append(ratio_test[1][0])
            uppers.append(ratio_test[2][0])

            thread_mean = mean(ntimes)
            speedups.append(mean(base) / thread_mean)
            thrds.append(n)
            changes.append(
                pretty_langs[c] if change_name == 'Language' else c)

    df = DataFrame({
        'Speedup': FloatVector(speedups),
        'Threads': IntVector(thrds),
        change_name: StrVector(changes),
        'Lower': FloatVector(lowers),
        'Upper': FloatVector(uppers),
    })

    # Point shapes: one integer code per legend entry, 'ideal' first.
    if change_name == 'Language':
        legend_names = ['ideal'] + [pretty_langs[c] for c in changing]
    else:
        legend_names = ['ideal'] + list(changing)
    legendVec = IntVector(range(len(legend_names)))
    legendVec.names = StrVector(legend_names)

    plot = ggplot2.ggplot(df)
    limits = ggplot2.aes(ymax='Upper', ymin='Lower')
    dodge = ggplot2.position_dodge(width=0.9)

    x_title = ggplot2.theme_text(family='serif', face='bold', size=15,
                                 vjust=-0.2)
    pp = (
        plot
        + ggplot2.geom_line()
        + ggplot2.geom_point(size=3)
        + ggplot2.aes_string(x='Threads', y='Speedup', group=change_name,
                             color=change_name, shape=change_name)
        + ggplot2.scale_shape_manual(values=legendVec)
        + ggplot2.geom_errorbar(limits, width=0.25)
        + ggplot2_options()
        + ggplot2_colors()
        + ggplot2.opts(**{'axis.title.x': x_title})
        + robjects.r('ylab("Speedup")')
        + robjects.r('xlab("Cores")')
    )
    pp.plot()

    r['dev.off']()
def as_dataframe(cfg, results, basis):
    """Collect timing, memory and speedup data, save it, and plot speedups.

    One row is produced per (variation, language, problem, thread count).
    The rows are assembled into an R data frame that is assigned to the R
    variable ``df`` and saved to ``performance.Rda``.  The frame is then
    reshaped to wide format by variation and a faceted speedup plot is
    written to ``speedup-expertpar-all.pdf``.  Nothing is returned.

    Rows with exactly one of language/problem equal to ``'ideal'`` are
    synthetic: speedup equals the thread count and time, SE and memory are 0.
    For variations whose name contains ``'seq'`` the samples are looked up
    under ``cfg.threads[-1]`` and the speedup is fixed at 1.

    NOTE(review): the plot uses the ``*.expertpar`` columns, so it presumably
    requires ``'expertpar'`` to be among ``cfg.variations`` -- confirm.

    Args:
        cfg: Object exposing ``variations``, ``languages``, ``problems`` and
            an indexable ``threads``.
        results: Nested mapping indexed as
            ``results[thread][prob][var][lang][0]``.
        basis: ``'seq'`` uses the sequential run as speedup baseline,
            ``'p1'`` the one-thread run, ``'fastest'`` whichever has the
            lower mean.
    """
    r = robjects.r

    varis = []
    langs = []
    probs = []
    times = []
    threads = []

    # Speedups, with lower and upper bounds.
    speedups = []
    speedup_lowers = []
    speedup_uppers = []

    ses = []   # half-widths of the time confidence intervals
    mems = []  # memory usage

    langs_ideal = list(cfg.languages)
    langs_ideal.append('ideal')

    probs_ideal = list(cfg.problems)
    probs_ideal.append('ideal')

    for var in cfg.variations:
        is_seq = var.find('seq') >= 0
        for lang in langs_ideal:
            for prob in probs_ideal:
                for thread in cfg.threads:
                    if lang == 'ideal' and prob == 'ideal':
                        continue
                    if lang == 'ideal' or prob == 'ideal':
                        varis.append(var)
                        langs.append(pretty_langs[lang])
                        probs.append(prob)
                        threads.append(thread)
                        speedups.append(thread)
                        speedup_lowers.append(thread)
                        speedup_uppers.append(thread)
                        times.append(0)
                        ses.append(0)
                        mems.append(0)
                        continue

                    varis.append(var)
                    langs.append(pretty_langs[lang])
                    probs.append(prob)
                    threads.append(thread)

                    # The row keeps the loop's thread count, but sequential
                    # samples are stored under the last thread count.
                    if is_seq:
                        thread = cfg.threads[-1]

                    vals = FloatVector(results[thread][prob][var][lang][0])
                    avg_time = mean(vals)
                    times.append(avg_time)

                    # Time confidence interval.  The argument name must be
                    # exactly "conf.level": with a leading space R does not
                    # match it and silently keeps the default level.
                    t_result = r['t.test'](
                        vals, **{"conf.level": 0.999}).rx('conf.int')[0]
                    ses.append((t_result[1] - t_result[0]) / 2)

                    # Memory usage: first line of the memory output file.
                    mem_filename = get_mem_output(lang, prob, var)
                    with open(mem_filename, 'r') as mem_file:
                        mem = mem_file.readline()
                    mems.append(float(mem))

                    # Dummy speedup for the sequential case, which is its own
                    # baseline.
                    if is_seq:
                        speedups.append(1)
                        speedup_lowers.append(1)
                        speedup_uppers.append(1)
                        continue

                    # Speedup value and confidence interval.
                    seq_vals = results[cfg.threads[-1]][prob][var.replace(
                        'par', 'seq')][lang][0]

                    # Sequential baseline and the run with one thread.
                    base = FloatVector(seq_vals)
                    base_p1 = FloatVector(results[1][prob][var][lang][0])
                    if basis == 'p1' or (basis == 'fastest'
                                         and mean(base_p1) < mean(base)):
                        base = base_p1

                    labels = (['Base'] * r.length(base)[0]
                              + ['N'] * r.length(vals)[0])
                    pair_df = DataFrame({
                        'Times': base + vals,
                        'Type': StrVector(labels),
                    })
                    ratio_test = r['pairwiseCI'](
                        r('Times ~ Type'),
                        data=pair_df,
                        control='N',
                        method='Param.ratio',
                        **{'var.equal': False})[0][0]

                    speedups.append(mean(base) / avg_time)
                    speedup_lowers.append(ratio_test[1][0])
                    speedup_uppers.append(ratio_test[2][0])

    df = robjects.DataFrame({
        'Language': StrVector(langs),
        'Problem': StrVector(probs),
        'Variation': StrVector(varis),
        'Threads': IntVector(threads),
        'Time': FloatVector(times),
        'SE': FloatVector(ses),
        'Speedup': FloatVector(speedups),
        'SpeedupLower': FloatVector(speedup_lowers),
        'SpeedupUpper': FloatVector(speedup_uppers),
        'Mem': FloatVector(mems),
    })

    r.assign('df', df)
    r('save (df, file="performance.Rda")')

    # Reshape the data so that variation is not a column itself but a suffix
    # of the other columns (time, speedup, ...).  The 'ideal' problem is
    # dropped as it is not wanted in this plot.
    df = r('''
redf = reshape (df, timevar="Variation", idvar = c("Language","Problem","Threads"), direction="wide")
redf$Problem <- factor(redf$Problem, levels = c("randmat","thresh","winnow","outer","product","chain"))
redf[which(redf$Problem != "ideal"),]
''')

    r.pdf('speedup-expertpar-all.pdf', height=6.5, width=10)
    try:
        change_name = 'Language'

        plot = ggplot2.ggplot(df)
        limits = ggplot2.aes(ymax='SpeedupUpper.expertpar',
                             ymin='SpeedupLower.expertpar')

        pp = (
            plot
            + ggplot2.geom_line()
            + ggplot2.geom_point(size=2.5)
            + robjects.r('scale_color_manual(values = c("#ffcb7e", "#1da06b", "#b94646", "#00368a", "#CCCCCC"))')
            + ggplot2.aes_string(x='Threads', y='Speedup.expertpar',
                                 group=change_name, color=change_name,
                                 shape=change_name)
            + ggplot2.geom_errorbar(limits, width=0.25)
            + ggplot2.opts(**{
                'axis.title.x': ggplot2.theme_text(
                    family='serif', face='bold', size=10, vjust=-0.2),
                'axis.title.y': ggplot2.theme_text(
                    family='serif', face='bold', size=10, angle=90,
                    vjust=0.2),
                'axis.text.x': ggplot2.theme_text(family='serif', size=10),
                'axis.text.y': ggplot2.theme_text(family='serif', size=10),
                'legend.title': ggplot2.theme_text(
                    family='serif', face='bold', size=10),
                'legend.text': ggplot2.theme_text(family='serif', size=10),
                'strip.text.x': ggplot2.theme_text(family='serif', size=10),
                'aspect.ratio': 1,
            })
            + robjects.r('ylab("Speedup")')
            + robjects.r('xlab("Number of cores")')
            + ggplot2.facet_wrap('Problem', nrow=2)
        )
        pp.plot()
    finally:
        # Close the PDF device even if plotting raised.
        r['dev.off']()
def bargraph_variation(cfg, values):
    """Write execution-time bar charts (raw and normalized) per variation.

    For every variation in ``cfg.variations`` the samples
    ``values[prob][var][lang][0]`` are summarised per problem and language:
    the bar height is the R ``mean`` and the error bar is half the width of
    the 99.9% t-test confidence interval (stored in the ``SE`` column).  Two
    PDFs are written to the working directory:

    * ``bargraph-executiontime-var-<var>.pdf`` -- mean times;
    * ``bargraph-executiontime-var-norm-<var>.pdf`` -- means and intervals
      divided by the smallest mean of the same problem.

    Args:
        cfg: Object exposing ``variations``, ``problems`` and ``languages``.
        values: Nested mapping indexed as ``values[prob][var][lang][0]``.
    """
    r = robjects.r

    for var in cfg.variations:
        avgs = []
        ses = []
        # Values normalized to the fastest language of each problem.
        navgs = []
        nses = []
        langs = []
        probs = []

        for prob in cfg.problems:
            # Results of all languages for this problem.
            lavgs = []
            lses = []
            for lang in cfg.languages:
                data = FloatVector(values[prob][var][lang][0])
                langs.append(pretty_langs[lang])
                probs.append(prob)

                # Named "avg" so the module-level mean helper is not
                # shadowed.
                avg = r['mean'](data)[0]
                lavgs.append(avg)

                t_result = r['t.test'](
                    data, **{"conf.level": 0.999}).rx('conf.int')[0]
                lses.append((t_result[1] - t_result[0]) / 2)

            avgs.extend(lavgs)
            ses.extend(lses)

            lmin = min(lavgs)
            navgs.extend([la / lmin for la in lavgs])
            nses.extend([ls / lmin for ls in lses])

        df = robjects.DataFrame({
            'Language': StrVector(langs),
            'Problem': StrVector(probs),
            'Time': FloatVector(avgs),
            'SE': FloatVector(ses),
            'NormTime': FloatVector(navgs),
            'NormSE': FloatVector(nses),
            'TimeLabel': StrVector([str(round(t, 1)) + "s" for t in avgs]),
        })

        charts = (
            # Plot histogram of actual times.
            ('bargraph-executiontime-var-' + var + '.pdf', 'Time', 'SE',
             'ylab("Execution time (in seconds)")'),
            # Plot histogram of times normalized with respect to the fastest
            # time for a problem.
            ('bargraph-executiontime-var-norm-' + var + '.pdf', 'NormTime',
             'NormSE', 'ylab("Execution time (normalized to fastest)")'),
        )
        for pdf_name, y_col, se_col, ylab in charts:
            r.pdf(pdf_name, height=pdf_height(), width=pdf_width())
            try:
                limits = ggplot2.aes(ymax=y_col + ' + ' + se_col,
                                     ymin=y_col + ' - ' + se_col)
                dodge = ggplot2.position_dodge(width=0.9)
                gp = ggplot2.ggplot(df)
                pp = (
                    gp
                    + ggplot2.aes_string(x='Problem', y=y_col,
                                         fill='Language')
                    + ggplot2.geom_bar(position='dodge', stat='identity')
                    + ggplot2.geom_errorbar(limits, position=dodge,
                                            width=0.25)
                    + ggplot2_options()
                    + ggplot2_colors()
                    + robjects.r('scale_x_discrete(limits=c("randmat", "thresh", "winnow", "outer", "product", "chain"))')
                    + robjects.r(ylab)
                )
                pp.plot()
            finally:
                # Every PDF device opened above is closed here; previously
                # two devices were opened per variation and only one closed.
                r['dev.off']()