# NOTE: project-local helpers used throughout this module (rfd,
# set_display_format_for_floats, generate_bound_statement,
# return_anova_summary_table, return_embedded_latex_tables,
# return_speedup_stats, convert_ram_text_to_gb, assign_contrast_rp_v_vm,
# assign_contrast_wired_v_wireless) are assumed to be imported from elsewhere
# in this repository.
import numpy as np
import pandas as pd
from scipy import stats


def return_summary_statistics_for_rp(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1,
        nt='rp',
        nm='eth',
        cluster_sizes_of_choice={'1': 1, '3': 3, '6': 6}):
    if not d:
        d = {'nm': nm, 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    series_by_cluster = {}
    s = {}
    for key, size in cluster_sizes_of_choice.items():
        series_by_cluster[key] = rfd(
            df_filtered, d={'nn': size})[measurement_of_interest]
        s[key] = series_by_cluster[key].describe()
    for x in s.values():
        x['range'] = x['max'] - x['min']
    v = pd.DataFrame(s).reset_index()
    set_display_format_for_floats(format_='{:.2g}'.format)
    return v.to_latex(index=False)
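# `rfd` is defined elsewhere in this repository. The sketch below is a
# hypothetical reconstruction, inferred only from the call sites in this
# module: it keeps the rows of `df` whose column `k` equals `v`, or whose
# value falls within `v` when `v` is a list or range, for each (k, v) in `d`.
def rfd_sketch(df, d):
    """Illustrative filter-rows-by-dict helper (not the project's actual rfd)."""
    mask = pd.Series(True, index=df.index)
    for column, value in d.items():
        if isinstance(value, (list, range)):
            mask &= df[column].isin(value)
        else:
            mask &= df[column] == value
    return df[mask]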
def return_summary_statistics_for_vms(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1):
    if not d:
        d = {'nt': 'vm', 'wl': wl, 'nn': nn, 't': range(10, 30 + 1)}
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    df_1gb = rfd(df_filtered, d={'ram': '1GB'})[measurement_of_interest]
    df_2gb = rfd(df_filtered, d={'ram': '2GB'})[measurement_of_interest]
    df_4gb = rfd(df_filtered, d={'ram': '4GB'})[measurement_of_interest]
    s = df_1gb.describe()
    t = df_2gb.describe()
    u = df_4gb.describe()
    for x in [s, t, u]:
        x['range'] = x['max'] - x['min']
    v = pd.DataFrame(dict(ram1GB=s, ram2GB=t, ram4GB=u)).reset_index()
    return v.to_latex(index=False)
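# `set_display_format_for_floats` (used above and in return_summary_statistics
# below) is also defined elsewhere. A plausible minimal version, assuming it
# simply sets pandas' global float format:
def set_display_format_for_floats_sketch(format_):
    pd.set_option('display.float_format', format_)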
def get_max_absolute_deviation_from_ref(
        csv_file='combined_results_revised.csv',
        wl='a',
        list_of_number_of_nodes=None,
        csv_reference_data_file='abramova_results.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        d_ref=None,
        t_range=None,
        node_type='rp',
        nm='eth'):
    # Initialize variables ---------------------------
    if not list_of_number_of_nodes:
        list_of_number_of_nodes = [1, 3, 6]
    if not t_range:
        t_range = range(10, 30 + 1)
    if not d:
        d = {'nt': node_type, 'wl': wl, 'nn': 1, 't': t_range, 'nm': nm}
    if not d_ref:
        # Copy so the two filters can be adjusted independently below.
        d_ref = dict(d)
    # ------------------------------------------------
    # Read from CSV files ----------------------------
    df = pd.read_csv(csv_file)
    df_ref = pd.read_csv(csv_reference_data_file)
    # ------------------------------------------------
    # For each cluster size, compare the extremes of the two distributions.
    bound_dict = {}
    for node_count in list_of_number_of_nodes:
        d['nn'] = node_count  # adjust the filters
        d_ref['nn'] = node_count
        df_flt = rfd(df, d)
        df_ref_flt = rfd(df_ref, d_ref)
        df_summary = df_flt.describe()
        df_ref_summary = df_ref_flt.describe()
        max_ = df_summary[measurement_of_interest]['max']
        min_ = df_summary[measurement_of_interest]['min']
        max_ref = df_ref_summary[measurement_of_interest]['max']
        min_ref = df_ref_summary[measurement_of_interest]['min']
        # The larger of the two cross-extreme gaps is the maximum absolute
        # deviation between any observed value and any reference value.
        bound_dict[node_count] = max(
            abs(max_ - min_ref), abs(min_ - max_ref))
    bound = max(bound_dict.values())
    return bound
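# Worked illustration of the bound above, with hypothetical numbers: if the
# observed runtimes span [50, 70] ms and the reference runtimes span
# [40, 60] ms, the largest absolute deviation between any observed/reference
# pair is max(|70 - 40|, |50 - 60|) = 30 ms.
assert max(abs(70 - 40), abs(50 - 60)) == 30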
def generate_bound_statements_rp_wired_and_wireless(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1,
        nt='rp',
        nm='eth'):
    s = ('The median value of the corresponding wired experiment will serve '
         'as the reference in this paragraph. ')
    d_wlan = {'nm': 'wlan', 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}
    d_eth = {'nm': 'eth', 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}
    df = pd.read_csv(csv_file)
    df_filtered_eth = rfd(df, d=d_eth)
    df_filtered_wlan = rfd(df, d=d_wlan)
    for node in [1, 3, 6]:
        df_eth = rfd(df_filtered_eth, d={'nn': node})[measurement_of_interest]
        df_wlan = rfd(df_filtered_wlan,
                      d={'nn': node})[measurement_of_interest]
        summary_eth = df_eth.describe()
        summary_wlan = df_wlan.describe()
        s += generate_bound_statement(
            max_=summary_wlan['max'],
            min_=summary_wlan['min'],
            series='a {}-node network'.format(node),
            ref=summary_eth['50%'])
    return s
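# `generate_bound_statement` is defined elsewhere. Judging from its keyword
# arguments (max_, min_, series, ref), it plausibly phrases how far the
# observed extremes stray from the reference value. Hypothetical sketch:
def generate_bound_statement_sketch(max_, min_, series, ref):
    bound = max(abs(max_ - ref), abs(min_ - ref))
    return ('For {}, every observation fell within {:.0f} ms of the '
            'reference value. '.format(series, bound))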
def return_summary_statistics(
        csv_file='results/exp0_cstress/cassandra_stress_results_compression_strategy.csv',
        measurement_of_interest='op/s',
        op='reads_only',
        d=None,
        compression_strategy='LZ4Compressor',
        network_type='wired'):
    if not d:
        d = {
            'compression': compression_strategy,
            'op': op,
            'network_type': network_type
        }
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    s = df_filtered[measurement_of_interest].describe()
    s['range'] = s['max'] - s['min']
    v = pd.DataFrame(s).reset_index()
    set_display_format_for_floats(format_='{:.5g}'.format)
    return v.to_latex(index=False)
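# Example invocation (hypothetical), using the defaults declared above:
# latex_table = return_summary_statistics(op='reads_only',
#                                         compression_strategy='LZ4Compressor')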
def return_general_summary_table(
        main_csv_file='combined_results_revised.csv',
        reference_csv_file='abramova_results.csv',
        trial_list=None,
        measurement_of_interest='[OVERALL] RunTime(ms)',
        desired_index=None,
        desired_columns=None,
        desired_summary_function=np.median):
    if not trial_list:
        trial_list = range(10, 30 + 1)
    if not desired_index:
        desired_index = ['nn', 'wl', 'dbs']
    if not desired_columns:
        desired_columns = ['nt', 'nm', 'ram']
    df = pd.read_csv(main_csv_file)
    df = rfd(df=df, d={'t': trial_list})
    if reference_csv_file:
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        df = pd.concat([df, pd.read_csv(reference_csv_file)])
    table = pd.pivot_table(df,
                           values=measurement_of_interest,
                           index=desired_index,
                           columns=desired_columns,
                           aggfunc=desired_summary_function)
    return table
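# Example invocation (hypothetical): the same pivot summarized by the mean
# rather than the median.
# table = return_general_summary_table(desired_summary_function=np.mean)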
def anova_for_variation_in_ram_1(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None):
    if not d:
        d = {'nt': 'vm', 'wl': 'a', 'nn': 1}
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    df_1gb = rfd(df_filtered, d={'ram': '1GB'})[measurement_of_interest]
    df_2gb = rfd(df_filtered, d={'ram': '2GB'})[measurement_of_interest]
    df_4gb = rfd(df_filtered, d={'ram': '4GB'})[measurement_of_interest]
    results = return_anova_summary_table(df_1gb, df_2gb, df_4gb)
    return results
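# `return_anova_summary_table` is defined elsewhere. A minimal sketch of a
# one-way ANOVA across the three RAM levels, using scipy.stats.f_oneway; this
# is an assumption about the helper's behaviour, not its actual implementation.
def return_anova_summary_table_sketch(*samples):
    f_statistic, p_value = stats.f_oneway(*samples)
    return pd.DataFrame({'F': [f_statistic],
                         'p': [p_value]}).to_latex(index=False)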
def return_entire_speedup_table(workload,
                                comparison_description,
                                measurement_of_interest='[OVERALL] RunTime(ms)',
                                csv_file=None):
    df = pd.read_csv(csv_file)
    dd = []
    d = [None, None]
    df_filtered = [None, None]
    # The comparison determines which pair of (network medium, node type)
    # levels is contrasted; it does not change across cluster sizes.
    if comparison_description == 'rp_v_vm':
        nm = ['nodal', 'eth']
        nt = ['vm', 'rp']
    elif comparison_description == 'wlan_v_eth':
        nm = ['eth', 'wlan']
        nt = ['rp', 'rp']
    else:
        nm = 'error'
        nt = 'error'
    # The trailing list produces the OVERALL row across all cluster sizes.
    for cluster_size in list(range(1, 6 + 1)) + [[1, 2, 3, 4, 5, 6]]:
        for i in [0, 1]:
            d[i] = {
                'nt': nt[i],
                'wl': workload,
                'ram': '1GB',
                'nm': nm[i],
                'nn': cluster_size,
                't': range(10, 30 + 1)
            }
            df_filtered[i] = rfd(df=df, d=d[i])
        x = df_filtered[0][measurement_of_interest]
        y = df_filtered[1][measurement_of_interest]
        speedup_dictionary = return_speedup_stats(x, y)
        if cluster_size == [1, 2, 3, 4, 5, 6]:
            speedup_dictionary['cluster_size'] = 'OVERALL'
        else:
            speedup_dictionary['cluster_size'] = cluster_size
        dd.append(speedup_dictionary)
    table_in_dataframe_format = pd.DataFrame(dd)
    table_in_dataframe_format = table_in_dataframe_format.transpose()
    return table_in_dataframe_format.to_latex()
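# `return_speedup_stats` is defined elsewhere; its call site above expects a
# dict of per-comparison statistics. A minimal sketch, assuming speedup is the
# ratio of the two runtime distributions (the direction of the ratio is an
# assumption):
def return_speedup_stats_sketch(x, y):
    return {
        'mean_speedup': y.mean() / x.mean(),
        'median_speedup': y.median() / x.median(),
    }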
def generate_bound_statements_rp_wired(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1,
        nt='rp',
        nm='eth'):
    s = ''
    if not d:
        d = {'nm': nm, 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}
    # Hard-coded reference runtimes (ms), keyed by workload and cluster size.
    ref_values = {
        'a': {1: 58430, 3: 65650, 6: 87310},
        'c': {1: 88000, 3: 90210, 6: 118090},
        'e': {1: 223180, 3: 330820, 6: 404660}
    }
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    for node in [1, 3, 6]:
        df_node = rfd(df_filtered, d={'nn': node})[measurement_of_interest]
        summary = df_node.describe()
        s += generate_bound_statement(
            max_=summary['max'],
            min_=summary['min'],
            series='a {}-node network'.format(node),
            ref=ref_values[wl][node])
    return s
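# Example invocation (hypothetical): bound statements for workload C on the
# wired Raspberry Pi clusters, compared against the hard-coded reference
# values above.
# paragraph = generate_bound_statements_rp_wired(wl='c')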
def anova_for_variation_in_ram(csv_file='combined_results_revised.csv',
                               measurement_of_interest='[OVERALL] RunTime(ms)',
                               d=None,
                               wl='a',
                               nn=1):
    if not d:
        d = {'nt': 'vm', 'wl': wl, 'nn': nn, 't': range(10, 30 + 1)}
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    df_1gb = rfd(df_filtered, d={'ram': '1GB'})[measurement_of_interest]
    df_2gb = rfd(df_filtered, d={'ram': '2GB'})[measurement_of_interest]
    df_4gb = rfd(df_filtered, d={'ram': '4GB'})[measurement_of_interest]
    return return_embedded_latex_tables(
        latex_table_as_string=return_anova_summary_table(
            df_1gb, df_2gb, df_4gb),
        label='ram_variance_analysis_workload_' + wl + '_' + str(nn) + '_node',
        caption='ANOVA Summary Table for '
                'Workload {}, {} Node Cluster'.format(wl.capitalize(), nn))
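# `return_embedded_latex_tables` is defined elsewhere. Its call sites pass a
# tabular string, a caption, and a label (referenced later as
# 'table:<label>'), so it presumably wraps the tabular in a floating table
# environment. Hypothetical sketch:
def return_embedded_latex_tables_sketch(latex_table_as_string, caption, label):
    return ('\\begin{table}[ht]\n'
            '\\centering\n'
            + latex_table_as_string + '\n'
            '\\caption{' + caption + '}\n'
            '\\label{table:' + label + '}\n'
            '\\end{table}\n')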
def return_summary_statistics_tabular(
        workload='a',
        nt='vm',
        ram='1GB',
        nm='nodal',
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)'):
    ss = []
    # Read once; the loop only re-filters. The trailing list produces the
    # "Overall" column across all cluster sizes.
    df = pd.read_csv(csv_file)
    for cluster_size in [1, 2, 3, 4, 5, 6, [1, 2, 3, 4, 5, 6]]:
        d = {
            'nt': nt,
            'wl': workload,
            'ram': ram,
            'nm': nm,
            'nn': cluster_size,
            't': range(10, 30 + 1)
        }
        df_filtered = rfd(df, d=d)[measurement_of_interest]
        if not df_filtered.empty:
            s = df_filtered.describe()
            s['range'] = s['max'] - s['min']
            if cluster_size == [1, 2, 3, 4, 5, 6]:
                s['cluster_size'] = "Overall"
            else:
                s['cluster_size'] = cluster_size
            ss.append(s)
    v = pd.DataFrame(ss).reset_index()
    w = pd.pivot_table(v, columns=['cluster_size'])
    return w.to_latex(index=True)
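# Example invocation (hypothetical): per-cluster-size summary columns for the
# wired Raspberry Pi runs rather than the VM default.
# latex_tabular = return_summary_statistics_tabular(nt='rp', nm='eth')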
def return_max_min_range_for_all_levels_of_ram(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1):
    if not d:
        d = {'nt': 'vm', 'wl': wl, 'nn': nn, 't': range(10, 30 + 1)}
    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    # All RAM levels are pooled: the filter does not restrict 'ram'.
    df_all_ram = df_filtered[measurement_of_interest]
    s = df_all_ram.describe()
    s['range'] = s['max'] - s['min']
    return s['max'], s['min'], s['range']
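# Example invocation (hypothetical): unpack the three statistics for workload
# C on a 3-node cluster.
# max_, min_, range_ = return_max_min_range_for_all_levels_of_ram(wl='c', nn=3)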
def speedup_analysis_tables(csv_file,
                            comparison_description,
                            workload,
                            measurement_of_interest='[OVERALL] RunTime(ms)'):
    s = ':::something went wrong generating the speedup analysis tables:::'
    df = pd.read_csv(csv_file)
    label = '{}_{}'.format(comparison_description, workload)
    reference_statement = ''  # initialize
    if comparison_description == 'ram_v_ram':
        reference_statement = 'See Table \\ref{{{}}}.'.format(
            'table:{}'.format(label))
        dd = {key: [] for key in ['slope', 'intercept', 'r_value', 'p_value',
                                  'std_err', 'cluster_size']}
        for cluster_size in range(1, 6 + 1):
            d = {
                'nt': 'vm',
                'wl': workload,
                'nn': cluster_size,
                't': range(10, 30 + 1)
            }
            # Copy before adding a column, to avoid writing into a slice.
            df_filtered = rfd(df=df, d=d).copy()
            df_filtered['ram_in_gb'] = df_filtered['ram'].map(
                convert_ram_text_to_gb)
            if not df_filtered.empty:
                x = df_filtered['ram_in_gb']
                y = df_filtered[measurement_of_interest]
                slope, intercept, r_value, p_value, std_err = stats.linregress(
                    x, y)
                dd['slope'].append(slope)
                dd['intercept'].append(intercept)
                dd['r_value'].append(r_value)
                dd['p_value'].append(p_value)
                dd['std_err'].append(std_err)
                dd['cluster_size'].append(cluster_size)
        dd = pd.DataFrame(dd)
        s = return_embedded_latex_tables(
            latex_table_as_string=dd.to_latex(
                index=False,
                columns=['cluster_size', 'slope', 'intercept', 'r_value',
                         'p_value', 'std_err']),
            caption='Linear Regression over amount of RAM',
            label=label)
    elif comparison_description in ['rp_only', 'wlan_only']:
        reference_statement = 'See Table \\ref{{{}}}.'.format(
            'table:{}'.format(label))
        dd = {key: [] for key in ['slope', 'intercept', 'r_value', 'p_value',
                                  'std_err']}
        if comparison_description == 'rp_only':
            nm = 'eth'
        elif comparison_description == 'wlan_only':
            nm = 'wlan'
        else:
            nm = 'error'
        d = {
            'nt': 'rp',
            'wl': workload,
            'ram': '1GB',
            'nm': nm,
            't': range(10, 30 + 1)
        }
        df_filtered = rfd(df=df, d=d)
        x = df_filtered['nn']
        y = df_filtered[measurement_of_interest]
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
        dd['slope'].append(slope)
        dd['intercept'].append(intercept)
        dd['r_value'].append(r_value)
        dd['p_value'].append(p_value)
        dd['std_err'].append(std_err)
        dd = pd.DataFrame(dd)
        s = return_embedded_latex_tables(
            latex_table_as_string=dd.to_latex(
                index=False,
                columns=['slope', 'intercept', 'r_value', 'p_value',
                         'std_err']),
            caption='Linear Regression over Cluster Size, '
                    'Workload {}'.format(workload.capitalize()),
            label=label)
    elif comparison_description in ['rp_v_vm', 'wlan_v_eth']:
        dd = {key: [] for key in ['slope', 'intercept', 'r_value', 'p_value',
                                  'std_err', 'cluster_size']}
        for cluster_size in range(1, 6 + 1):
            d = {
                'nt': ['vm', 'rp'],
                'wl': workload,
                'nn': cluster_size,
                'ram': '1GB',
                't': range(10, 30 + 1)
            }
            # Tune the filter
            if comparison_description in ['rp_v_vm']:
                d['nm'] = ['eth', 'nodal']
                d['nt'] = ['rp', 'vm']
            elif comparison_description in ['wlan_v_eth']:
                d['nm'] = ['eth', 'wlan']
                d['nt'] = 'rp'
            df_filtered = rfd(df=df, d=d).copy()
            if comparison_description in ['rp_v_vm']:
                name_of_new_column = 'is_limited_hardware'
                df_filtered[name_of_new_column] = df_filtered['nt'].map(
                    assign_contrast_rp_v_vm)
            elif comparison_description in ['wlan_v_eth']:
                name_of_new_column = 'is_wireless_lan'
                df_filtered[name_of_new_column] = df_filtered['nm'].map(
                    assign_contrast_wired_v_wireless)
            else:
                name_of_new_column = 'error_occurred'
            x = df_filtered[name_of_new_column]
            y = df_filtered[measurement_of_interest]
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                x, y)
            dd['slope'].append(slope)
            dd['intercept'].append(intercept)
            dd['r_value'].append(r_value)
            dd['p_value'].append(p_value)
            dd['std_err'].append(std_err)
            dd['cluster_size'].append(cluster_size)
        # -- Now append the OVERALL row, over cluster sizes 1-6 --
        cluster_size = [1, 2, 3, 4, 5, 6]
        d['nn'] = cluster_size
        df_filtered = rfd(df=df, d=d).copy()
        if comparison_description in ['rp_v_vm']:
            name_of_new_column = 'is_limited_hardware'
            df_filtered[name_of_new_column] = df_filtered['nt'].map(
                assign_contrast_rp_v_vm)
        elif comparison_description in ['wlan_v_eth']:
            name_of_new_column = 'is_wireless_lan'
            df_filtered[name_of_new_column] = df_filtered['nm'].map(
                assign_contrast_wired_v_wireless)
        x = df_filtered[name_of_new_column]
        y = df_filtered[measurement_of_interest]
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
        dd['slope'].append(slope)
        dd['intercept'].append(intercept)
        dd['r_value'].append(r_value)
        dd['p_value'].append(p_value)
        dd['std_err'].append(std_err)
        dd['cluster_size'].append('OVERALL')
        # --- Now convert to a dataframe ---
        dd = pd.DataFrame(dd)
        if comparison_description in ['rp_v_vm']:
            caption = ('Linear Regression over the effect of limited '
                       'hardware, Workload {}. The designation (NaN) '
                       'indicates that data was not collected for this '
                       'cluster size. r-values in the high nineties indicate '
                       'that there is a pronounced effect. Lower r-values '
                       'indicate a less pronounced effect, likely attributed '
                       'to high variance.'.format(workload.capitalize()))
        elif comparison_description in ['wlan_v_eth']:
            caption = ('Linear Regression over the effect of 802.11 links, '
                       'Workload {}. The designation (NaN) indicates that '
                       'data was not collected for this cluster size. '
                       'r-values in the high nineties indicate that there is '
                       'a pronounced effect. Lower r-values indicate a less '
                       'pronounced effect, likely attributed to high '
                       'variance.'.format(workload.capitalize()))
        else:
            caption = 'Something went wrong with the caption assignment.'
        insert0 = return_embedded_latex_tables(
            latex_table_as_string=dd.to_latex(
                index=False,
                columns=['cluster_size', 'slope', 'intercept', 'r_value',
                         'p_value', 'std_err']),
            caption=caption,
            label=label)
        # -- Get the speedup statistics --
        if comparison_description in ['rp_v_vm']:
            caption_for_speedup = ('Speedup over the effect of limited '
                                   'hardware, Workload {}'.format(
                                       workload.capitalize()))
        elif comparison_description in ['wlan_v_eth']:
            caption_for_speedup = ('Speedup over the effect of 802.11 links, '
                                   'Workload {}'.format(
                                       workload.capitalize()))
        else:
            caption_for_speedup = ('Something went wrong with the caption '
                                   'assignment.')
        label_for_speedup = label + '_speedup'
        entire_speedup_table = return_entire_speedup_table(
            workload=workload,
            comparison_description=comparison_description,
            measurement_of_interest='[OVERALL] RunTime(ms)',
            csv_file=csv_file)
        reference_statement = 'See Tables \\ref{{{}}} and \\ref{{{}}}.'.format(
            'table:{}'.format(label),
            'table:{}'.format(label_for_speedup))
        insert1 = return_embedded_latex_tables(
            latex_table_as_string=entire_speedup_table,
            caption=caption_for_speedup,
            label=label_for_speedup)
        s = '\n\n' + insert0 + '\n\n' + insert1 + '\n\n'
    elif comparison_description in ['rp_v_ref', 'vm_v_ref']:
        s = ''
    s = reference_statement + '\n\n' + s
    return s
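# The three mapping helpers used in speedup_analysis_tables are defined
# elsewhere in the repository. Minimal sketches consistent with their call
# sites (hypothetical reconstructions):
def convert_ram_text_to_gb_sketch(ram_text):
    # '1GB' -> 1.0, '2GB' -> 2.0, '4GB' -> 4.0
    return float(ram_text.rstrip('GB'))


def assign_contrast_rp_v_vm_sketch(node_type):
    # Contrast coding for the regression: 1 for limited hardware ('rp'),
    # 0 for 'vm'.
    return 1 if node_type == 'rp' else 0


def assign_contrast_wired_v_wireless_sketch(network_medium):
    # Contrast coding: 1 for 802.11 ('wlan'), 0 for wired ('eth').
    return 1 if network_medium == 'wlan' else 0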