def return_summary_statistics_for_rp(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1,
        nt='rp',
        nm='eth',
        cluster_sizes_of_choice=None):
    """Return a LaTeX table of summary statistics for Raspberry Pi runs.

    Filters ``csv_file`` by ``d`` (defaults built from ``nm``/``nt``/``wl``
    plus trials 10-30), then for each cluster size in
    ``cluster_sizes_of_choice`` (column label -> node count) computes
    pandas ``describe()`` statistics of ``measurement_of_interest``, adds
    a 'range' row (max - min), and renders the result via ``to_latex``.

    ``nn`` is accepted for signature consistency with sibling functions
    but is not used here.
    """
    # Avoid a mutable default argument (shared across calls); the keys
    # become the output column labels.
    if cluster_sizes_of_choice is None:
        cluster_sizes_of_choice = {'1': 1, '3': 3, '6': 6}

    if not d:
        d = {'nm': nm, 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}

    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)

    # .items()/.values() instead of the Python-2-only .iteritems() /
    # .itervalues(), so this also runs on Python 3.  A dedicated name
    # avoids rebinding `df` from a DataFrame to a dict.
    summaries = {}
    for label, node_count in cluster_sizes_of_choice.items():
        series = rfd(df_filtered,
                     d={'nn': node_count})[measurement_of_interest]
        summaries[label] = series.describe()

    for summary in summaries.values():
        summary['range'] = summary['max'] - summary['min']

    v = pd.DataFrame(summaries).reset_index()
    set_display_format_for_floats(format_='{:.2g}'.format)
    return v.to_latex(index=False)
def return_summary_statistics_for_vms(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1):
    """Return a LaTeX table of summary statistics for VM runs, one
    column per RAM level (1GB / 2GB / 4GB).

    Filters ``csv_file`` by ``d`` (defaults built from ``wl``/``nn`` plus
    trials 10-30), computes ``describe()`` statistics per RAM level, and
    appends a 'range' row (max - min) to each column.
    """
    if not d:
        d = {'nt': 'vm', 'wl': wl, 'nn': nn, 't': range(10, 30 + 1)}

    df_filtered = rfd(pd.read_csv(csv_file), d=d)

    # One describe() column per RAM level, keyed by the output column name.
    summaries = {}
    for column_name, ram_level in [('ram1GB', '1GB'), ('ram2GB', '2GB'),
                                   ('ram4GB', '4GB')]:
        stats_for_level = rfd(df_filtered, d={'ram': ram_level})[
            measurement_of_interest].describe()
        stats_for_level['range'] = (stats_for_level['max'] -
                                    stats_for_level['min'])
        summaries[column_name] = stats_for_level

    v = pd.DataFrame(summaries).reset_index()

    return v.to_latex(index=False)
# Example #3 (original marker "示例#3 / 0" — a non-Python scrape artifact,
# commented out so the module parses)
def get_max_absolute_deviation_from_ref(
        csv_file='combined_results_revised.csv',
        wl='a',
        list_of_number_of_nodes=None,
        csv_reference_data_file='abramova_results.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        d_ref=None,
        t_range=None,
        node_type='rp',
        nm='eth'):
    """Return the largest absolute spread between this study's data and
    the reference data, over all cluster sizes of interest.

    For each node count the deviation is the larger of |max - min_ref|
    and |min - max_ref| of ``measurement_of_interest``; the maximum of
    those per-node deviations is returned.

    ``d``/``d_ref`` are ``rfd`` filter dicts; when omitted they are built
    from ``node_type``/``wl``/``nm``/``t_range`` (and ``d_ref`` falls
    back to ``d``).
    """
    # Initialize Variables----------------------------
    if not list_of_number_of_nodes:
        list_of_number_of_nodes = [1, 3, 6]

    if not t_range:
        t_range = range(10, 30 + 1)

    if not d:
        d = {'nt': node_type, 'wl': wl, 'nn': 1, 't': t_range, 'nm': nm}

    if not d_ref:
        d_ref = d

    # Work on shallow copies: 'nn' is overwritten in the loop below, and
    # mutating a caller-supplied filter dict would be a surprising side
    # effect (the original wrote through to the caller's dicts).
    d = dict(d)
    d_ref = dict(d_ref)
    # ------------------------------------------------

    # Read from CSV File -----------------------------

    df = pd.read_csv(csv_file)
    df_ref = pd.read_csv(csv_reference_data_file)

    # ------------------------------------------------

    #  For each node, get the max's and min's
    bound_dict = {}
    for node_count in list_of_number_of_nodes:
        d['nn'] = node_count  # adjust the filter(s)
        d_ref['nn'] = node_count

        df_flt = rfd(df, d)
        df_ref_flt = rfd(df_ref, d_ref)

        df_summary = df_flt.describe()
        df_ref_summary = df_ref_flt.describe()

        max_ = df_summary[measurement_of_interest]['max']
        min_ = df_summary[measurement_of_interest]['min']

        max_ref = df_ref_summary[measurement_of_interest]['max']
        min_ref = df_ref_summary[measurement_of_interest]['min']

        bound_dict[node_count] = max(
            abs(max_ - min_ref),
            abs(min_ - max_ref))  # assumes one mean is always greater

    bound = max(bound_dict.values())

    return bound
def generate_bound_statements_rp_wired_and_wireless(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1,
        nt='rp',
        nm='eth'):
    """Generate prose bound statements comparing wireless (wlan) runs to
    the median of the matching wired (eth) runs, for 1-, 3- and 6-node
    clusters.

    ``d``, ``nn`` and ``nm`` are accepted for signature consistency with
    the sibling generators but are not used; the wired and wireless
    filters are built internally from ``nt``/``wl``.
    """
    s = 'The median value of the corresponding wired experiment will serve as the reference in this paragraph. '

    d_wlan = {'nm': 'wlan', 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}

    d_eth = {'nm': 'eth', 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}

    # NOTE: the hard-coded reference table that the wired-only variant
    # carries is not needed here (it was dead code in the original):
    # the reference is the wired median computed below.

    df = pd.read_csv(csv_file)
    df_filtered_eth = rfd(df, d=d_eth)
    df_filtered_wlan = rfd(df, d=d_wlan)
    for node in [1, 3, 6]:
        df_eth = rfd(df_filtered_eth, d={'nn': node})[measurement_of_interest]
        df_wlan = rfd(df_filtered_wlan, d={'nn':
                                           node})[measurement_of_interest]
        summary_eth = df_eth.describe()
        summary_wlan = df_wlan.describe()
        # Bound the wireless spread against the wired median ('50%').
        s += generate_bound_statement(
            max_=summary_wlan['max'],
            min_=summary_wlan['min'],
            series='a node network of {}'.format(node),
            ref=summary_eth['50%'])

    return s
def return_summary_statistics(
        csv_file='results/exp0_cstress/cassandra_stress_results_compression_strategy.csv',
        measurement_of_interest='op/s',
        op='reads_only',
        d=None,
        compression_strategy='LZ4Compressor',
        network_type='wired'):
    """Return a LaTeX table of descriptive statistics for a single
    cassandra-stress configuration.

    Filters ``csv_file`` by ``d`` (defaults built from
    ``compression_strategy``/``op``/``network_type``), runs ``describe()``
    on ``measurement_of_interest``, adds a 'range' row, and renders the
    result via ``to_latex`` with 5-significant-digit float formatting.
    """
    if not d:
        d = {
            'compression': compression_strategy,
            'op': op,
            'network_type': network_type
        }

    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)

    s = df_filtered[measurement_of_interest].describe()
    # Augment pandas' describe() output with the sample range.
    # (Commented-out scaffolding and the one-element loop removed.)
    s['range'] = s['max'] - s['min']

    v = pd.DataFrame(s).reset_index()
    set_display_format_for_floats(format_='{:.5g}'.format)

    return v.to_latex(index=False)
def return_general_summary_table(
        main_csv_file='combined_results_revised.csv',
        reference_csv_file='abramova_results.csv',
        trial_list=None,
        measurement_of_interest='[OVERALL] RunTime(ms)',
        desired_index=None,
        desired_columns=None,
        desired_summary_function=np.median):
    """Return a pivot table summarising ``measurement_of_interest``
    across the whole experiment.

    Reads ``main_csv_file``, keeps only trials in ``trial_list``,
    optionally appends the reference data set, and pivots with
    ``desired_index`` rows, ``desired_columns`` columns, and
    ``desired_summary_function`` as the aggregate.
    """
    if not trial_list:
        trial_list = range(10, 30 + 1)

    if not desired_index:
        desired_index = ['nn', 'wl', 'dbs']

    if not desired_columns:
        desired_columns = ['nt', 'nm', 'ram']

    df = pd.read_csv(main_csv_file)

    df = rfd(df=df, d={'t': trial_list})

    if reference_csv_file:
        # pd.concat instead of DataFrame.append, which was deprecated in
        # pandas 1.4 and removed in pandas 2.0; behaviour is identical
        # (rows appended, index preserved).
        df = pd.concat([df, pd.read_csv(reference_csv_file)])

    table = pd.pivot_table(df,
                           values=measurement_of_interest,
                           index=desired_index,
                           columns=desired_columns,
                           aggfunc=desired_summary_function)

    return table
def anova_for_variation_in_ram_1(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None):
    """Run the ANOVA across RAM levels (1GB/2GB/4GB) for single-node VM
    runs of workload 'a' and return the summary table.

    ``d`` overrides the default filter ``{'nt': 'vm', 'wl': 'a', 'nn': 1}``.
    """
    if not d:
        d = {'nt': 'vm', 'wl': 'a', 'nn': 1}

    filtered = rfd(pd.read_csv(csv_file), d=d)

    # One sample of the measurement per RAM level, in fixed order.
    samples = [
        rfd(filtered, d={'ram': level})[measurement_of_interest]
        for level in ('1GB', '2GB', '4GB')
    ]

    return return_anova_summary_table(*samples)
def return_entire_speedup_table(
        workload,
        comparison_description,
        measurement_of_interest='[OVERALL] RunTime(ms)',
        csv_file=None):
    """Return a LaTeX table of speedup statistics for one comparison.

    For each cluster size 1-6 (and once for all sizes combined, labelled
    'OVERALL') this filters ``csv_file`` into a baseline series (index 0)
    and a comparison series (index 1) according to
    ``comparison_description`` ('rp_v_vm' or 'wlan_v_eth'), computes
    ``return_speedup_stats`` over ``measurement_of_interest``, and
    renders the transposed table via ``to_latex``.
    """
    df = pd.read_csv(csv_file)

    # The series pair depends only on the comparison, not on the cluster
    # size, so select it once (hoisted out of the loop).
    if comparison_description == 'rp_v_vm':
        nm = ['nodal', 'eth']
        nt = ['vm', 'rp']
    elif comparison_description == 'wlan_v_eth':
        nm = ['eth', 'wlan']
        nt = ['rp', 'rp']
    else:
        nm = 'error'
        nt = 'error'

    dd = []

    d = [None, None]
    df_filtered = [None, None]

    # list(range(...)) so the list concatenation also works on Python 3,
    # where range() returns a lazy object.  The final entry filters over
    # every cluster size at once.
    overall = [1, 2, 3, 4, 5, 6]
    for cluster_size in list(range(1, 6 + 1)) + [overall]:

        for i in [0, 1]:
            d[i] = {
                'nt': nt[i],
                'wl': workload,
                'ram': '1GB',
                'nm': nm[i],
                'nn': cluster_size,
                't': range(10, 30 + 1)
            }

            df_filtered[i] = rfd(df=df, d=d[i])

        x = df_filtered[0][measurement_of_interest]
        y = df_filtered[1][measurement_of_interest]

        speedup_dictionary = return_speedup_stats(x, y)

        if cluster_size == overall:
            speedup_dictionary['cluster_size'] = 'OVERALL'
        else:
            speedup_dictionary['cluster_size'] = cluster_size

        dd.append(speedup_dictionary)

    table_in_dataframe_format = pd.DataFrame(dd)

    table_in_dataframe_format = table_in_dataframe_format.transpose()

    return table_in_dataframe_format.to_latex()
def generate_bound_statements_rp_wired(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1,
        nt='rp',
        nm='eth'):
    """Generate prose bound statements for wired Raspberry Pi runs,
    comparing each cluster size (1, 3, 6) against hard-coded reference
    medians from the literature (``ref_values[workload][nodes]``).

    Bug fix: the original unconditionally reset ``d = None`` right after
    entry, which silently discarded any caller-supplied filter; ``d`` is
    now honoured.  ``nn`` is accepted for signature consistency but not
    used.
    """
    s = ''
    if not d:
        d = {'nm': nm, 'nt': nt, 'wl': wl, 't': range(10, 30 + 1)}
    # Reference runtimes (ms) per workload and cluster size — presumably
    # from the Abramova reference study; TODO confirm against its data.
    ref_values = {
        'a': {
            1: 58430,
            3: 65650,
            6: 87310
        },
        'c': {
            1: 88000,
            3: 90210,
            6: 118090
        },
        'e': {
            1: 223180,
            3: 330820,
            6: 404660
        }
    }

    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)
    for node in [1, 3, 6]:
        # A distinct name avoids shadowing the full DataFrame `df`.
        node_series = rfd(df_filtered,
                          d={'nn': node})[measurement_of_interest]
        summary = node_series.describe()
        s += generate_bound_statement(
            max_=summary['max'],
            min_=summary['min'],
            series='a node network of {}'.format(node),
            ref=ref_values[wl][node])

    return s
def anova_for_variation_in_ram(csv_file='combined_results_revised.csv',
                               measurement_of_interest='[OVERALL] RunTime(ms)',
                               d=None,
                               wl='a',
                               nn=1):
    """Run the ANOVA across RAM levels (1GB/2GB/4GB) for VM runs of the
    given workload and cluster size, and return the summary table
    embedded in a labelled, captioned LaTeX environment.
    """
    if not d:
        d = {'nt': 'vm', 'wl': wl, 'nn': nn, 't': range(10, 30 + 1)}

    df_filtered = rfd(pd.read_csv(csv_file), d=d)

    # One sample of the measurement per RAM level, in fixed order.
    samples = [
        rfd(df_filtered, d={'ram': level})[measurement_of_interest]
        for level in ('1GB', '2GB', '4GB')
    ]

    anova_table = return_anova_summary_table(*samples)
    table_label = ('ram_variance_analysis_workload_' + wl + '_' + str(nn) +
                   '_node')
    table_caption = ('ANOVA Summary Table for '
                     'Workload {}, {} Node Cluster'.format(
                         wl.capitalize(), nn))

    return return_embedded_latex_tables(latex_table_as_string=anova_table,
                                        label=table_label,
                                        caption=table_caption)
def return_summary_statistics_tabular(
        workload='a',
        nt='vm',
        ram='1GB',
        nm='nodal',
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)'):
    """Return a LaTeX pivot table of descriptive statistics, one column
    per cluster size (1-6 plus an 'Overall' column over all sizes).

    Cluster sizes whose filtered data is empty are skipped.
    """
    # The CSV does not change between iterations: read it once instead
    # of once per cluster size (hoisted out of the loop).
    df = pd.read_csv(csv_file)

    overall = [1, 2, 3, 4, 5, 6]

    ss = []

    for cluster_size in [1, 2, 3, 4, 5, 6, overall]:

        d = {
            'nt': nt,
            'wl': workload,
            'ram': ram,
            'nm': nm,
            'nn': cluster_size,
            't': range(10, 30 + 1)
        }

        df_filtered = rfd(df, d=d)[measurement_of_interest]

        if not df_filtered.empty:

            s = df_filtered.describe()

            s['range'] = s['max'] - s['min']

            # The list entry aggregates every cluster size at once.
            if cluster_size == overall:
                s['cluster_size'] = "Overall"
            else:
                s['cluster_size'] = cluster_size

            ss.append(s)

    v = pd.DataFrame(ss).reset_index()

    w = pd.pivot_table(v, columns=['cluster_size'])

    return w.to_latex(index=True)
def return_max_min_range_for_all_levels_of_ram(
        csv_file='combined_results_revised.csv',
        measurement_of_interest='[OVERALL] RunTime(ms)',
        d=None,
        wl='a',
        nn=1):
    """Return (max, min, range) of ``measurement_of_interest`` over the
    filtered VM experiments, pooled across all RAM levels.

    ``d`` overrides the default filter built from ``wl``/``nn`` plus
    trials 10-30.
    """
    if not d:
        d = {'nt': 'vm', 'wl': wl, 'nn': nn, 't': range(10, 30 + 1)}

    df = pd.read_csv(csv_file)
    df_filtered = rfd(df, d=d)

    s = df_filtered[measurement_of_interest].describe()
    # Direct assignment; the original looped over a one-element list.
    s['range'] = s['max'] - s['min']

    return s['max'], s['min'], s['range']
def speedup_analysis_tables(csv_file,
                            comparison_description,
                            workload,
                            measurement_of_interest='[OVERALL] RunTime(ms)'):
    """Return LaTeX material (a reference sentence plus embedded tables)
    for one comparison scenario.

    ``comparison_description`` selects the analysis performed:

    * ``'ram_v_ram'`` - per-cluster-size linear regression of the
      measurement over the amount of RAM (VM runs).
    * ``'rp_only'`` / ``'wlan_only'`` - a single linear regression over
      cluster size for wired / wireless Raspberry Pi runs.
    * ``'rp_v_vm'`` / ``'wlan_v_eth'`` - per-cluster-size regression over
      a binary contrast column, plus an 'OVERALL' row and a companion
      speedup table from ``return_entire_speedup_table``.
    * ``'rp_v_ref'`` / ``'vm_v_ref'`` - no tables; only the (empty)
      reference statement is returned.

    Returns the reference sentence followed by any generated tables as
    one string.
    """

    # Fallback text; only replaced when a branch below builds a table.
    s = ':::something went wrong generating the speedup analysis tables:::'

    df = pd.read_csv(csv_file)

    # LaTeX label shared by this scenario's table(s).
    label = '{}_{}'.format(comparison_description, workload)

    reference_statement = ''  # initialize

    if comparison_description == 'ram_v_ram':

        reference_statement = 'See Table \\ref{{{}}}.'.format(
            'table:{}'.format(label))

        # Column-oriented accumulator: one regression row per cluster size.
        dd = {}

        dd['slope'] = []
        dd['intercept'] = []
        dd['r_value'] = []
        dd['p_value'] = []
        dd['std_err'] = []
        dd['cluster_size'] = []

        for cluster_size in range(1, 6 + 1):

            d = {
                'nt': 'vm',
                'wl': workload,
                'nn': cluster_size,
                't': range(10, 30 + 1)
            }

            df_filtered = rfd(df=df, d=d)

            # Regressor: RAM converted from its text form (e.g. '1GB') to GB.
            df_filtered['ram_in_gb'] = df_filtered['ram'].map(
                convert_ram_text_to_gb)

            if not df_filtered.empty:

                x = df_filtered['ram_in_gb']
                y = df_filtered[measurement_of_interest]
                slope, intercept, r_value, p_value, std_err = stats.linregress(
                    x, y)

                dd['slope'].append(slope)
                dd['intercept'].append(intercept)
                dd['r_value'].append(r_value)
                dd['p_value'].append(p_value)
                dd['std_err'].append(std_err)
                dd['cluster_size'].append(cluster_size)

        dd = pd.DataFrame(dd)

        s = return_embedded_latex_tables(
            latex_table_as_string=dd.to_latex(index=False,
                                              columns=[
                                                  'cluster_size', 'slope',
                                                  'intercept', 'r_value',
                                                  'p_value', 'std_err'
                                              ]),
            caption='Linear Regression over amount of RAM',
            label=label)

    elif comparison_description in ['rp_only', 'wlan_only']:

        reference_statement = 'See Table \\ref{{{}}}.'.format(
            'table:{}'.format(label))

        # Single regression (no per-cluster-size breakdown here).
        dd = {}

        dd['slope'] = []
        dd['intercept'] = []
        dd['r_value'] = []
        dd['p_value'] = []
        dd['std_err'] = []

        if comparison_description == 'rp_only':
            nm = 'eth'
        elif comparison_description == 'wlan_only':
            nm = 'wlan'
        else:
            nm = 'error'

        d = {
            'nt': 'rp',
            'wl': workload,
            'ram': '1GB',
            'nm': nm,
            't': range(10, 30 + 1)
        }

        df_filtered = rfd(df=df, d=d)

        # Regress the measurement over cluster size ('nn').
        x = df_filtered['nn']
        y = df_filtered[measurement_of_interest]
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

        dd['slope'].append(slope)
        dd['intercept'].append(intercept)
        dd['r_value'].append(r_value)
        dd['p_value'].append(p_value)
        dd['std_err'].append(std_err)

        dd = pd.DataFrame(dd)

        s = return_embedded_latex_tables(
            latex_table_as_string=dd.to_latex(index=False,
                                              columns=[
                                                  'slope', 'intercept',
                                                  'r_value', 'p_value',
                                                  'std_err'
                                              ]),
            caption='Linear Regression over Cluster Size, Workload {}'.format(
                workload.capitalize()),
            label=label)

    elif comparison_description in ['rp_v_vm', 'wlan_v_eth']:

        dd = {}

        dd['slope'] = []
        dd['intercept'] = []
        dd['r_value'] = []
        dd['p_value'] = []
        dd['std_err'] = []
        dd['cluster_size'] = []

        for cluster_size in range(1, 6 + 1):

            d = {
                'nt': ['vm', 'rp'],
                'wl': workload,
                'nn': cluster_size,
                'ram': '1GB',
                't': range(10, 30 + 1)
            }

            # Tune the filter
            if comparison_description in ['rp_v_vm']:
                d['nm'] = ['eth', 'nodal']
                d['nt'] = ['rp', 'vm']

            elif comparison_description in ['wlan_v_eth']:
                d['nm'] = ['eth', 'wlan']
                d['nt'] = 'rp'

            df_filtered = rfd(df=df, d=d)

            # Binary contrast column used as the regressor.
            if comparison_description in ['rp_v_vm']:
                name_of_new_column = 'is_limited_hardware'
                df_filtered[name_of_new_column] = df_filtered['nt'].map(
                    assign_contrast_rp_v_vm)
            elif comparison_description in ['wlan_v_eth']:
                name_of_new_column = 'is_wireless_lan'
                df_filtered[name_of_new_column] = df_filtered['nm'].map(
                    assign_contrast_wired_v_wireless)
            else:
                name_of_new_column = 'error_occurred'

            x = df_filtered[name_of_new_column]
            y = df_filtered[measurement_of_interest]
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                x, y)

            dd['slope'].append(slope)
            dd['intercept'].append(intercept)
            dd['r_value'].append(r_value)
            dd['p_value'].append(p_value)
            dd['std_err'].append(std_err)
            dd['cluster_size'].append(cluster_size)

        cluster_size = [1, 2, 3, 4, 5, 6]

        # -- Now append the Overall, over 1,2,3,4,5,6 -- #
        # NOTE: this deliberately reuses the `d` left over from the last
        # loop iteration, changing only 'nn' to cover all cluster sizes.
        d['nn'] = cluster_size

        df_filtered = rfd(df=df, d=d)

        if comparison_description in ['rp_v_vm']:
            name_of_new_column = 'is_limited_hardware'
            df_filtered[name_of_new_column] = df_filtered['nt'].map(
                assign_contrast_rp_v_vm)
        elif comparison_description in ['wlan_v_eth']:
            name_of_new_column = 'is_wireless_lan'
            df_filtered[name_of_new_column] = df_filtered['nm'].map(
                assign_contrast_wired_v_wireless)

        x = df_filtered[name_of_new_column]
        y = df_filtered[measurement_of_interest]
        slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

        dd['slope'].append(slope)
        dd['intercept'].append(intercept)
        dd['r_value'].append(r_value)
        dd['p_value'].append(p_value)
        dd['std_err'].append(std_err)
        dd['cluster_size'].append('OVERALL')

        # --- Now convert to dataframe ---

        dd = pd.DataFrame(dd)

        if comparison_description in ['rp_v_vm']:
            caption = 'Linear Regression over the effect of limited hardware, Workload {}.  The designation (NaN) indicates that data was not collected for this cluster size.  r- values in the high nineties indicate that there is a pronounced effect.  Lower r-values indicate a less pronounced effect, likely attributed to high variance.'.format(
                workload.capitalize())
        elif comparison_description in ['wlan_v_eth']:
            caption = 'Linear Regression over the effect of 802.11 links, Workload {}.  The designation (NaN) indicates that data was not collected for this cluster size.   r-values in the high nineties indicate that there is a pronounced effect.  Lower r-values indicate a less pronounced effect, likely attributed to high variance.'.format(
                workload.capitalize())
        else:
            caption = 'Something went wrong with the caption assignment.'

        insert0 = return_embedded_latex_tables(
            latex_table_as_string=dd.to_latex(index=False,
                                              columns=[
                                                  'cluster_size', 'slope',
                                                  'intercept', 'r_value',
                                                  'p_value', 'std_err'
                                              ]),
            caption=caption,
            label=label)

        # -- Get the speedup statistics --

        if comparison_description in ['rp_v_vm']:
            caption_for_speedup = 'Speedup over the effect of limited hardware, Workload {}'.format(
                workload.capitalize())
        elif comparison_description in ['wlan_v_eth']:
            caption_for_speedup = 'Speedup over the effect of 802.11 links, Workload {}'.format(
                workload.capitalize())
        else:
            caption_for_speedup = 'Something went wrong with the caption assignment.'

        label_for_speedup = label + '_speedup'

        entire_speedup_table = return_entire_speedup_table(
            workload=workload,
            comparison_description=comparison_description,
            measurement_of_interest='[OVERALL] RunTime(ms)',
            csv_file=csv_file)

        reference_statement = 'See Tables \\ref{{{}}} and \\ref{{{}}}.'.format(
            'table:{}'.format(label), 'table:{}'.format(label_for_speedup))

        insert1 = return_embedded_latex_tables(
            latex_table_as_string=entire_speedup_table,
            caption=caption_for_speedup,
            label=label_for_speedup)

        s = '\n\n' + insert0 + '\n\n' + insert1 + '\n\n'

    elif comparison_description in ['rp_v_ref', 'vm_v_ref']:
        s = ''

    s = reference_statement + '\n\n' + s

    return s