示例#1
0
    diff_data.shape
    plt_dat = np.average(diff_data[:, :, :, :, tidx], 4)
    plt_dat = np.average(plt_dat[:, :, :, fidx], 3)
    plt_dat = plt_dat[:, :, cidx]

    plt_dat = pd.DataFrame(data=np.swapaxes(plt_dat, 1, 0), columns=conditions)
    plt_dat = pd.melt(plt_dat, var_name='Condition', value_name='Power')

    pt.half_violinplot(x='Condition',
                       y="Power",
                       data=plt_dat,
                       inner=None,
                       jitter=True,
                       color=".7",
                       lwidth=0,
                       width=0.6,
                       offset=0.17,
                       cut=1,
                       ax=axes[-1],
                       linewidth=1,
                       alpha=0.6,
                       palette=[param['palette'][0], param['palette'][3]],
                       zorder=19)
    sns.stripplot(x='Condition',
                  y="Power",
                  data=plt_dat,
                  jitter=0.08,
                  ax=axes[-1],
                  linewidth=1,
                  alpha=0.6,
                  palette=[param['palette'][0], param['palette'][3]],
示例#2
0
def boxplot_freqs(foi, chan, time, gavg, data_all, ax, pal):
    """Draw a raincloud-style plot of band power per condition on ``ax``.

    Power is read from ``data_all`` at channel ``chan``, restricted to the
    frequency band ``foi`` and the time window ``time``, then averaged over
    time and over frequency. The remaining 2-D table is plotted as a half
    violin, a jittered strip plot and a narrow box plot per condition.

    :param foi: pair ``(low, high)`` of frequencies. Both values must match
        an entry of ``gavg['CS-1'].freqs`` exactly; the bin at ``high`` is
        excluded from the band.
    :param chan: channel name, looked up in ``gavg['CS-1'].ch_names``.
    :param time: pair ``(start, stop)``. Each value is snapped to the nearest
        entry of ``gavg['CS-1'].times``; the sample at ``stop`` is excluded.
    :param gavg: mapping with a ``'CS-1'`` entry exposing ``freqs``,
        ``times`` and ``ch_names``.
    :param data_all: 5-D array indexed as
        ``[axis0, axis1, channel, frequency, time]``. Axis 0 must have
        length 4 because it is labelled 'CS-1', 'CS-2', 'CS-E', 'CS+'
        (presumably axis 1 holds participants -- TODO confirm).
    :param ax: matplotlib axes that receives all three layers.
    :param pal: colour palette forwarded to every plotting call.
    :return: ``ax``.
    :raises ValueError: if either bound of ``foi`` is absent from
        ``gavg['CS-1'].freqs``.
    """
    # Condition whose metadata (freqs, times, ch_names) is used for indexing
    c = 'CS-1'

    # np.where returns index *arrays*; pull out plain ints so np.arange is
    # not handed one-element arrays (deprecated scalar conversion).
    freqs = gavg[c].freqs
    band = []
    for bound in (foi[0], foi[1]):
        hits = np.where(freqs == bound)[0]
        if hits.size == 0:
            raise ValueError(
                'frequency {} not found in gavg[{!r}].freqs'.format(bound, c))
        band.append(int(hits[0]))
    fidx = np.arange(band[0], band[1])

    times = gavg[c].times
    tidx = np.arange(np.argmin(np.abs(times - time[0])),
                     np.argmin(np.abs(times - time[1])))
    cidx = gavg[c].ch_names.index(chan)

    # Select channel, then frequency band, then time window; selecting the
    # channel drops that axis, so frequency becomes axis 2 and time axis 3.
    plt_dat = data_all[:, :, cidx, :, :]
    plt_dat = plt_dat[:, :, fidx, :]
    plt_dat = plt_dat[:, :, :, tidx]
    plt_dat = np.average(plt_dat, 3)  # collapse time
    plt_dat = np.average(plt_dat, 2)  # collapse frequency

    # One column per condition, then long format for seaborn
    plt_dat = pd.DataFrame(data=np.swapaxes(plt_dat, 1, 0),
                           columns=['CS-1', 'CS-2', 'CS-E', 'CS+'])
    plt_dat = pd.melt(plt_dat, var_name='Condition', value_name='Power')

    pt.half_violinplot(x='Condition',
                       y="Power",
                       data=plt_dat,
                       inner=None,
                       jitter=True,
                       color=".7",
                       lwidth=0,
                       width=0.6,
                       offset=0.17,
                       cut=1,
                       ax=ax,
                       linewidth=1,
                       alpha=0.6,
                       palette=pal,
                       zorder=19)
    sns.stripplot(x='Condition',
                  y="Power",
                  data=plt_dat,
                  jitter=0.08,
                  ax=ax,
                  linewidth=1,
                  alpha=0.6,
                  palette=pal,
                  zorder=1)
    sns.boxplot(x='Condition',
                y="Power",
                data=plt_dat,
                palette=pal,
                whis=np.inf,
                linewidth=1,
                ax=ax,
                width=0.1,
                boxprops={
                    "zorder": 10,
                    'alpha': 0.5
                },
                whiskerprops={
                    'zorder': 10,
                    'alpha': 1
                },
                medianprops={
                    'zorder': 11,
                    'alpha': 0.5
                })

    return ax
示例#3
0
def genera_raincloud():
    """Build the raincloud figures step by step and save each one as a PNG.

    Reads the ``df`` DataFrame from the enclosing scope (it is not a
    parameter) and uses its "variety" and "sepal.length" columns. Five
    figures are written under ``img/``: clouds only, clouds + rain,
    clouds + jittered rain, clouds + jittered rain + box plot, and a final
    one produced by ``pt.RainCloud`` with the rain shifted. The ``img/``
    directory is not created here. Every figure is closed after saving.
    """
    dy = "variety"
    dx = "sepal.length"
    ort = "h"
    pal = sns.color_palette(n_colors=1)

    def draw_cloud():
        # Half violin: the "cloud" layer shared by the first four figures
        return pt.half_violinplot(x=dx,
                                  y=dy,
                                  data=df,
                                  palette=pal,
                                  bw=.2,
                                  cut=0.,
                                  scale="area",
                                  width=.6,
                                  inner=None,
                                  orient=ort)

    def draw_rain(jitter):
        # Strip plot: the "rain" layer, drawn underneath the cloud (zorder 0)
        return sns.stripplot(x=dx,
                             y=dy,
                             data=df,
                             palette=pal,
                             edgecolor="white",
                             size=3,
                             jitter=jitter,
                             zorder=0,
                             orient=ort)

    def finish(title, path):
        # Title, save and close the current figure so figures do not pile up
        plt.title(title)
        plt.savefig(path)
        plt.close()

    # Plotting the clouds
    plt.subplots(figsize=(7, 5))
    draw_cloud()
    finish("Raincloud with Clouds", "img/Raincloud_Clouds.png")

    # Adding the rain
    plt.subplots(figsize=(7, 5))
    draw_cloud()
    draw_rain(0)
    finish("Raincloud with Clouds and Rain", "img/Raincloud_Clouds_Rain.png")

    # Adding jitter to the rain
    plt.subplots(figsize=(7, 5))
    draw_cloud()
    draw_rain(1)
    finish("Raincloud with Clouds and Jitter rain",
           "img/Raincloud_Clouds_Rain_Jitter.png")

    # Adding the boxplot with quartiles (drawn once, on top of cloud and rain)
    plt.subplots(figsize=(7, 5))
    draw_cloud()
    draw_rain(1)
    sns.boxplot(x=dx,
                y=dy,
                data=df,
                color="black",
                width=.15,
                zorder=10,
                showcaps=True,
                boxprops={'facecolor': 'none', "zorder": 10},
                showfliers=True,
                whiskerprops={'linewidth': 2, "zorder": 10},
                saturation=1,
                orient=ort)
    finish("Raincloud with Boxplot", "img/Raincloud_Boxplot.png")

    # Same layers through the pt.RainCloud wrapper; note x and y are given
    # the other way round here (category as x, measurement as y).
    dx = "variety"
    dy = "sepal.length"
    ort = "h"
    pal = "Set2"
    sigma = .2
    f, ax = plt.subplots(figsize=(7, 5))
    ax = pt.RainCloud(x=dx,
                      y=dy,
                      data=df,
                      palette=pal,
                      bw=sigma,
                      width_viol=.6,
                      ax=ax,
                      orient=ort,
                      move=.2)
    plt.title("Raincloud with Boxplot and Shifted Rain")
    plt.savefig("img/Raincloud_Boxplot_Shifted_Rain.png")
    plt.close()
示例#4
0
    def plot_violins(df):
        """Plot error distributions per ``type`` as raincloud-style violins.

        Rows with ``error`` at or above ``log10(3)`` are dropped first. The
        ``error`` column is treated as log10-scaled: y ticks are placed at
        the log10 of fixed multipliers, and the medians are printed as
        ``10 ** median``. The figure is saved to ``figures/figure_4.pdf``.

        :param df: DataFrame with a categorical ``type`` column (the values
            'baseline' and 'timestamp' are used for the printed medians) and
            a numeric ``error`` column.
        """
        plt.figure(figsize=(1.65 * 2, 1.4))

        dx = 'type'
        dy = 'error'
        pal = "tab10"
        ort = 'v'

        df = df[df.error < np.log10(3)]

        ax = pt.half_violinplot(x=dx,
                                y=dy,
                                data=df,
                                palette=pal,
                                bw=.1,
                                cut=0.,
                                scale="width",
                                width=1.,
                                inner=None,
                                orient=ort,
                                linewidth=0.8,
                                offset=0.2)

        # Draw at most 2000 points; clamp so that a frame with fewer rows
        # after filtering does not make DataFrame.sample raise. No
        # random_state is passed, so the drawn subset differs between runs.
        n_rain = min(2000, len(df))
        ax = sns.stripplot(x=dx,
                           y=dy,
                           data=df.sample(n_rain),
                           palette=pal,
                           edgecolor="white",
                           size=1,
                           jitter=1,
                           zorder=1,
                           orient=ort,
                           alpha=0.5)

        # Fliers are enabled but given size 0, so no outlier markers show
        ax = sns.boxplot(x=dx,
                         y=dy,
                         data=df,
                         color="black",
                         width=.2,
                         zorder=10,
                         showcaps=True,
                         boxprops={
                             'facecolor': 'none',
                             "zorder": 10
                         },
                         showfliers=True,
                         whiskerprops={
                             'linewidth': 1,
                             "zorder": 10
                         },
                         saturation=1,
                         orient=ort,
                         fliersize=0,
                         linewidth=1)

        # Tick positions are log10 of the multipliers shown in the labels
        yticks = [1 / 3, 1 / 2, 1 / 1.5, 1 / 1.2, 1, 1.2, 1.5, 2, 3]
        ax.set_yticks(np.log10(yticks))
        ax.set_yticklabels([r"{:.2f} $\times$".format(y) for y in yticks])
        plt.ylim(-0.03, np.log10(3))
        plt.ylabel("Absolute Error")

        plt.xlim(-0.8, 3.3)
        # NOTE(review): four tick positions but only two labels are given;
        # recent Matplotlib releases appear to reject mismatched counts --
        # confirm the intended categories and labels.
        plt.xticks([0, 1, 2, 3], [
            "POSIX Baseline\nMedian = $1.173\\times$",
            "POSIX + start time\nMedian = $1.117\\times$"
        ],
                   rotation=30)
        plt.xlabel("")

        print("Baseline median absolute error: {}".format(
            10**df[df.type == 'baseline'].error.median()))
        print("Timestamps median absolute error: {}".format(
            10**df[df.type == 'timestamp'].error.median()))

        plt.savefig("figures/figure_4.pdf", dpi=600, bbox_inches='tight')
def raincloud_w_fail(sel_dict_path, lesion_dict_path, plot_type='classes', coi_measure='c_informed', top_layers='all',
                     selected_units=False,
                     plots_dir='simple_rain_plots',
                     plot_fails=False,
                     plot_class_change=False,
                     normed_acts=False,
                     layer_act_dist=False,
                     verbose=False, test_run=False,
                     ):
    """
    Visualise units with raincloud plots.  Each plot has distributions (cloud), individual activations (raindrops)
     and a boxplot to give median and interquartile range.  Also has a plot of zero activations, scaled by class size.
     Items that are affected by lesioning are shown in different colours.

     I only have lesion data for [conv2d, dense] layers
     I have GHA and sel data from  [conv2d, activation, max_pooling2d, dense] layers

     so for each lesioned layer [conv2d, dense] I will use the following activation layer to take GHA and sel data from.

     Join these into groups using the activation number as the layer numbers.
     e.g., layer 1 (first conv layer) = conv2d_1 & activation_1.  layer 7 (first fc layer) = dense1 & activation 7)

    :param sel_dict_path:  path to selectivity dict
    :param lesion_dict_path: path to lesion dict
    :param plot_type: all classes or OneVsAll.  If n_cats > 20, it automatically switches to OneVsAll.
    :param coi_measure: measure to use when choosing which class should be the coi.  Either the best performing sel
            measures (c_informed, c_ROC) or max class drop from lesioning.
    :param top_layers: if int, it will just do the top n layers (excluding output).  If not int, will do all layers.
    :param selected_units: default is to test all units on all layers.  But If I just want individual units, I should be
                    able to input a dict with layer names as keys and a list for each unit on that layer.
                    e.g., to just get unit 216 from 'fc_1' use selected_units={'fc_1': [216]}.
    :param plots_dir: where to save plots
    :param plot_fails: If False, just plots correct items, if true, plots items that failed after lesioning in RED
    :param plot_class_change: if True, plots proportion of items correct per class.
    :param normed_acts: if False use actual activation values, if True, normalize activations 0-1
    :param layer_act_dist: plot the distribution of all activations on a given layer.
                                This should already have been done in GHA
    :param verbose: how much to print to screen
    :param test_run: if True, just plot two units from two layers, if False, plot all (or selected units)

    returns nothing, just saves the plots
    """

    print("\n**** running visualise_units()****")

    if not selected_units:
        print(f"selected_units?: {selected_units}\n"
              "running ALL layers and units")
    else:
        print(focussed_dict_print(selected_units, 'selected_units'))
    # if type(selected_units) is dict:
    #     print("dict found")

    # # lesion dict
    lesion_dict = load_dict(lesion_dict_path)
    focussed_dict_print(lesion_dict, 'lesion_dict')

    # # get key_lesion_layers_list
    lesion_info = lesion_dict['lesion_info']
    lesion_path = lesion_info['lesion_path']
    lesion_highlighs = lesion_info["lesion_highlights"]
    key_lesion_layers_list = list(lesion_highlighs.keys())

    # # remove unnecesary items from key layers list
    if 'highlights' in key_lesion_layers_list:
        key_lesion_layers_list.remove('highlights')
    # if 'output' in key_lesion_layers_list:
    #     key_lesion_layers_list.remove('output')
    # if 'Output' in key_lesion_layers_list:
    #     key_lesion_layers_list.remove('Output')

    # # remove output layers from key layers list
    if any("utput" in s for s in key_lesion_layers_list):
        output_layers = [s for s in key_lesion_layers_list if "utput" in s]
        output_idx = []
        for out_layer in output_layers:
            output_idx.append(key_lesion_layers_list.index(out_layer))
        min_out_idx = min(output_idx)
        key_lesion_layers_list = key_lesion_layers_list[:min_out_idx]

    # # remove output layers from key layers list
    if any("predictions" in s for s in key_lesion_layers_list):
        output_layers = [s for s in key_lesion_layers_list if "predictions" in s]
        output_idx = []
        for out_layer in output_layers:
            output_idx.append(key_lesion_layers_list.index(out_layer))
        min_out_idx = min(output_idx)
        key_lesion_layers_list = key_lesion_layers_list[:min_out_idx]

    class_labels = list(lesion_dict['data_info']['cat_names'].values())

    # # sel_dict
    sel_dict = load_dict(sel_dict_path)
    if key_lesion_layers_list[0] in sel_dict['sel_info']:
        print('\nfound old sel dict layout')
        key_gha_sel_layers_list = list(sel_dict['sel_info'].keys())
        old_sel_dict = True
        # sel_info = sel_dict['sel_info']
        # short_sel_measures_list = list(sel_info[key_lesion_layers_list[0]][0]['sel'].keys())
        # csb_list = list(sel_info[key_lesion_layers_list[0]][0]['class_sel_basics'].keys())
        # sel_measures_list = short_sel_measures_list + csb_list
    else:
        print('\nfound NEW sel dict layout')
        old_sel_dict = False
        sel_info = load_dict(sel_dict['sel_info']['sel_per_unit_pickle_name'])
        # sel_measures_list = list(sel_info[key_lesion_layers_list[0]][0].keys())
        key_gha_sel_layers_list = list(sel_info.keys())
        # print(sel_info.keys())

    # # get key_gha_sel_layers_list
    # # # remove unnecesary items from key layers list
    # if 'sel_analysis_info' in key_gha_sel_layers_list:
    #     key_gha_sel_layers_list.remove('sel_analysis_info')
    # if 'output' in key_gha_sel_layers_list:
    #     output_idx = key_gha_sel_layers_list.index('output')
    #     key_gha_sel_layers_list = key_gha_sel_layers_list[:output_idx]
    # if 'Output' in key_gha_sel_layers_list:
    #     output_idx = key_gha_sel_layers_list.index('Output')
    #     key_gha_sel_layers_list = key_gha_sel_layers_list[:output_idx]

    # # remove output layers from key layers list
    if any("utput" in s for s in key_gha_sel_layers_list):
        output_layers = [s for s in key_gha_sel_layers_list if "utput" in s]
        output_idx = []
        for out_layer in output_layers:
            output_idx.append(key_gha_sel_layers_list.index(out_layer))
        min_out_idx = min(output_idx)
        key_gha_sel_layers_list = key_gha_sel_layers_list[:min_out_idx]
        # key_layers_df = key_layers_df.loc[~key_layers_df['name'].isin(output_layers)]

    # # remove output layers from key layers list
    if any("predictions" in s for s in key_lesion_layers_list):
        output_layers = [s for s in key_lesion_layers_list if "predictions" in s]
        output_idx = []
        for out_layer in output_layers:
            output_idx.append(key_lesion_layers_list.index(out_layer))
        min_out_idx = min(output_idx)
        key_lesion_layers_list = key_lesion_layers_list[:min_out_idx]

    # # put together lists of 1. sel_gha_layers, 2. key_lesion_layers_list.
    n_activation_layers = sum("activation" in layers for layers in key_gha_sel_layers_list)
    n_lesion_layers = len(key_lesion_layers_list)

    if n_activation_layers == n_lesion_layers:
        # # for models where activation and conv (or dense) are separate layers
        n_layers = n_activation_layers
        activation_layers = [layers for layers in key_gha_sel_layers_list if "activation" in layers]
        link_layers_dict = dict(zip(reversed(activation_layers), reversed(key_lesion_layers_list)))

    elif n_activation_layers == 0:
        print("\nno separate activation layers found - use key_lesion_layers_list")
        n_layers = len(key_lesion_layers_list)
        link_layers_dict = dict(zip(reversed(key_lesion_layers_list), reversed(key_lesion_layers_list)))

    else:
        print(f"n_activation_layers: {n_activation_layers}\n{key_gha_sel_layers_list}")
        print("n_lesion_layers: {n_lesion_layers}\n{key_lesion_layers_list}")
        raise TypeError('should be same number of activation layers and lesioned layers')

    if verbose is True:
        focussed_dict_print(link_layers_dict, 'link_layers_dict')

    # # # get info
    exp_cond_path = sel_dict['topic_info']['exp_cond_path']
    output_filename = sel_dict['topic_info']['output_filename']

    # hid acts hdf
    hdf_name = f'{output_filename}_gha.h5'


    # # load data
    # # check for training data
    use_dataset = sel_dict['GHA_info']['use_dataset']

    n_cats = sel_dict['data_info']["n_cats"]

    if use_dataset in sel_dict['data_info']:
        # n_items = sel_dict["data_info"][use_dataset]["n_items"]
        items_per_cat = sel_dict["data_info"][use_dataset]["items_per_cat"]
    else:
        # n_items = sel_dict["data_info"]["n_items"]
        items_per_cat = sel_dict["data_info"]["items_per_cat"]
    if type(items_per_cat) is int:
        items_per_cat = dict(zip(list(range(n_cats)), [items_per_cat] * n_cats))

    if plot_type != 'OneVsAll':
        if n_cats > 20:
            plot_type = 'OneVsAll'
            print("\n\n\nWARNING!  There are lots of classes, it might make a messy plot"
                  "Switching to OneVsAll\n")

    if sel_dict['GHA_info']['gha_incorrect'] == 'False':
        # # only gha for correct items
        # n_items = sel_dict['GHA_info']['scores_dict']['n_correct']
        items_per_cat = sel_dict['GHA_info']['scores_dict']['corr_per_cat_dict']

    # # load hid acts dict called hid_acts.pickle
    """
    Hid_acts dict has numbers as the keys for each layer.
    Some layers (will be missing) as acts only recorded from some layers (e.g., [17, 19, 20, 22, 25, 26, 29, 30])
    hid_acts_dict.keys(): dict_keys([0, 1, 3, 5, 6, 8, 9, 11, 13, 14, 16, 17, 19, 20, 22, 25, 26, 29, 30])
    hid_acts_dict[0].keys(): dict_keys(['layer_name', 'layer_class', 'layer_shape', '2d_acts', 'converted_to_2d'])
    In each layer there is ['layer_name', 'layer_class', 'layer_shape', '2d_acts']
    For 4d layers (conv, pool) there is also, key, value 'converted_to_2d': True
    """

    # # check if I have saved the location to this file
    hid_acts_pickle_name = sel_dict["GHA_info"]["hid_act_files"]['2d']
    if 'gha_path' in sel_dict['GHA_info']:
        gha_path = sel_dict['GHA_info']['gha_path']
        hid_acts_path = os.path.join(gha_path, hid_acts_pickle_name)
    else:
        hid_act_items = 'all'
        if not sel_dict['GHA_info']['gha_incorrect']:
            hid_act_items = 'correct'

        gha_folder = f'{hid_act_items}_{use_dataset}_gha'
        hid_acts_path = os.path.join(exp_cond_path, gha_folder, hid_acts_pickle_name)
    with open(hid_acts_path, 'rb') as pkl:
        hid_acts_dict = pickle.load(pkl)
    print("\nopened hid_acts.pickle")

    # # # visualizing distribution of activations
    # if layer_act_dist:
    #     print("\nPlotting the distributions of activations for each layer")
    #     for k, v in hid_acts_dict.items():
    #         print("\nPlotting distribution of layer acts")
    #         layer_act_dist_dir = 'layer_act_dist'
    #         print(hid_acts_dict[k]['layer_name'])
    #         hid_acts = hid_acts_dict[k]['2d_acts']
    #         print(np.shape(hid_acts))
    #         sns.distplot(np.ravel(hid_acts))
    #         plt.title(str(hid_acts_dict[k]['layer_name']))
    #         dist_plot_name = "{}_{}_layer_act_distplot.png".format(output_filename, hid_acts_dict[k]['layer_name'])
    #         plt.savefig(os.path.join(plots_dir, layer_act_dist_dir, dist_plot_name))
    #         # plt.show()
    #         plt.close()

    # # dict to get the hid_acts_dict key for each layer based on its name
    get_hid_acts_number_dict = dict()
    for key, value in hid_acts_dict.items():
        hid_acts_layer_name = value['layer_name']
        hid_acts_layer_number = key
        get_hid_acts_number_dict[hid_acts_layer_name] = hid_acts_layer_number

    # # where to save files
    save_plots_name = plots_dir
    if plot_type is "OneVsAll":
        save_plots_name = f'{plots_dir}/{coi_measure}'
    save_plots_dir = lesion_dict['GHA_info']['gha_path']
    save_plots_path = os.path.join(save_plots_dir, save_plots_name)
    if test_run:
        save_plots_path = os.path.join(save_plots_path, 'test')
    if not os.path.exists(save_plots_path):
        os.makedirs(save_plots_path)
    os.chdir(save_plots_path)
    print(f"\ncurrent wd: {os.getcwd()}")

    if layer_act_dist:
        layer_act_dist_path = os.path.join(save_plots_path, 'layer_act_dist')
        if not os.path.exists(layer_act_dist_path):
            os.makedirs(layer_act_dist_path)


    print("\n\n**********************"
          "\nlooping through layers"
          "\n**********************\n")

    for layer_index, (gha_layer_name, lesion_layer_name) in enumerate(link_layers_dict.items()):

        if test_run:
            if layer_index > 2:
                continue

        if type(top_layers) is int:
            if top_layers < n_activation_layers:
                if layer_index > top_layers:
                    continue


        # print(f"\nwhich units?: {selected_units}")
        # if selected_units != 'all':
        if selected_units is not False:
            if gha_layer_name not in selected_units:
                print(f"\nselected_units only, skipping layer {gha_layer_name}")
                continue
            else:
                print(f"\nselected_units only, from {gha_layer_name}")
                # print(f"\t{gha_layer_name} in {list(selected_units.keys())}")
                this_layer_units = selected_units[gha_layer_name]
                print(f"\trunning units: {this_layer_units}")

        gha_layer_number = get_hid_acts_number_dict[gha_layer_name]
        layer_dict = hid_acts_dict[gha_layer_number]

        if gha_layer_name != layer_dict['layer_name']:
            raise TypeError("gha_layer_name (from link_layers_dict) and layer_dict['layer_name'] should match! ")

        # hid_acts_array = layer_dict['2d_acts']
        # hid_acts_df = pd.DataFrame(hid_acts_array, dtype=float)

        with h5py.File(hdf_name, 'r') as gha_data:
            hid_acts_array = gha_data['hid_acts_2d'][gha_layer_name]
            hid_acts_df = pd.DataFrame(hid_acts_array)

        # # visualizing distribution of activations
        if layer_act_dist:
            hid_acts = layer_dict['2d_acts']
            print(f"\nPlotting distribution of activations {np.shape(hid_acts)}")
            sns.distplot(np.ravel(hid_acts))
            plt.title(f"{str(layer_dict['layer_name'])} activation distribution")
            dist_plot_name = "{}_{}_layer_act_distplot.png".format(output_filename, layer_dict['layer_name'])
            plt.savefig(os.path.join(layer_act_dist_path, dist_plot_name))
            if test_run:
                plt.show()
            plt.close()


        # # load item change details
        """# # four possible states
            full model      after_lesion    code
        1.  1 (correct)     0 (wrong)       -1
        2.  0 (wrong)       0 (wrong)       0
        3.  1 (correct)     1 (correct)     1
        4.  0 (wrong)       1 (correct)     2

        """
        item_change_df = pd.read_csv(f"{lesion_path}/{output_filename}_{lesion_layer_name}_item_change.csv",
                                     header=0, dtype=int, index_col=0)

        prop_change_df = pd.read_csv(f'{lesion_path}/{output_filename}_{lesion_layer_name}_prop_change.csv',
                                     header=0,
                                     # dtype=float,
                                     index_col=0)

        if verbose:
            print("\n*******************************************"
                  f"\n{layer_index}. gha layer {gha_layer_number}: {gha_layer_name} \tlesion layer: {lesion_layer_name}"
                  "\n*******************************************")
            # focussed_dict_print(hid_acts_dict[layer_index])
            print(f"\n\thid_acts {gha_layer_name} shape: {hid_acts_df.shape}")
            print(f"\tloaded: {output_filename}_{lesion_layer_name}_item_change.csv: {item_change_df.shape}")

        units_per_layer = len(hid_acts_df.columns)

        print("\n\n\t**** loop through units ****")
        for unit_index, unit in enumerate(hid_acts_df.columns):

            if test_run:
                if unit_index > 2:
                    continue

            # if selected_units != 'all':
            if selected_units is not False:
                if unit not in this_layer_units:
                    # print(f"skipping unit {gha_layer_name} {unit}")
                    continue
                else:
                    print(f"\nrunning unit {gha_layer_name} {unit}")

            # # check unit is in sel_per_unit_dict
            if unit in sel_info[gha_layer_name].keys():
                if verbose:
                    print("found unit in dict")
            else:
                print("unit not in dict\n!!!!!DEAD RELU!!!!!!!!\n...on to the next unit\n")
                continue

            lesion_layer_and_unit = f"{lesion_layer_name}_{unit}"
            output_layer_and_unit = f"{lesion_layer_name}_{unit}"


            print("\n\n*************\n"
                  f"running layer {layer_index} of {n_layers} ({gha_layer_name}): unit {unit} of {units_per_layer}\n"
                  "************")

            # # make new df with just [item, hid_acts*, class, item_change*] *for this unit
            unit_df = item_change_df[["item", "class", lesion_layer_and_unit]].copy()
            # print(hid_acts_df)
            this_unit_hid_acts = hid_acts_df.loc[:, unit]


            # # check for dead relus
            if sum(np.ravel(this_unit_hid_acts)) == 0.0:
                print("\n\n!!!!!DEAD RELU!!!!!!!!...on to the next unit\n")
                continue

            if verbose:
                print(f"\tnot a dead unit, hid acts sum: {sum(np.ravel(this_unit_hid_acts)):.2f}")

            unit_df.insert(loc=1, column='hid_acts', value=this_unit_hid_acts)
            unit_df = unit_df.rename(index=str, columns={lesion_layer_and_unit: 'item_change'})

            if verbose is True:
                print(f"\n\tall items - unit_df: {unit_df.shape}")

            # # remove rows where network failed originally and after lesioning this unit - uninteresting
            old_df_length = len(unit_df)
            unit_df = unit_df.loc[unit_df['item_change'] != 0]
            if verbose is True:
                n_fail_fail = old_df_length - len(unit_df)
                print(f"\n\t{n_fail_fail} fail-fail items removed - new shape unit_df: {unit_df.shape}")

            # # get items per class based on their occurences in the dataframe.
            # # this includes fail-pass, pass-pass and pass-fail - but not fail-fail
            no_fail_fail_ipc = unit_df['class'].value_counts(sort=False)

            df_ipc = dict()
            for i in range(n_cats):
                df_ipc[i] = no_fail_fail_ipc[i]

            # # # calculate the proportion of items that failed.
            # # # this is not the same as total_unit_change (which takes into account fail-pass as well as pass-fail)
            # df_ipc_total = sum(df_ipc.values())
            # l_failed_df = unit_df[(unit_df['item_change'] == -1)]
            # l_failed_count = len(l_failed_df)
            #
            # print("\tdf_ipc_total: {}".format(df_ipc_total))
            # print("\tl_failed_count: {}".format(l_failed_count))

            # # getting max_class_drop
            max_class_drop_col = prop_change_df.loc[:, str(unit)]
            total_unit_change = max_class_drop_col['total']
            max_class_drop_col = max_class_drop_col.drop(labels=['total'])
            max_class_drop_val = max_class_drop_col.min()
            max_drop_class = max_class_drop_col.idxmin()
            print(f"\n\tmax_class_drop_val: {max_class_drop_val}\n"
                  f"\tmax_drop_class: {max_drop_class}\n"
                  f"\ttotal_unit_change: {total_unit_change}")

            # # getting best sel measure (max_informed)
            main_sel_name = 'informedness'

            # # includes if statement since some units have not score (dead relu?)
            if old_sel_dict:
                main_sel_val = sel_dict['sel_info'][gha_layer_name][unit]['max']['informed']
                main_sel_class = int(sel_dict['sel_info'][gha_layer_name][unit]['max']['c_informed'])
            else:
                # print(sel_info[gha_layer_name][unit]['max'])
                main_sel_val = sel_info[gha_layer_name][unit]['max']['max_informed']
                main_sel_class = int(sel_info[gha_layer_name][unit]['max']['max_informed_c'])

            print(f"\tmain_sel_val: {main_sel_val}")
            print(f"\tmain_sel_class: {main_sel_class}")

            # # coi stands for Class Of Interest
            # # if doing oneVsAll I need to have a coi measure. (e.g., clas with max informed 'c_informed')
            if plot_type is "OneVsAll":

                # # get coi
                if coi_measure == 'max_class_drop':
                    coi = max_drop_class
                elif coi_measure == 'c_informed':
                    coi = main_sel_class
                else:
                    coi = int(sel_dict['sel_info'][gha_layer_name][unit]['max'][coi_measure])
                print(f"\n\tcoi: {coi}  ({coi_measure})")

                # # get new class labels based on coi, OneVsAll
                all_classes_col = unit_df['class'].astype(int)

                one_v_all_class_list = [1 if x is coi else 0 for x in all_classes_col]
                print(f"\tall_classes_col: {len(all_classes_col)}  one_v_all_class_list: {len(one_v_all_class_list)}")

                if 'OneVsAll' not in list(unit_df):
                    print("\tadding 'OneVsAll'")
                    print("\treplacing all classes with 'OneVsAll'class column")
                    unit_df['class'] = one_v_all_class_list


            min_act = unit_df['hid_acts'].min()

            if normed_acts:
                if min_act >= 0.0:
                    print("\nnormalising activations")
                    this_unit_normed_acts = np.divide(unit_df['hid_acts'], unit_df['hid_acts'].max())
                    unit_df['normed'] = this_unit_normed_acts
                    print(unit_df.head())
                else:
                    print("\ncan not do normed acts on this unit")
                    normed_acts = False


            # # # did any items fail that were previously at zero
            print(f"\n\tsmallest activation on this layer was {min_act}")
            l_failed_df = unit_df[(unit_df['item_change'] == -1)]
            l_failed_df = l_failed_df.sort_values(by=['hid_acts'])

            min_failed_act = l_failed_df['hid_acts'].min()
            print(f"\n\tsmallest activation of items that failed after lesioning was {min_failed_act}")
            if min_failed_act == 0.0:
                fail_zero_df = l_failed_df.loc[l_failed_df['hid_acts'] == 0.0]
                fail_zero_count = len(fail_zero_df.index)
                print(f"\n\tfail_zero_df: {fail_zero_count} items\n\t{fail_zero_df.head()}")
                fail_zero_df.to_csv(f"{output_filename}_{gha_layer_name}_{unit}_fail_zero_df.csv", index=False)


            # # make plot of class changes
            # if plot_fails is True:
            if plot_class_change:
                class_prop_change = prop_change_df.iloc[:-1, unit].to_list()
                print(f"\n\tclass_prop_change: {class_prop_change}")

                # change scale if there are big changes
                class_change_x_min = -.5
                if min(class_prop_change) < class_change_x_min:
                    class_change_x_min = min(class_prop_change)

                class_change_x_max = .1
                if max(class_prop_change) > class_change_x_max:
                    class_change_x_max = max(class_prop_change)

                class_change_curve = sns.barplot(x=class_prop_change, y=class_labels, orient='h')
                class_change_curve.set_xlim([class_change_x_min, class_change_x_max])
                class_change_curve.axvline(0, color="k", clip_on=False)
                plt.subplots_adjust(left=0.15)  # just to fit the label 'automobile' on

                print(f'\nclass num: {class_prop_change.index(min(class_prop_change))}, '
                      f'class label: {class_labels[class_prop_change.index(min(class_prop_change))]}, '
                      f'class_val: {min(class_prop_change):.2f}'
                      )

                plt.title(f"{lesion_layer_and_unit}\n"
                          f"total change: {total_unit_change:.2f} "
                          f"max_class ({class_labels[class_prop_change.index(min(class_prop_change))]}): "
                          f"{min(class_prop_change):.2f}")
                plt.savefig(f"{output_filename}_{output_layer_and_unit}_class_prop_change.png")

                if test_run:
                    plt.show()

                plt.close()



            # # # # # # # # # # # #
            # # raincloud plots # #
            # # # # # # # # # # # #

            # # # plot title
            if plot_fails:
                title = f"Layer: {gha_layer_name} Unit: {unit}\nmax_class_drop: {max_class_drop_val:.2f} " \
                        f"({max_drop_class}), total change: {total_unit_change:.2f}\n" \
                        f"{main_sel_name}: {main_sel_val:.2f} ({main_sel_class})"

                if plot_type == "OneVsAll":
                    title = f"Layer: {gha_layer_name} Unit: {unit} class: {coi}\n" \
                            f"max_class_drop: {max_class_drop_val:.2f} ({max_drop_class}), " \
                            f"total change: {total_unit_change:.2f}" \
                            "\n{main_sel_name}: {main_sel_val:.2f} ({main_sel_class})"
            else:
                title = f"Layer: {gha_layer_name} Unit: {unit}\n" \
                        f"{main_sel_name}: {main_sel_val:.2f} ({main_sel_class})"

                if plot_type == "OneVsAll":
                    title = f"Layer: {gha_layer_name} Unit: {unit} class: {coi}\n" \
                            f"{main_sel_name}: {main_sel_val:.2f} ({main_sel_class})"
            print(f"\ntitle:\n{title}")

            # # # load main dataframe
            raincloud_data = unit_df
            # print(raincloud_data.head())

            plot_y_vals = "class"
            # use_this_ipc = items_per_cat
            use_this_ipc = df_ipc

            if plot_type is "OneVsAll":
                print("\t\n\n\nUSE OneVsAll mode")
                n_cats = 2
                items_per_coi = use_this_ipc[coi]
                other_items = sum(df_ipc.values()) - items_per_coi
                use_this_ipc = {0: other_items, 1: items_per_coi}
                print(f"\tcoi {coi}, items_per_cat {items_per_cat}")

            # # # choose colours
            use_colours = 'tab10'
            if 10 < n_cats < 21:
                use_colours = 'tab20'
            elif n_cats > 20:
                print("\tERROR - more classes than colours!?!?!?")
            sns.set_palette(palette=use_colours, n_colors=n_cats)

            # Make MULTI plot
            fig = plt.figure(figsize=(10, 5))
            gs = gridspec.GridSpec(1, 2, width_ratios=[1, 4])
            zeros_axis = plt.subplot(gs[0])
            rain_axis = plt.subplot(gs[1])

            # # # # # # # # # # # #
            # # make zeros plot # #
            # # # # # # # # # # # #

            # 1. get biggest class size (for max val of plot)
            max_class_size = max(use_this_ipc.values())
            print(f"\tmax_class_size: {max_class_size}")

            # 2. get list or dict of zeros per class
            zeros_dict = {}
            for k in range(n_cats):
                if plot_type is "OneVsAll":
                    plot_names = ["all_others", f"class_{coi}"]
                    this_name = plot_names[k]
                    this_class = unit_df.loc[unit_df['OneVsAll'] == k]
                    zero_count = 0 - (this_class['hid_acts'] == 0).sum()
                    zeros_dict[this_name] = zero_count
                else:
                    this_class = unit_df.loc[unit_df['class'] == k]
                    zero_count = 0 - (this_class['hid_acts'] == 0).sum()
                    zeros_dict[k] = zero_count

            # zd_classes = list(zeros_dict.keys())
            # zd_classes = list(lesion_dict['data_info']['cat_names'].values())
            zd_zero_count = list(zeros_dict.values())

            if verbose:
                print(f"\n\tzeros_dict:{zeros_dict.values()}, use_this_ipc:{use_this_ipc.values()}")

            zd_zero_perc = [x / y * 100 if y else 0 for x, y in zip(zeros_dict.values(), use_this_ipc.values())]

            zd_data = {"class": class_labels, "zero_count": zd_zero_count, "zero_perc": zd_zero_perc}

            zeros_dict_df = pd.DataFrame.from_dict(data=zd_data)

            # zero_plot
            sns.catplot(x="zero_perc", y="class", data=zeros_dict_df, kind="bar", orient='h', ax=zeros_axis)

            zeros_axis.set_xlabel("% at zero (height reflects n items)")

            zeros_axis.set_xlim([-100, 0])

            # # set width of bar to reflect class size
            new_heights = [x / max_class_size for x in use_this_ipc.values()]
            print(f"\tuse_this_ipc: {use_this_ipc}\n\tnew_heights: {new_heights}")

            # def change_height(zeros_axis, new_value):
            patch_count = 0
            for patch in zeros_axis.patches:
                current_height = patch.get_height()
                make_new_height = current_height * new_heights[patch_count]
                diff = current_height - make_new_height

                if new_heights[patch_count] < 1.0:
                    # print("{}. current_height {}, new_height: {}".format(patch, current_height, make_new_height))

                    # # change the bar height
                    patch.set_height(make_new_height)

                    # # recenter the bar
                    patch.set_y(patch.get_y() + diff * .65)

                patch_count = patch_count + 1


            zeros_axis.set_xticklabels(['100', '50', ''])
            # zeros_axis.xaxis.set_major_locator(plt.MaxNLocator(1))
            plt.close()

            # # # # # # # # #
            # # raincloud # #
            # # # # # # # # #

            data_values = "hid_acts"  # float
            if normed_acts:
                data_values = 'normed'
            data_class = plot_y_vals  # class
            orientation = "h"  # orientation

            # cloud_plot
            pt.half_violinplot(data=raincloud_data, bw=.1, linewidth=.5, cut=0., width=1, inner=None,
                               orient=orientation, x=data_values, y=data_class, scale="count")  # scale="area"

            """# # rain_drops - plot 3 separate plots so that they are interesting items are ontop of pass-pass
            # # zorder is order in which items are printed
            # # item_change: 1 ('grey') passed before and after lesioning
            # # -1 ('red') passed in full model but failed when lesioned
            # # 2 ('green') failed in full model but passed in lesioning"""
            fail_palette = {1: "silver", -1: "red", 2: "green", 0: "orange"}


            # # separate rain drops for pass pass,
            pass_pass_df = unit_df[(unit_df['item_change'] == 1)]
            pass_pass_drops = sns.stripplot(data=pass_pass_df, x=data_values, y=data_class, jitter=1, zorder=1,
                                            size=2, orient=orientation)  # , hue='item_change', palette=fail_palette)

            if plot_fails is True:

                '''I'm not using this atm, but if I want to plot items that originally failed and later passed'''
                # # separate raindrop for fail pass
                # fail_pass_df = unit_df[(unit_df['item_change'] == 2)]
                # if not fail_pass_df.empty:
                #     fail_pass_drops = sns.stripplot(data=fail_pass_df, x=data_values, y=data_class, jitter=1,
                #                                     zorder=3, size=4, orient=orientation, hue='item_change',
                #                                     palette=fail_palette, edgecolor='gray', linewidth=.4, marker='s',
                #                                     label='')

                # # separate raindrops for pass fail
                if not l_failed_df.empty:
                    # pass_fail_drops
                    sns.stripplot(data=l_failed_df, x=data_values, y=data_class, jitter=1, zorder=4, size=4,
                                  orient=orientation, hue='item_change', palette=fail_palette, edgecolor='white',
                                  linewidth=.4, marker='s')

            # box_plot
            sns.boxplot(data=raincloud_data, color="gray", orient=orientation, width=.15, x=data_values,
                        y=data_class, zorder=2, showbox=False,
                        # boxprops={'facecolor': 'none', "zorder": 2},
                        showfliers=False, showcaps=False,
                        whiskerprops={'linewidth': .01, "zorder": 2}, saturation=1,
                        # showwhiskers=False,
                        medianprops={'linewidth': .01, "zorder": 2},
                        showmeans=True,
                        meanprops={"marker": "*", "markerfacecolor": "white", "markeredgecolor": "black"}
                        )

            # # Finalize the figure
            rain_axis.set_xlabel("Unit activations")
            if normed_acts:
                rain_axis.set_xlabel("Unit activations (normalised)")

            # new_legend_text = ['l_passed', 'l_failed']
            new_legend_text = ['l_failed']

            leg = pass_pass_drops.axes.get_legend()
            if leg:
                # in here because leg is None if no items changed when this unit was lesioned
                for t, l in zip(leg.texts, new_legend_text):
                    t.set_text(l)

            # # hid ticks and labels from rainplot
            plt.setp(rain_axis.get_yticklabels(), visible=False)
            rain_axis.axes.get_yaxis().set_visible(False)

            # # put plots together
            max_activation = max(this_unit_hid_acts)
            min_activation = min(this_unit_hid_acts)
            if normed_acts:
                max_activation = max(this_unit_normed_acts)
                min_activation = min(this_unit_normed_acts)

            max_x_val = max_activation * 1.05
            layer_act_func = None
            for k, v in lesion_dict['model_info']['layers']['hid_layers'].items():
                if v['name'] == gha_layer_name:
                    layer_act_func = v['act_func']
                    break
            if layer_act_func in ['relu', 'Relu', 'ReLu']:
                min_x_val = 0
            elif min_activation > 0.0:
                min_x_val = 0
            else:
                min_x_val = min_activation

            rain_axis.set_xlim([min_x_val, max_x_val])
            rain_axis.get_shared_y_axes().join(zeros_axis, rain_axis)
            fig.subplots_adjust(wspace=0)

            fig.suptitle(title, fontsize=12).set_position([.5, 1.0])  # .set_bbox([])  #

            # # add y axis back onto rainplot
            plt.axvline(x=min_x_val, linestyle="-", color='black', )

            # # add marker for max informedness
            if 'info' in coi_measure:
                if old_sel_dict:
                    normed_info_thr = sel_dict['sel_info'][gha_layer_name][unit]['max']['thr_informed']
                else:
                    print(sel_info[gha_layer_name][unit]['max'])
                    normed_info_thr = sel_info[gha_layer_name][unit]['max']['max_info_thr']

                if normed_acts:
                    best_info_thr = normed_info_thr
                else:
                    # unnormalise it
                    best_info_thr = normed_info_thr * max(this_unit_hid_acts)
                print(f"\tbest_info_thr: {best_info_thr}")
                plt.axvline(x=best_info_thr, linestyle="--", color='grey')

            # sns.despine(right=True)

            if plot_type is "OneVsAll":
                plt.savefig(f"{output_filename}_{gha_layer_name}_{unit}_cat{coi}_raincloud.png")

            else:
                plt.savefig(f"{output_filename}_{gha_layer_name}_{unit}_raincloud.png")

            if test_run:
                plt.show()

            print("\n\tplot finished\n")

            # # clear for next round
            plt.close()

    # # plt.show()
    print("End of script")
示例#6
0
def generate_figure(data_in, column, path_output):
    """Draw a half-violin + box + strip figure of one metric and save it.

    The three layers are grouped and coloured by the ``"Manufacturer"`` column
    of ``data_in`` and drawn on a single categorical position.

    Args:
        data_in: DataFrame holding at least ``column`` and ``"Manufacturer"``.
        column: Name of the column to plot. ``'CNR_single/t'`` is special-cased:
            its values are multiplied by 100 for display and the output file
            is named ``figure_CNR_single_t``.
        path_output: Directory in which the figure file is written.

    Returns:
        None. The figure is written via ``plt.savefig`` to
        ``<path_output>/figure_<column>``.
    """
    hue = "Manufacturer"
    pal = ["#1E90FF", "#32CD32", "#FF0000"]
    coeff = 100 if column == 'CNR_single/t' else 1

    # Scale ONLY the plotted column. Multiplying the whole frame also
    # "multiplies" string columns ("GE" * 100 -> "GEGEGE..."), which is what
    # used to corrupt the legend labels and forced them to be hard-coded.
    data = data_in.copy()
    data[column] = data[column] * coeff

    # All samples share one categorical x position; the hue separates groups.
    dx = np.ones(len(data[column]))
    dy = column

    f, ax = plt.subplots(figsize=(4, 6))
    ax = pt.half_violinplot(x=dx,
                            y=dy,
                            data=data,
                            hue=hue,
                            palette=pal,
                            bw=.4,
                            cut=0.,
                            linewidth=0.,
                            scale="area",
                            width=.8,
                            inner=None,
                            orient="v",
                            dodge=False,
                            alpha=.4,
                            offset=0.5)
    ax = sns.boxplot(x=dx,
                     y=dy,
                     data=data,
                     hue=hue,
                     color="black",
                     palette=pal,
                     showcaps=True,
                     boxprops={
                         'facecolor': 'none',
                         "zorder": 10
                     },
                     showmeans=True,
                     meanprops={
                         "marker": "^",
                         "markerfacecolor": "black",
                         "markeredgecolor": "black",
                         "markersize": "8"
                     },
                     showfliers=True,
                     whiskerprops={
                         'linewidth': 2,
                         "zorder": 10
                     },
                     saturation=1,
                     orient="v",
                     dodge=True)
    ax = sns.stripplot(x=dx,
                       y=dy,
                       data=data,
                       hue=hue,
                       palette=pal,
                       edgecolor="white",
                       size=3,
                       jitter=1,
                       zorder=0,
                       orient="v",
                       dodge=True)
    plt.xlim([-1, 0.5])

    # Every layer contributes its own legend entries, so keep only the first
    # entry per hue level. The labels now come from the data itself, which
    # keeps them in sync with the colours whatever the row order is.
    handles, labels = ax.get_legend_handles_labels()
    n_levels = data[hue].nunique()
    _ = plt.legend(handles[:n_levels], labels[:n_levels],
                   bbox_to_anchor=(1.05, 1),
                   loc=2,
                   borderaxespad=0.,
                   title=str(hue))
    f.gca().invert_xaxis()
    adjust_box_widths(f, 0.6)

    # special hack: '/' cannot be used in the file name, and the axis label
    # shows the square-root sign.
    if column == 'CNR_single/t':
        plt.xlabel('CNR_single/√t')
        fname_out = os.path.join(path_output, 'figure_CNR_single_t')
    else:
        plt.xlabel(column)
        fname_out = os.path.join(path_output, 'figure_' + column)
    # remove ylabel
    plt.ylabel('')
    # hide xtick
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    plt.savefig(fname_out, bbox_inches='tight', dpi=300)
示例#7
0
def compare_datasets():
    """Plot train/test error distributions for five feature-set variants.

    Runs ``train_on_split`` on the POSIX, POSIX+runtime, MPI-IO,
    POSIX+MPI-IO and POSIX+Cobalt feature sets, gathers the returned
    train/test arrays into one long-form frame (log10-transformed), and draws
    two stacked panels (training set on top, test set below), each made of
    half violins, a random sub-sample of points and a box plot.

    Returns:
        None. The figure is saved to ``figures/figure_2.pdf``.
    """
    # Single source of truth for the left-to-right category order. Every
    # plotting layer and the x tick labels use it, so they cannot drift apart.
    type_order = ['posix', 'runtime', 'mpiio', 'both', 'cobalt']
    display_names = {
        'posix': "POSIX",
        'runtime': "POSIX+runtime",
        'mpiio': "MPI-IO",
        'both': "POSIX+MPI-IO",
        'cobalt': "POSIX+Cobalt",
    }

    def get_runtime_results():
        """Train on POSIX data loaded with and without ``remove_runtime``."""
        df, features = load_dataset(module="POSIX", remove_runtime=True)
        df_rt, features_rt = load_dataset(module="POSIX", remove_runtime=False)

        posix_train, posix_test = train_on_split(df, features)
        runtime_train, runtime_test = train_on_split(df_rt, features_rt)

        return posix_train, posix_test, runtime_train, runtime_test

    def get_mpiio_results():
        """Train on the combined dataset using MPI-IO-only and all features."""
        # NOTE(review): the POSIX feature list is loaded but never used; the
        # call is kept in case load_dataset has side effects - confirm and drop.
        _, features_posix = load_dataset(module="POSIX", remove_runtime=True)
        _, features_mpiio = load_dataset(module="MPIIO", remove_runtime=True)
        df_both, features_both = load_dataset(module="both",
                                              remove_runtime=True)

        mpiio_train, mpiio_test = train_on_split(df_both, features_mpiio)
        both_train, both_test = train_on_split(df_both, features_both)

        return mpiio_train, mpiio_test, both_train, both_test

    def get_cobalt_results(multiple_allocations='ignore'):
        """Train on POSIX data joined with the Cobalt job log.

        With ``multiple_allocations='ignore'`` only JOBIDs that occur exactly
        once in the POSIX frame are kept before the merge.
        """
        df, features = load_dataset(module="POSIX", remove_runtime=True)
        cobalt = pd.read_csv("data/cobalt_theta_2017_2020.csv")

        features_cobalt = features + [
            "NODES_USED_LOG10", "USED_CORE_HOURS_LOG10"
        ]

        if multiple_allocations == 'ignore':
            alloc_sizes = df.groupby(["JOBID"]).size()
            df = df[df.JOBID.isin(alloc_sizes[alloc_sizes == 1].index)]
        df = pd.merge(df, cobalt, left_on=["JOBID"], right_on=["COBALT_JOBID"])

        df["NODES_USED_LOG10"] = np.log10(df.NODES_USED)
        df["USED_CORE_HOURS_LOG10"] = np.log10(df.USED_CORE_HOURS)

        cobalt_train, cobalt_test = train_on_split(df, features_cobalt)

        return cobalt_train, cobalt_test

    posix_train, posix_test, runtime_train, runtime_test = get_runtime_results(
    )
    mpiio_train, mpiio_test, both_train, both_test = get_mpiio_results()
    cobalt_train, cobalt_test = get_cobalt_results()

    splits = {
        'posix': (posix_train, posix_test),
        'runtime': (runtime_train, runtime_test),
        'mpiio': (mpiio_train, mpiio_test),
        'both': (both_train, both_test),
        'cobalt': (cobalt_train, cobalt_test),
    }

    # Long-form frame: one row per error value, tagged with split and variant.
    errors, set_labels, type_labels = [], [], []
    for name in type_order:
        train, test = splits[name]
        errors.extend([train, test])
        set_labels.extend(['train'] * train.shape[0] +
                          ['test'] * test.shape[0])
        type_labels.extend([name] * (train.shape[0] + test.shape[0]))

    results = pd.DataFrame({
        'error': np.concatenate(errors),
        'set': set_labels,
        'type': type_labels,
    })

    # Problems with log axes make me have to modify the data
    results['error'] = np.log10(results['error'])

    #
    # Plotting
    #
    dx = 'type'
    dy = 'error'
    pal = "tab10"
    ort = 'v'

    # Only errors below 2x are drawn; the medians in the tick labels are
    # still computed on the unfiltered results.
    df = results[results['error'] < np.log10(2)]

    plt.figure(figsize=(1.65 * 2, 2))

    def sample_type_equally(df, sample):
        """
        Given multiple types, makes sure each has equal representation

        Groups are visited in ``type_order`` (not in arbitrary set order) and
        a group smaller than ``sample`` contributes all of its rows instead
        of raising.
        """
        dfs = []
        for name in type_order:
            group = df[df['type'] == name]
            if len(group):
                dfs.append(group.sample(min(sample, len(group))))

        return pd.concat(dfs)

    def draw_panel(ax, subset, title):
        """Draw violin + sampled points + box plot for one split on ``ax``."""
        pt.half_violinplot(x=dx,
                           y=dy,
                           data=subset,
                           order=type_order,
                           palette=pal,
                           bw=.1,
                           cut=0.,
                           scale="width",
                           width=1.,
                           inner=None,
                           orient=ort,
                           linewidth=0.8,
                           offset=0.2,
                           ax=ax)

        sns.stripplot(x=dx,
                      y=dy,
                      data=sample_type_equally(subset, 500),
                      order=type_order,
                      palette=pal,
                      edgecolor="white",
                      size=1,
                      jitter=1,
                      zorder=1,
                      orient=ort,
                      alpha=0.5,
                      ax=ax)

        sns.boxplot(x=dx,
                    y=dy,
                    data=subset,
                    order=type_order,
                    color="black",
                    width=.2,
                    zorder=10,
                    showcaps=True,
                    boxprops={
                        'facecolor': 'none',
                        "zorder": 10
                    },
                    showfliers=True,
                    whiskerprops={
                        'linewidth': 1,
                        "zorder": 10
                    },
                    saturation=1,
                    orient=ort,
                    fliersize=0,
                    linewidth=1,
                    ax=ax)

        # Data is log10-transformed, so place ticks at log10 of the factors
        # and label them with the untransformed values.
        yticks = [1, 1.2, 1.5, 2]
        ax.set_yticks(np.log10(yticks))
        ax.set_yticklabels([r"{:.2f} $\times$".format(y) for y in yticks])
        ax.set_ylabel("Absolute Error")

        ax.set_xlim(-0.8, 4.3)
        ax.set_xlabel("")
        ax.set_title(title)

    #
    # Top figure
    #
    ax = plt.subplot(211)
    draw_panel(ax, df[df['set'] == 'train'], "Training set")
    plt.xticks([], [])

    #
    # Second figure
    #
    ax = plt.subplot(212)
    draw_panel(ax, df[df['set'] == 'test'], "Test set")

    # Median of the 'error' column only: taking the median of the whole frame
    # depends on non-numeric columns being silently dropped.
    test_results = results[results['set'] == 'test']
    tick_labels = []
    for name in type_order:
        median_log = test_results.loc[test_results['type'] == name,
                                      'error'].median()
        tick_labels.append("{}\nTest set median={:.2f}$\\times$".format(
            display_names[name], 10**median_log))

    # plt.xticks(range(5), ["POSIX", "POSIX+runtime", "MPI-IO", "POSIX+MPI-IO", "POSIX+Cobalt"], rotation=30)
    plt.xticks(np.arange(len(type_order)) - 0.5,
               tick_labels,
               rotation=30,
               ha='right')

    plt.savefig("figures/figure_2.pdf", dpi=600, bbox_inches='tight')
示例#8
0
# Look up the label for each entry of `order` in `ylabel_map`, keeping the
# same sequence as `order`.
ylabels = [ylabel_map[i] for i in order]

# %% Plot

# Everything drawn inside this block uses seaborn's "paper" plotting context
# with fonts scaled by 1.3.
with sns.plotting_context("paper", font_scale=1.3):
    fig, ax = plt.subplots(figsize=(LETTER_WIDTH_INCH, 5))

    palette = "colorblind"

    # Half violins of `latency_ms` per `device` (categories placed in `order`),
    # with the `os` column as hue, drawn in the fixed order Linux, Windows.
    ptitprince.half_violinplot(
        x="device",
        order=order,
        y="latency_ms",
        hue="os",
        hue_order=["Linux", "Windows"],
        data=df,
        ax=ax,
        palette=palette,
        split=True,
        inner=None,
        offset=0.3,
    )

    # Set alpha to 0.65 on every collection currently attached to the axes
    # (at this point only the half-violin call above has drawn on `ax`).
    for i in ax.collections:
        i.set_alpha(0.65)

    sns.stripplot(
        x="device",
        order=order,
        y="latency_ms",
        hue="os",
示例#9
0
def plot_violins(df, count):
    """
    Draw per-application error distributions and save them to disk.

    The frame is first restricted to the `count` most frequent values of
    ``df.app``. Three layers are then drawn on one small figure, all with
    ``app`` on the x axis and ``error`` on the y axis: a half violin, a strip
    plot of the rows returned by ``sample_apps_equally(df, 1000)``, and an
    unfilled black box plot. The y ticks are placed at ``log10`` of a fixed
    list of factors and the result is written to ``figures/figure_1b.pdf``.

    NOTE(review): the x tick labels are a fixed list of five names, so they
    only line up with the data when exactly those five applications are
    plotted in that order - confirm against the caller.
    """
    # Restrict the frame to the `count` most frequent applications.
    ranked = Counter(df.app).most_common()[:count]
    top_apps = [name for name, _ in ranked]
    df = df[df.app.isin(top_apps)]

    # Arguments shared by all three layers.
    shared = {"x": 'app', "y": 'error', "orient": "v"}
    colours = "tab10"

    plt.figure(figsize=(1.65, 1.65))

    import ptitprince as pt

    # Layer 1: half violin of the full (filtered) data.
    pt.half_violinplot(data=df,
                       palette=colours,
                       bw=.1,
                       cut=0.,
                       scale="width",
                       width=1.,
                       inner=None,
                       linewidth=0.8,
                       offset=0.2,
                       **shared)

    # Layer 2: individual points, drawn from a subsample of the data.
    sns.stripplot(data=sample_apps_equally(df, 1000),
                  palette=colours,
                  edgecolor="white",
                  size=1,
                  jitter=1,
                  zorder=1,
                  alpha=0.5,
                  **shared)

    # Layer 3: transparent box plot on top of the other two layers.
    box_style = {'facecolor': 'none', "zorder": 10}
    whisker_style = {'linewidth': 1, "zorder": 10}
    ax = sns.boxplot(data=df,
                     color="black",
                     width=.2,
                     zorder=10,
                     showcaps=True,
                     boxprops=box_style,
                     showfliers=True,
                     whiskerprops=whisker_style,
                     saturation=1,
                     fliersize=0,
                     linewidth=1,
                     **shared)

    # Ticks sit at log10 of each factor; labels show the factor itself.
    factors = [1 / 2, 1 / 1.5, 1 / 1.2, 1, 1.2, 1.5, 2]
    factor_labels = [r"{:.2f} $\times$".format(f) for f in factors]
    ax.set_yticks(np.log10(factors))
    ax.set_yticklabels(factor_labels)

    app_labels = ["Writer", "pw.x", "HACC", "IOR", "QB"]
    ax.set_xticklabels(app_labels, rotation=30)
    ax.set_xlabel("Application")
    ax.set_ylabel("Error")
    ax.set_axisbelow(True)

    plt.xlim(-0.8, 4.3)
    plt.ylim(np.log10(factors[0]), np.log10(factors[-1]))

    plt.savefig("figures/figure_1b.pdf", dpi=600, bbox_inches='tight')
示例#10
0
# ################################################################
# Parameters plot
##################################################################

# One column of axes per parameter; each gets its own colour from `pal`.
fig, ax = plt.subplots(1, 4, figsize=(8, 5))
pal = sns.color_palette("deep", 5)
labels = [r'$\omega_2$', r'$\beta_0$', r'$\beta_1$', r'$\zeta$']
for idx, var in enumerate(['om_2', 'be0', 'be1', 'ze']):
    panel = ax[idx]
    colour = pal[idx]

    # Mean of the parameter per 'sub', melted to variable/value columns.
    data_param = data.groupby(['sub'])[var].mean().reset_index()
    dplot = data_param.melt(['sub'])

    # Arguments common to the three layers drawn on this panel.
    layer = dict(x='variable', y="value", data=dplot, ax=panel)

    # Half violin.
    pt.half_violinplot(inner=None,
                       jitter=True,
                       color=colour,
                       lwidth=0,
                       width=0.6,
                       offset=0.17,
                       cut=1,
                       linewidth=1,
                       alpha=0.6,
                       zorder=19,
                       **layer)
    # Individual points.
    sns.stripplot(jitter=0.08,
                  linewidth=1,
                  alpha=0.6,
                  color=colour,
                  zorder=1,
                  **layer)
    # Narrow box plot with whiskers spanning the full data range.
    sns.boxplot(color=colour,
                whis=np.inf,
                linewidth=1,
                width=0.1,
                boxprops={"zorder": 10, 'alpha': 0.5},
                whiskerprops={'zorder': 10, 'alpha': 1},
                medianprops={'zorder': 11, 'alpha': 0.5},
                **layer)

    panel.set_xticklabels([labels[idx]], fontsize=param['labelfontsize'])
    # Only the leftmost panel carries a y-axis label.
    if idx:
        panel.set_ylabel('')
    else:
        panel.set_ylabel('Value', fontsize=param['labelfontsize'])
    panel.set_xlabel('')
示例#11
0
       
    fs = []
    for k in range(1):
        for i in ['fc','fs_5dis','fs_6dis']:
            fs+=[i for j in range(fs_ref.shape[1])]
        
    ref = []
    for k in ['Rep_50']:
        ref += [k for i in range(3*fs_ref.shape[1])]
    
    data = pd.DataFrame({'icc':icc, 'icc_msr':icc_msr, 'icc_mse':icc_mse, 'fs':fs, 'ref': ref})
        
    plt.figure(figsize=(20, 10))
    sns.pointplot(x="ref", y="icc_msr", data=data, hue= 'fs', dodge=0.53, join=False, palette="dark",markers="d", scale=.75, ci='sd',capsize = 0.07)
    sns.stripplot(x="ref", y="icc_msr", data=data, hue= 'fs', size = 3, dodge=0.45, alpha = 0.05).set_title('Edge-wise ICC MSr')
    pt.half_violinplot(x="ref", y="icc_msr", data=data, hue= 'fs',scale = "area",inner = None, offset = 0.03, saturation=0.5)
    plt.legend(ncol=2)       
    plt.savefig(plotd+'icc_msr.png')######
    plt.close()

    plt.figure(figsize=(20, 10))
    sns.pointplot(x="ref", y="icc_mse", data=data, hue= 'fs', dodge=0.53, join=False, palette="dark",markers="d", scale=.75, ci='sd',capsize = 0.07)
    sns.stripplot(x="ref", y="icc_mse", data=data, hue= 'fs', size = 3, dodge=0.45, alpha = 0.05).set_title('Edge-wise ICC MSe')
    pt.half_violinplot(x="ref", y="icc_mse", data=data, hue= 'fs',scale = "area",inner = None, offset = 0.03, saturation=0.5)

    t1,p1 = stats.ttest_ind(icc_mse[0:int(len(icc_mse)/3)],icc_mse[int(len(icc_mse)/3):int(len(icc_mse)/3*2)], nan_policy ='omit', equal_var=False)
    t2,p2 = stats.ttest_ind(icc_mse[0:int(len(icc_mse)/3)],icc_mse[int(len(icc_mse)/3*2):], nan_policy ='omit', equal_var=False)

    plt.text(-0.15,0,'T: '+str(round(t1,5))+'\n'+'P: '+str(round(p1,5)),fontsize=18)
    plt.text(0.15,0,'T: '+str(round(t2,5))+'\n'+'P: '+str(round(p2,5)),fontsize=18)
    plt.legend(ncol=2)
示例#12
0
                   width_viol=.7,
                   ax=ax4,
                   orient=ort,
                   alpha=.65,
                   dodge=True,
                   move=.2)
# Drop the legend and put the y axis on a log scale before saving.
ax4.get_legend().remove()
ax4.set_yscale("log")

fig.savefig(f"{figureRoot}/Pro.AnnotationsVsOthers.all.pdf")

# Half violin of the "SignalVEH.+" column for each `dx` group.
violin_style = dict(palette=pal,
                    bw=.2,
                    cut=0.,
                    scale="area",
                    width=.6,
                    inner=None,
                    orient=ort)
ax = pt.half_violinplot(x=dx, y="SignalVEH.+", data=df, **violin_style)

# Individual points drawn underneath (zorder=0).
# NOTE(review): the points use `dy` while the violin above uses the literal
# "SignalVEH.+" column - confirm both refer to the same variable.
strip_style = dict(palette=pal,
                   edgecolor="white",
                   size=3,
                   jitter=1,
                   zorder=0,
                   orient=ort)
ax = sns.stripplot(x=dx, y=dy, data=df, **strip_style)
示例#13
0
    def plot_violins(df):
        """
        Plot error distributions of the 'test' and '2020' sets side by side.

        Rows with ``set == "train"`` and rows whose ``error`` is not below
        ``np.log10(10)`` are dropped first. Two panels are then drawn, the
        left for ``set == 'test'`` and the right for ``set == '2020'``. Each
        panel has ``type`` on the x axis and ``error`` on the y axis and is
        made of a half violin, a strip plot of at most 2000 randomly sampled
        rows, and an unfilled black box plot. The figure is written to
        ``figures/figure_8.pdf``.

        Args:
            df: DataFrame with ``set``, ``type`` and ``error`` columns. The
                x tick labels assume the two plotted types are 'baseline'
                and 'normalized', in that order.
        """
        df = df[df.set != "train"]
        df = df[df.error < np.log10(10)]

        dx = 'type'
        dy = 'error'
        pal = "tab10"
        ort = 'v'

        # Ticks sit at log10 of each factor; labels show the factor itself.
        yticks = [1, 1.2, 1.5, 2, 5, 10]
        # Upper bound on the number of points drawn by the strip plot.
        max_points = 2000

        def _draw_panel(position, set_name, title, show_y_labels):
            """Draw one panel for the rows of ``df`` whose set is ``set_name``."""
            plt.subplot(position)
            subset = df[df.set == set_name]

            ax = pt.half_violinplot(x=dx,
                                    y=dy,
                                    data=subset,
                                    palette=pal,
                                    bw=.1,
                                    cut=0.,
                                    scale="width",
                                    width=1.,
                                    inner=None,
                                    orient=ort,
                                    linewidth=0.8,
                                    offset=0.2)

            # DataFrame.sample raises ValueError when asked for more rows
            # than exist (without replacement), so cap at the subset size.
            points = subset.sample(min(max_points, len(subset)))
            ax = sns.stripplot(x=dx,
                               y=dy,
                               data=points,
                               palette=pal,
                               edgecolor="white",
                               size=1,
                               jitter=1,
                               zorder=1,
                               orient=ort,
                               alpha=0.5)

            ax = sns.boxplot(x=dx,
                             y=dy,
                             data=subset,
                             color="black",
                             width=.2,
                             zorder=10,
                             showcaps=True,
                             boxprops={
                                 'facecolor': 'none',
                                 "zorder": 10
                             },
                             showfliers=True,
                             whiskerprops={
                                 'linewidth': 1,
                                 "zorder": 10
                             },
                             saturation=1,
                             orient=ort,
                             fliersize=0,
                             linewidth=1)

            # Tick labels report 10**median(error) for each type.
            ax.set_xticklabels([
                "Baseline\nMedian={:.2f}$\\times$".format(
                    10**subset[subset.type == 'baseline'].error.median()),
                "Normalized\nMedian={:.2f}$\\times$".format(
                    10**subset[subset.type == 'normalized'].error.median())
            ])
            ax.set_xlabel("")
            plt.xlim(-0.8, 1.3)

            ax.set_yticks(np.log10(yticks))
            if show_y_labels:
                ax.set_yticklabels(
                    [r"{:.2f} $\times$".format(y) for y in yticks])
                plt.ylabel("Absolute Error")
            else:
                # Same tick positions, but no text on this panel.
                ax.set_yticklabels(["" for _ in yticks])
                plt.ylabel("")

            ax.set_title(title)

        plt.figure(figsize=(1.65 * 2, 1.4))
        plt.subplots_adjust(wspace=0.05, left=0, right=1)

        # Left panel carries the y-axis labels; the right one leaves them blank.
        _draw_panel(121, 'test', "Test set", show_y_labels=True)
        _draw_panel(122, '2020', "2020 set", show_y_labels=False)

        plt.savefig("figures/figure_8.pdf", dpi=600, bbox_inches='tight')