Example #1
0
def test_H_observed_EC2_variants():
    """Illustrate the variants of H_observed.

    Builds a planted-distribution graph, hides a fraction f of node labels,
    then prints the true powers of H0, the M_observed edge-count matrices,
    and every (EC, variant) combination of H_observed for comparison.
    """
    print(
        "\n\n-- test_H_observed_EC2_variants(): 'H_observed', 'M_observed', uses: 'planted_distribution_model_H' --"
    )

    # --- Parameters for graph
    n = 3000                  # number of nodes
    a = 1                     # relative weight of the first class in alpha0
    h = 8                     # homophily parameter for H0
    d = 2                     # out-degree
    k = 3                     # number of classes
    f = 0.2                   # fraction of label rows to remove
    distribution = 'uniform'

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability; None seeds from entropy
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=None,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, _ = replace_fraction_of_rows(X0, f, avoidNeighbors=False)

    # --- Print first rows of matrices
    distance = 3

    print("First rows of powers of H0:")
    for ell in range(1, distance + 1):  # renamed from 'k' to avoid shadowing the class count above
        print("{}: {}".format(ell, np.linalg.matrix_power(H0, ell)[0]))

    print("\nNumber of observed edges between labels (M_observed):")
    M = M_observed(W, X1, distance=distance, NB=True)
    print("M[0]:\n{}".format(M[0]))
    print("M[1]:\n{}".format(M[1]))  # fixed label: was printed as "M[2]" while showing M[1]

    for EC in [False, True]:
        for variant in [1, 2]:
            print("\nP (H observed): variant {} with EC={}".format(
                variant, EC))
            H_vec = H_observed(W,
                               X1,
                               distance=distance,
                               NB=EC,
                               variant=variant)
            for i, H in enumerate(H_vec):
                print("{}:\n{}".format(i, H))
def run(choice, variant, create_data=False, show_plot=False, create_pdf=False, show_pdf=False, append_data=False):
    """Main parameterized method to produce the scaling figures (timing of W powers vs. H_observed).

    Can be run from an external Jupyter notebook or another method to produce
    all figures, optionally as PDF.

    Parameters
    ----------
    choice : int
        Selects one saved experimental configuration (graph parameters,
        repeat counts, annotation positions). Unknown values raise Warning.
    variant : int
        Selects a different way to plot the chosen experiment.
    create_data : bool
        If True, run the timing experiments and append records to the CSV file.
    show_plot : bool
        If True, display the figure interactively.
    create_pdf : bool
        If True, save the figure as a PDF into figure_directory.
    show_pdf : bool
        If True, open the created PDF (assumes the PDF exists).
    append_data : bool
        Allows adding more data to an existing CSV; requires create_data to be True.
    """

    # %% -- Setup
    CREATE_DATA = create_data
    APPEND_DATA = append_data   # allows to add more data, requires CREATE_DATA to be true
    CHOICE = choice
    VARIANT = variant
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    BOTH = True  # show both figures for W and H
    SHOW_TITLE = True  # show parameters in title of plot
    f = 1  # fraction of labeled nodes for H estimation

    csv_filename = 'Fig_Scaling_Hrow_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_Scaling_Hrow_{}-{}.pdf'.format(CHOICE, VARIANT)

    plot_colors = ['darkorange', 'blue']
    # CSV schema: one row per timed operation
    header = ['currenttime',
              'choice',  # W, or H
              'l',
              'time']
    if CREATE_DATA and not APPEND_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=APPEND_DATA)
    RANDOMSEED = None  # set to a fixed int for repeatability; None seeds from entropy
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    # %% -- Default parameters
    n = 10000
    ymax = 10
    h = 3
    d = 10  # actual degree is double
    distribution = 'uniform'
    exponent = None

    # %% -- CHOICES and VARIANTS
    # Each repeat list gives the number of timing repetitions per path length l
    # (list index = l); entries of 0 skip that length entirely.
    if CHOICE == 1:
        W_repeat = [0, 0, 30, 5, 3, 1]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4.3
        W_annotate_y = 1
        H_annotate_x = 6
        H_annotate_y = 0.005

    elif CHOICE == 2:  # small exponent 3, does not show the advantage well
        d = 3
        W_repeat = [0, 0, 10, 5, 5, 5, 5, 5, 5]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 5
        W_annotate_y = 0.08
        H_annotate_x = 6.5
        H_annotate_y = 0.004

    elif CHOICE == 3:  # small exponent 2, does not show the advantage well
        d = 2
        W_repeat = [0, 0, 50, 50, 50, 50, 50, 50, 50]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 6.5
        W_annotate_y = 0.02
        H_annotate_x = 6.5
        H_annotate_y = 0.004

    elif CHOICE == 4:
        distribution = 'powerlaw'
        exponent = -0.5
        W_repeat = [0, 0, 50, 9, 5, 3]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4
        W_annotate_y = 1
        H_annotate_x = 6.5
        H_annotate_y = 0.006

        if VARIANT == 1:
            plot_colors = ['blue', 'darkorange']
            SHOW_TITLE = False

        if VARIANT == 2:
            plot_colors = ['blue', 'darkorange']
            BOTH = False
            SHOW_TITLE = False

    elif CHOICE == 5:
        distribution = 'powerlaw'
        exponent = -0.5
        W_repeat = [0, 0, 1, 1]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0] + [1] * 8
        W_annotate_x = 4
        W_annotate_y = 1
        H_annotate_x = 6.5
        H_annotate_y = 0.006

    elif CHOICE == 11:
        W_repeat = [0, 0, 1, 1, 0, 0]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4.3
        W_annotate_y = 1
        H_annotate_x = 6
        H_annotate_y = 0.005

    elif CHOICE == 12:
        W_repeat = [0, 0, 31, 11, 5, 3, 3, 3, 3]  # index starts with 0. useful only for W^2 and later
        H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]
        W_annotate_x = 4.3
        W_annotate_y = 2.5
        H_annotate_x = 5.5
        H_annotate_y = 0.004
        f = 0.1
        plot_colors = ['blue', 'darkorange']
        ymax = 100

        if VARIANT == 1:    # TODO: when trying to add additional data, then it creates 7 instead of 4 rows,
                            # but the same code idea of CREATE vs ADD data appears to work in Fig_MHE_Optimal_Lambda, for that to replicate run below
                            # run(12, 1, create_pdf=True, show_pdf=True, create_data=False, append_data=True)
            W_repeat = [0, 0, 0, 0, 0, 0, 0, 0, 0]  # index starts with 0. useful only for W^2 and later
            H_repeat = [0, 50, 50, 50, 50, 50, 50, 50, 50]

    else:
        # NOTE(review): raising Warning (an exception type normally used with
        # the warnings module) instead of ValueError is unusual — kept as-is
        # since callers may catch Warning.
        raise Warning("Incorrect choice!")

    # %% -- Create data
    if CREATE_DATA or APPEND_DATA:

        # Create graph
        k = 3
        a = 1
        alpha0 = np.array([a, 1., 1.])
        alpha0 = alpha0 / np.sum(alpha0)
        H0 = create_parameterized_H(k, h, symmetric=True)
        start = time.time()
        W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                             distribution=distribution,
                                             exponent=exponent,
                                             directed=False,
                                             debug=False)
        X0 = from_dictionary_beliefs(Xd)
        time_calc = time.time() - start
        # print("\nTime for graph:{}".format(time_calc))
        # print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W)))

        # Calculations W: time naive repeated sparse matrix multiplication W^l
        for length, rep in enumerate(W_repeat):

            for _ in range(rep):
                start = time.time()
                if length == 2:
                    result = W.dot(W)
                elif length == 3:
                    result = W.dot(W.dot(W))  # naive enumeration used as nothing can be faster
                elif length == 4:
                    result = W.dot(W.dot(W.dot(W)))
                elif length == 5:
                    result = W.dot(W.dot(W.dot(W.dot(W))))
                elif length == 6:
                    result = W.dot(W.dot(W.dot(W.dot(W.dot(W)))))
                elif length == 7:
                    result = W.dot(W.dot(W.dot(W.dot(W.dot(W.dot(W))))))
                elif length == 8:
                    result = W.dot(W.dot(W.dot(W.dot(W.dot(W.dot(W.dot(W)))))))
                elif length == 9:
                    result = W.dot(W.dot(W.dot(W.dot(W.dot(W.dot(W.dot(W.dot(W))))))))
                time_calc = time.time() - start

                # NOTE(review): local name 'tuple' shadows the builtin inside this loop
                tuple = [str(datetime.datetime.now())]
                text = ['W',
                        length,
                        time_calc]
                text = np.asarray(text)  # without np, entries get ugly format
                tuple.extend(text)
                # print("W, d: {}, time: {}".format(length, time_calc))
                save_csv_record(join(data_directory, csv_filename), tuple)

        # Calculations H_NB: time the non-backtracking H_observed estimation per path length
        for length, rep in enumerate(H_repeat):

            for _ in range(rep):
                X0 = from_dictionary_beliefs(Xd)
                X1, ind = replace_fraction_of_rows(X0, 1 - f)

                start = time.time()
                result = H_observed(W, X=X1, distance=length, NB=True, variant=1)
                time_calc = time.time() - start

                tuple = [str(datetime.datetime.now())]
                text = ['H',
                        length,
                        time_calc]
                text = np.asarray(text)  # without np, entries get ugly format
                tuple.extend(text)
                # print("H, d: {}, time: {}".format(length, time_calc))
                save_csv_record(join(data_directory, csv_filename), tuple)

        # Calculate and display M statistics
        for length, _ in enumerate(H_repeat):
            M = M_observed(W, X=X0, distance=length, NB=True)
            M = M[-1]
            s = np.sum(M)  # computed only for the (commented-out) debug print below
            # print("l: {}, sum: {:e}, M:\n{}".format(length, s, M))

    # %% -- Read, aggregate, and pivot data
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
    # NOTE(review): passing np.max/np.mean/... callables to .agg is deprecated
    # in newer pandas (use the string names instead) — confirm pandas version.
    df2 = df1.groupby(['choice', 'l']).agg \
        ({'time': [np.max, np.mean, np.median, np.min, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(30)))
    df3 = pd.pivot_table(df2, index=['l'], columns=['choice'], values='time_median', )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    #%% -- Setup figure
    mpl.rcParams['backend'] = 'pdf'
    mpl.rcParams['lines.linewidth'] = 3
    mpl.rcParams['font.size'] = 20
    mpl.rcParams['axes.labelsize'] = 20
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['figure.figsize'] = [4, 4]
    mpl.rcParams['xtick.major.pad'] = 6  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4
    fig = plt.figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

    #%% -- Draw the plot and annotate
    df4 = df3['H']
    # print("\n-- df4 (length {}):\n{}".format(len(df4.index), df4.head(30)))

    Y1 = df3['W'].plot(logy=True, color=plot_colors[0], marker='o', legend=None,
                       clip_on=False,  # cut off data points outside of plot area
                       # zorder=3
                       )  # style='o', kind='bar', style='o-',

    plt.annotate(r'$\mathbf{W}^\ell$',
                 xy=(W_annotate_x, W_annotate_y),
                 color=plot_colors[0],
                 )

    if BOTH:
        Y2 = df3['H'].plot(logy=True, color=plot_colors[1], marker='o', legend=None,
                           clip_on=False,  # cut off data points outside of plot area
                           zorder=3
                           )  # style='o', kind='bar', style='o-',

        plt.annotate(r'$\mathbf{\hat P}_{\mathrm{NB}}^{(\ell)}$',
                     xy=(H_annotate_x, H_annotate_y),
                     color=plot_colors[1],
                     )
    if SHOW_TITLE:
        plt.title(r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}$'.format(int(n / 1000), 2 * d, h, f))

    # %% -- Figure settings & plot
    # NOTE(review): the 'b' kwarg of plt.grid was renamed to 'visible' in
    # matplotlib 3.5 — confirm the pinned matplotlib version.
    plt.grid(b=True, which='both', alpha=0.2, linestyle='solid', axis='y', linewidth=0.5)  # linestyle='dashed', which='minor'
    plt.xlabel(r'Path length ($\ell$)', labelpad=0)
    plt.ylabel(r'$\!$Time [sec]', labelpad=1)
    plt.ylim(0.001, ymax)  # placed after yticks
    plt.xticks(range(1, 9))

    if SHOW_PLOT:
        plt.show()
    if CREATE_PDF:
        plt.savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    # frameon=None
                    )
    if SHOW_PDF:
        # os.system('{} "'.format(open_cmd[sys.platform]) + join(figure_directory, fig_filename) + '"')       # shows actually created PDF
        showfig(join(figure_directory, fig_filename))  # shows actually created PDF       # TODO replace with this method
Example #3
0
def run(choice,
        variant,
        create_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    """Main parameterized method to produce the backtracking-advantage figures.

    Can be run from an external Jupyter notebook or another method to produce
    all figures in PDF.

    Parameters
    ----------
    choice : int
        Selects one saved experimental configuration (graph parameters, repeat
        count, axis limits). Unknown values raise Warning.
    variant : int
        Selects a different way to plot the chosen experiment (colors, labels,
        which bar series are shown).
    create_data : bool
        If True, run the experiments and write records to the CSV file.
    show_plot : bool
        If True, display the figure interactively.
    create_pdf : bool
        If True, save the figure as a PDF into figure_directory.
    show_pdf : bool
        If True, open the created PDF (assumes the PDF exists).
    """

    # %% -- Setup
    CREATE_DATA = create_data
    CHOICE = choice
    VARIANT = variant
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    SHOW_TITLE = True
    LEGEND_MATCH_COLORS = False
    SHOW_DISTRIBUTION_IN_TITLE = True

    SHOW_BACKTRACK_ESTIMATE = True
    SHOW_NONBACKTRACK_ESTIMATE = True
    plot_colors = ['darkgreen', 'darkorange', 'blue']
    # Legend labels for: true H powers, naive estimate, non-backtracking estimate
    label_vec = [
        r'$\mathbf{H}^{\ell}\,\,\,\,$', r'$\mathbf{\hat P}^{(\ell)}$',
        r'$\mathbf{\hat P}_{\mathrm{NB}}^{(\ell)}$'
    ]

    csv_filename = 'Fig_Backtracking_Advantage_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_Backtracking_Advantage_{}-{}.pdf'.format(
        CHOICE, VARIANT)

    # CSV schema: one row per (choice, path length) measurement
    header = [
        'currenttime',
        'choice',  # H, Hrow, HrowEC
        'l',
        'valueH',  # maximal values in first row of H
        'valueM'
    ]  # average value across entries in M
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # %% -- Default parameters
    ymin = 0.3
    ymax = 1
    exponent = None

    # %% -- CHOICES and VARIANTS
    if CHOICE == 1:  # n=1000, shows NB to be slight lower for l=2: probably due to sampling issues (d=3, thus very few points available)
        n = 1000
        h = 8
        d = 3
        f = 0.1
        distribution = 'uniform'
        rep = 10000
        length = 8

    elif CHOICE == 2:
        n = 1000
        h = 8
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 10000
        length = 8

    elif CHOICE == 3:  # nice: shows nicely that difference is even bigger for smaller h
        n = 1000
        h = 3
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 10000
        length = 8
        ymax = 0.8

    elif CHOICE == 4:
        n = 10000
        h = 3
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 100
        length = 8
        ymin = 0.333
        ymax = 0.65

    elif CHOICE == 5:
        n = 10000
        h = 3
        d = 3
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8

    elif CHOICE == 6:  # n=1000, the powerlaw problem with small graphs and high exponent
        n = 1000
        h = 8
        d = 3
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 10000
        length = 8

    elif CHOICE == 7:
        n = 10000
        h = 8
        d = 3
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8
        # ymin = 0.4
        ymax = 1

    elif CHOICE == 8:
        n = 10000
        h = 8
        d = 10
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8
        # ymin = 0.4
        ymax = 1

    elif CHOICE == 9:  # shows lower NB due to problem with sampling from high powerlaw -0.5
        n = 10000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 1000
        length = 8

    elif CHOICE == 10:
        n = 10000
        h = 8
        d = 3
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 1000
        length = 8

    elif CHOICE == 11:  # problem: shows that NB is too low (probably because of problem with sampling from -0.5 factor)
        n = 1000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.5
        rep = 1000
        length = 8

    elif CHOICE == 12:  # problem: shows no problem with NB (probably because no problem with sampling from -0.2 factor)
        n = 1000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.2
        rep = 1000
        length = 8

    elif CHOICE == 20:
        n = 10000
        h = 3
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.3
        rep = 1000
        length = 8
        ymin = 0.333
        ymax = 0.65

    elif CHOICE == 21:  # originally used before color change
        n = 10000
        h = 3
        d = 25
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.3
        rep = 1000
        length = 8
        ymin = 0.333
        ymax = 0.65

        if VARIANT == 1:
            SHOW_TITLE = False
            plot_colors = ['red', 'blue', 'darkorange']
            label_vec = [r'$\mathbf{H}^{\ell}\quad\quad$', 'naive', 'better']
            LEGEND_MATCH_COLORS = True

        if VARIANT == 2:
            SHOW_TITLE = False
            plot_colors = ['red', 'blue', 'darkorange']
            label_vec = [r'$\mathbf{H}^{\ell}\quad\quad$', 'naive', 'better']
            SHOW_NONBACKTRACK_ESTIMATE = False
            LEGEND_MATCH_COLORS = True

        if VARIANT == 3:
            SHOW_TITLE = False
            plot_colors = ['red', 'blue', 'darkorange']
            label_vec = [r'$\mathbf{H}^{\ell}\quad\quad$', 'naive', 'better']
            SHOW_BACKTRACK_ESTIMATE = False
            SHOW_NONBACKTRACK_ESTIMATE = False
            LEGEND_MATCH_COLORS = True

        if VARIANT == 4:
            plot_colors = ['red', 'blue', 'darkorange']
            LEGEND_MATCH_COLORS = True

    elif CHOICE == 25:
        n = 10000
        h = 8
        d = 5
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8

    elif CHOICE == 26:
        n = 10000
        h = 8
        d = 25
        f = 0.1
        distribution = 'uniform'
        rep = 1000
        length = 8
        ymax = 0.9
        ymin = 0.4

    elif CHOICE == 27:
        n = 10000
        h = 8
        d = 10
        f = 0.1
        distribution = 'powerlaw'
        exponent = -0.3
        rep = 1000
        length = 8
        ymax = 0.9
        ymin = 0.33

    elif CHOICE == 31:
        n = 10000
        h = 3
        d = 10
        f = 0.1
        distribution = 'uniform'
        length = 8
        ymin = 0.333
        ymax = 0.65
        SHOW_DISTRIBUTION_IN_TITLE = False
        plot_colors = ['red', 'blue', 'darkorange']
        LEGEND_MATCH_COLORS = True

        if VARIANT == 0:
            rep = 1000

        if VARIANT == 1:
            rep = 20

    else:
        # NOTE(review): raising Warning (an exception type normally used with
        # the warnings module) instead of ValueError is unusual — kept as-is
        # since callers may catch Warning.
        raise Warning("Incorrect choice!")

    k = 3
    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # set to a fixed int for repeatability; None seeds from entropy
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))

    # %% -- Create data
    if CREATE_DATA:

        # Calculations H: ground-truth values from the true powers of H0
        print("Max entry of first rows of powers of H0:")
        for l in range(1, length + 1):
            valueH = np.max(np.linalg.matrix_power(H0, l)[0])

            # NOTE(review): local name 'tuple' shadows the builtin in this loop
            tuple = [str(datetime.datetime.now())]
            text = ['H', l, valueH, '']
            text = np.asarray(text)  # without np, entries get ugly format
            tuple.extend(text)
            print("{}: {}".format(l, valueH))
            save_csv_record(join(data_directory, csv_filename), tuple)

        # Calculations Hrow and HrowEC: repeated experiments on fresh random graphs
        for r in range(rep):
            print('Repetition {}'.format(r))

            # Create graph
            start = time.time()
            W, Xd = planted_distribution_model_H(
                n,
                alpha=alpha0,
                H=H0,
                d_out=
                d,  # notice that for undirected graphs, actual degree = 2*d
                distribution=distribution,
                exponent=exponent,
                directed=False,
                debug=False)
            X0 = from_dictionary_beliefs(Xd)
            X1, ind = replace_fraction_of_rows(X0, 1 - f)
            time_calc = time.time() - start
            # print("\nTime for graph:{}".format(time_calc))

            print("Average outdegree: {}".format(
                calculate_average_outdegree_from_graph(W)))

            # Calculate H_vec and M_vec versions (M_vec to calculate the average number of entries in M)
            H_vec = H_observed(W, X1, distance=length, NB=False, variant=1)
            H_vec_EC = H_observed(W, X1, distance=length, NB=True, variant=1)
            M_vec = M_observed(W, X1, distance=length, NB=False)
            M_vec_EC = M_observed(W, X1, distance=length, NB=True)

            # Calculation H_vec
            # print("Max entry of first rows of H_vec")
            for l, H in enumerate(H_vec):
                valueH = H[0][
                    (l + 1) %
                    2]  # better than 'value = np.max(H[0])', otherwise sometimes chooses another higher entry -> biased estimate
                valueM = np.average(M_vec[l + 1])
                # print(M_vec[l+1])
                # print(valueM)

                tuple = [str(datetime.datetime.now())]
                text = ['Hrow', l + 1, valueH, valueM]
                text = np.asarray(text)  # without np, entries get ugly format
                tuple.extend(text)
                # print("{}: {}".format(l + 1, value))
                save_csv_record(join(data_directory, csv_filename), tuple)

            # Calculation H_vec_EC
            # print("Max entry of first rows of H_vec_EC")
            for l, H in enumerate(H_vec_EC):
                valueH = H[0][(l + 1) % 2]
                valueM = np.average(M_vec_EC[l + 1])
                # print(M_vec_EC[l+1])
                # print(valueM)

                tuple = [str(datetime.datetime.now())]
                text = ['HrowEC', l + 1, valueH, valueM]
                text = np.asarray(text)  # without np, entries get ugly format
                tuple.extend(text)
                # print("{}: {}".format(l + 1, value))
                save_csv_record(join(data_directory, csv_filename), tuple)

    #%% -- Read, aggregate, and pivot data
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(15)))
    # NOTE(review): passing np.mean/np.std/... callables to .agg is deprecated
    # in newer pandas (use the string names instead) — confirm pandas version.
    df2 = df1.groupby(['choice', 'l']).agg \
        ({'valueH': [np.mean, np.std, np.size],  # Multiple Aggregates
          'valueM': [np.mean],
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values
                   ]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'valueH_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(30)))
    df3 = pd.pivot_table(df2,
                         index=['l'],
                         columns=['choice'],
                         values=['valueH_mean', 'valueH_std',
                                 'valueM_mean'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values
                   ]  # flatten the column hierarchy
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    # df3.drop(['valueM_mean_H', 'valueH_std_H'], axis=1, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.reset_index(level=0, inplace=True)  # get l into columns
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    #%% -- Setup figure
    mpl.rcParams['backend'] = 'pdf'
    mpl.rcParams['lines.linewidth'] = 3
    mpl.rcParams['font.size'] = 16
    mpl.rcParams['axes.labelsize'] = 20
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['legend.fontsize'] = 20
    mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['figure.figsize'] = [4, 4]
    mpl.rcParams['xtick.major.pad'] = 4  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 4  # padding of tick labels: default = 4

    fig = plt.figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

    #%% -- Extract values into columns (plotting dataframew with bars plus error lines and lines gave troubles)
    l_vec = df3['l'].values  # .tolist() does not work with bar plot
    mean_H_vec = df3['valueH_mean_H'].values
    mean_Hrow_vec = df3['valueH_mean_Hrow'].values
    mean_Hrow_vecEC = df3['valueH_mean_HrowEC'].values
    std_Hrow_vec = df3['valueH_std_Hrow'].values
    std_Hrow_vecEC = df3['valueH_std_HrowEC'].values

    #%% -- Draw the plot and annotate
    width = 0.3  # the width of the bars
    if SHOW_BACKTRACK_ESTIMATE:
        left_vec = l_vec
        if SHOW_NONBACKTRACK_ESTIMATE:
            left_vec = left_vec - width  # shift left so both bar series fit side by side
        bar1 = ax.bar(
            left_vec,
            mean_Hrow_vec,
            width,
            color=plot_colors[1],
            yerr=std_Hrow_vec,
            error_kw={
                'ecolor': 'black',
                'linewidth': 2
            },  # error-bars colour
            label=label_vec[1])
    if SHOW_NONBACKTRACK_ESTIMATE:
        bar2 = ax.bar(
            l_vec,
            mean_Hrow_vecEC,
            width,
            color=plot_colors[2],
            yerr=std_Hrow_vecEC,
            error_kw={
                'ecolor': 'black',
                'linewidth': 2
            },  # error-bars colour
            label=label_vec[2])
    gt = ax.plot(l_vec,
                 mean_H_vec,
                 color=plot_colors[0],
                 linestyle='solid',
                 linewidth=2,
                 marker='o',
                 markersize=10,
                 markeredgewidth=2,
                 markerfacecolor='None',
                 markeredgecolor=plot_colors[0],
                 label=label_vec[0])

    if CHOICE == 4 or CHOICE == 20:
        ax.annotate(
            np.round(mean_Hrow_vec[1], 2),
            xy=(2.15, 0.65),
            xytext=(2.1, 0.60),
            arrowprops=dict(facecolor='black', arrowstyle="->"),
        )

    #%% -- Legend
    if distribution == 'uniform' and SHOW_DISTRIBUTION_IN_TITLE:
        distribution_label = ',$uniform'
    else:
        distribution_label = '$'
    if SHOW_TITLE:
        plt.title(
            r'$\!\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.
            format(int(n / 1000), 2 * d, h, f, distribution_label
                   ))  # notice that actual d is double than in one direction

    handles, labels = ax.get_legend_handles_labels()
    legend = plt.legend(
        handles,
        labels,
        loc='upper right',
        handlelength=1.5,
        labelspacing=0,  # distance between label entries
        handletextpad=0.3,  # distance between label and the line representation
        # title='Iterations'
        borderaxespad=0.1,  # distance between legend and the outer axes
        borderpad=0.1,  # padding inside legend box
        numpoints=1,  # put the marker only once
    )

    if LEGEND_MATCH_COLORS:  # TODO: how to get back the nicer line spacing defined in legend above after changing the legend text colors
        legend.get_texts()[0].set_color(plot_colors[0])
        if SHOW_BACKTRACK_ESTIMATE:
            legend.get_texts()[1].set_color(plot_colors[1])
        if SHOW_NONBACKTRACK_ESTIMATE:
            legend.get_texts()[2].set_color(plot_colors[2])

    frame = legend.get_frame()
    frame.set_linewidth(0.0)
    frame.set_alpha(0.8)  # 0.8

    # %% -- Figure settings & plot
    ax.set_xticks(range(10))
    # NOTE(review): the 'b' kwarg of plt.grid was renamed to 'visible' in
    # matplotlib 3.5 — confirm the pinned matplotlib version.
    plt.grid(b=True,
             which='both',
             alpha=0.2,
             linestyle='solid',
             axis='y',
             linewidth=0.5)  # linestyle='dashed', which='minor'
    plt.xlabel(r'Path length ($\ell$)', labelpad=0)
    plt.ylim(ymin, ymax)  # placed after yticks
    plt.xlim(0.5, 5.5)
    plt.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=
        'off',  # ticks along the bottom edge are off        TODO: Paul, this does not work anymore :(    1/26/2020
        top='off',  # ticks along the top edge are off
        # labelbottom='off',    # labels along the bottom edge are off
    )

    if CREATE_PDF:
        plt.savefig(
            join(figure_directory, fig_filename),
            format='pdf',
            dpi=None,
            edgecolor='w',
            orientation='portrait',
            transparent=False,
            bbox_inches='tight',
            pad_inches=0.05,
            # frameon=None
        )
    if SHOW_PDF:
        showfig(join(figure_directory, fig_filename))
    if SHOW_PLOT:
        plt.show()
Example #4
0
def test_M_observed():
    """Illustrate M_observed: non-backtracking (NB) or not.

    Also shows that W^2 is denser for powerlaw graphs than uniform.
    Builds a planted-distribution graph and prints the entry sums and first
    rows of the M_observed path-count matrices, with and without NB.
    """
    print(
        "\n-- test_M_observed(): 'M_observed', uses: 'planted_distribution_model_H' --"
    )

    # --- Parameters for graph
    n = 3000
    a = 1
    h = 8
    d = 10  # variant 2
    d = 2  # variant 1 (deliberately overrides the line above; swap to use variant 2)
    k = 3
    distribution = 'powerlaw'  # variant 2
    distribution = 'uniform'  # variant 1 (deliberately overrides the line above)
    exponent = -0.5

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)

    # --- Create graph
    RANDOMSEED = None  # set to a fixed int for repeatability; None seeds from entropy
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)

    # --- Print results
    distance = 8

    M_vec = M_observed(W, X0, distance=distance, NB=False)
    M_vec_EC = M_observed(W, X0, distance=distance, NB=True)

    print("Graph with n={} nodes and uniform d={} degrees".format(n, d))
    print("\nSum of entries and first rows of M_vec (without NB)")
    for i, M in enumerate(M_vec):  # fixed comment: all entries are printed, nothing is skipped
        print("{}: {}, {}".format(i, np.sum(M), M[0]))

    print("\nSum of entries and first rows of M_vec (with NB)")
    for i, M in enumerate(M_vec_EC):
        print("{}: {}, {}".format(i, np.sum(M), M[0]))

    SHOW_FULL_MATRICES = True  # named flag replaces the former dead 'if True:' debug toggle
    if SHOW_FULL_MATRICES:
        print("\nFull matrices:")
        print("M_vec")
        for i, M in enumerate(M_vec):
            print("{}: \n{}".format(i, M))

        print("\nM_vec_EC")
        for i, M in enumerate(M_vec_EC):
            print("{}: \n{}".format(i, M))
Example #5
0
def test_gradient_optimization2():
    """Compare estimateH with and without the analytic gradient.

    Builds a planted-distribution graph (n=10000, k=7 classes, powerlaw
    degrees), prints the observed neighbor statistics M/H up to distance 5,
    then estimates H at distance 1 twice -- numerically and with the
    analytic gradient -- reporting time, energy, and the gradient at the
    second estimate.  Only prints; returns nothing.
    """
    print(
        "\n-- 'estimateH, define_gradient_energy_H, define_energy_H, uses: planted_distribution_model_H, H_observed, M_observed, --"
    )

    # --- Parameters for graph
    n = 10000
    a = 1          # alpha skew parameter
    h = 2          # homophily parameter for H0
    d = 10         # average out-degree
    k = 7          # number of classes
    distribution = 'powerlaw'
    exponent = -0.3

    np.set_printoptions(precision=4)

    alpha0 = create_parameterized_alpha(k, a)
    H0 = create_parameterized_H(k, h, symmetric=True)
    f = 0.02  # fraction of nodes that keep their labels
    print("Graph n={}, d={}, f={}".format(n, d, f))
    print("H0:\n{}".format(H0))

    # --- Create graph
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)  # keep only fraction f labeled

    # --- M_vec, H_vec statistics
    distance = 5

    print("M_vec:")
    M_vec = M_observed(W, X1, distance=distance)
    for i, M in enumerate(M_vec):
        print("{}:\n{}".format(i, M))

    print("\nH_vec_observed:")
    H_vec = H_observed(W, X1, distance=distance)
    for i, H in enumerate(H_vec):
        print("{}:\n{}".format(i, H))

    # --- estimate_H based on distance 1 and uninformative point
    distance = 1
    weights = [1, 0, 0, 0, 0]  # weight only the direct-neighbor statistics
    print(
        "\n= Estimate H based on X1 and distance={} from uninformative point:".
        format(distance))
    # NOTE(review): the original computed an uninformative start vector
    # h0 = np.ones(k*(k-1)/2) * (1/k) here but never passed it on
    # (presumably intended as initial_H0= for estimateH); the unused
    # local was removed -- confirm against estimateH's signature.
    energy_H = define_energy_H(H_vec_observed=H_vec,
                               weights=weights,
                               distance=distance)
    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)

    # Estimation without the analytic gradient (numerical differentiation)
    start = time.time()
    H1 = estimateH(X1, W, distance=distance, weights=weights, gradient=False)
    time_est = time.time() - start
    print("Estimated H without gradient:\n{}".format(H1))
    print("Time :{}".format(time_est))
    e = energy_H(H1)
    print("Energy at estimated point: {}".format(e))

    # Estimation with the analytic gradient
    start = time.time()
    H2 = estimateH(X1, W, distance=distance, weights=weights, gradient=True)
    time_est = time.time() - start
    print("Estimated H with gradient:\n{}".format(H2))
    print("Time :{}".format(time_est))
    e = energy_H(H2)
    print("Energy at estimated point: {}".format(e))

    # Gradient (matrix and projected free-parameter vector) at the estimate
    G = gradient_energy_H(H2)
    h = derivative_H_to_h(G)
    print("Gradient matrix at estimated point:\n{}".format(G))
    print("Gradient vector at estimated point:\n{}".format(h))
# Example #6  (scraped separator, original: "Пример #6")
# 0
def test_gradient():
    """Evaluate energy and gradient of the H-estimation objective at sample points.

    Builds a planted-distribution graph, prints observed M/H statistics up
    to distance 5, then evaluates energy / gradient / projected gradient at
    three example points, first for distance 1, then for distance 5.
    Only prints; returns nothing.
    """
    print(
        "\n-- 'define_gradient_energy_H, define_energy_H, uses: planted_distribution_model_H, H_observed, M_observed, --"
    )

    # --- Parameters for graph
    n = 1000
    a = 1          # alpha skew parameter
    h = 8          # homophily parameter for H0
    d = 25         # average out-degree
    k = 3          # number of classes
    distribution = 'powerlaw'
    exponent = -0.3

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    f = 0.5  # fraction of nodes that keep their labels
    print("Graph n={}, d={}, f={}".format(n, d, f))
    print("H0:\n{}\n".format(H0))

    # --- Create graph
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)  # keep only fraction f labeled

    # --- M_vec, H_vec statistics
    distance = 5

    print("M_vec:")
    M_vec = M_observed(W, X1, distance=distance)
    for i, M in enumerate(M_vec):
        print("{}:\n{}".format(i, M))

    print("H_vec:")
    H_vec = H_observed(W, X1, distance=distance)
    for i, H in enumerate(H_vec):
        print("{}:\n{}".format(i, H))

    # --- Gradient at multiple points for distance 1
    print("\n=Defining the gradient function with distance 1")
    distance = 1
    weights = [1, 0, 0, 0, 0]  # weight only the direct-neighbor statistics
    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    energy_H = define_energy_H(weights=weights,
                               distance=distance,
                               H_vec_observed=H_vec)

    H_actual = H_vec[0]
    print(
        "1st example point: H_actual (row-stochastic frequencies of neighbors):\n{}"
        .format(H_actual))
    e = energy_H(H_actual)
    g = gradient_energy_H(H_actual)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point = transform_hToH(np.array([0.2, 0.6, 0.2]), 3)
    print("\n2nd example point: H_point:\n{}".format(H_point))
    e = energy_H(H_point)
    g = gradient_energy_H(H_point)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    # A step against the gradient should decrease the energy
    H_point2 = H_point - 0.45 * g
    print(
        "\n3rd example point in opposite direction of gradient: H_point2=H_point-0.45*gradient:\n{}"
        .format(H_point2))
    e = energy_H(H_point2)
    g = gradient_energy_H(H_point2)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    # --- Gradient at multiple points for distance 5
    distance = 5
    weights = [0, 0, 0, 0, 1]  # weight only the distance-5 statistics
    print("\n= Defining the gradient function with distance={} and weights={}".
          format(distance, weights))
    gradient_energy_H = define_gradient_energy_H(H_vec_observed=H_vec,
                                                 weights=weights,
                                                 distance=distance)
    # BUGFIX: was distance=1, inconsistent with the matching gradient above
    # (the energy would have been evaluated for the wrong distance).
    energy_H = define_energy_H(weights=weights,
                               distance=distance,
                               H_vec_observed=H_vec)

    H_actual = H_vec[0]
    print("1st point: H_actual:\n{}".format(H_actual))
    e = energy_H(H_actual)
    g = gradient_energy_H(H_actual)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point = transform_hToH(np.array([0.2, 0.6, 0.2]), 3)
    print("\n2nd point: H_point:\n{}".format(H_point))
    e = energy_H(H_point)
    g = gradient_energy_H(H_point)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)

    H_point2 = H_point - 1.5 * g
    print(
        "\n3rd point in opposite direction of gradient: H_point2:\n{}".format(
            H_point2))
    e = energy_H(H_point2)
    g = gradient_energy_H(H_point2)
    h = derivative_H_to_h(g)
    print("energy: ", e)
    print("gradient:\n{}".format(g))
    print("projected gradient: ", h)
# Example #7  (scraped separator, original: "Пример #7")
# 0
def test_estimate_synthetic():
    """Exercise the H-estimation methods (MHE, DHE, LHE, holdout baseline).

    Builds a planted-distribution graph, prints neighbor statistics, then
    estimates H with each method/variant on the fully labeled graph X0 and
    on a sparsely labeled X1, timing each.  Only prints; returns nothing.
    """
    print(
        "\n\n-- test_estimate_synthetic(): 'estimateH', uses: 'M_observed', 'planted_distribution_model_H', --"
    )

    # --- Parameters for graph
    n = 1000
    a = 1          # alpha skew parameter
    h = 8          # homophily parameter for H0
    d = 25         # average out-degree
    k = 3          # number of classes
    distribution = 'powerlaw'
    exponent = -0.3
    f = 0.05  # fraction of nodes that keep their labels
    print("n={}, a={},d={}, f={}".format(n, a, d, f))

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    print("H0:\n{}".format(H0))

    # --- Create graph
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed

    W, Xd = planted_distribution_model_H(n,
                                         alpha=alpha0,
                                         H=H0,
                                         d_out=d,
                                         distribution=distribution,
                                         exponent=exponent,
                                         directed=False,
                                         debug=False)
    X0 = from_dictionary_beliefs(Xd)
    X1, ind = replace_fraction_of_rows(X0, 1 - f)  # keep only fraction f labeled

    # --- Print some neighbor statistics
    M_vec = M_observed(W, X0, distance=3, NB=True)
    print("\nNeighbor statistics in fully labeled graph:")
    print("M^(1): direct neighbors:\n{}".format(M_vec[1]))
    print("M^(2): distance-2 neighbors:\n{}".format(M_vec[2]))
    print("M^(3): distance-3 neighbors:\n{}".format(M_vec[3]))

    # --- MHE ---
    print("\nMHE: Estimate H based on X0 (fully labeled graph):")
    start = time.time()
    H1 = estimateH(X0, W, method='MHE', variant=1)
    H2 = estimateH(X0, W, method='MHE', variant=2)
    H3 = estimateH(X0, W, method='MHE', variant=3)
    time_est = time.time() - start
    print("Estimated H based on X0 (MHE), variant 1:\n{}".format(H1))
    print("Estimated H based on X0 (MHE), variant 2:\n{}".format(H2))
    print("Estimated H based on X0 (MHE), variant 3:\n{}".format(H3))
    print("Time for all three variants:{}".format(time_est))

    print("\nMHE: Estimate H based on X1 with f={}:".format(f))
    start = time.time()
    H1 = estimateH(X1, W, method='MHE', variant=1)
    H2 = estimateH(X1, W, method='MHE', variant=2)
    H3 = estimateH(X1, W, method='MHE', variant=3)
    time_est = time.time() - start
    print("Estimated H based on X1 (MHE), variant 1:\n{}".format(H1))
    print("Estimated H based on X1 (MHE), variant 2:\n{}".format(H2))
    print("Estimated H based on X1 (MHE), variant 3:\n{}".format(H3))
    print("Time for all three variants:{}".format(time_est))

    # BUGFIX: the format string below was never given its argument, so the
    # literal "{}" was printed instead of the value of f.
    print(
        "\nMHE, variant=1: Estimate H based on X1 with f={}, but with initial correct vector:"
        .format(f))
    weight = [0, 0, 0, 0, 0]  # ignored for MHE
    initial_h0 = [0.1, 0.8, 0.1]
    H5 = estimateH(X1, W, method='MHE', weights=weight)
    H5_r = estimateH(X1, W, method='MHE', weights=weight, randomize=True)
    H5_i = estimateH(X1,
                     W,
                     method='MHE',
                     weights=weight,
                     initial_H0=transform_hToH(initial_h0, 3))
    print("Estimated H based on X5 only (MHE): \n{}".format(H5))
    print("Estimated H based on X5 only (MHE), randomize:\n{}".format(H5_r))
    print("Estimated H based on X5 only (MHE), initial=GT:\n{}".format(H5_i))

    # --- DHE ---
    print("\nDHE: Estimate H based on X1 with f={}:".format(f))
    start = time.time()
    H1 = estimateH(X1, W, method='DHE', variant=1, distance=1)
    H2 = estimateH(X1, W, method='DHE', variant=2, distance=1)
    H3 = estimateH(X1, W, method='DHE', variant=3, distance=1)
    time_est = time.time() - start
    print(
        "Estimated H based on X1 (DHE, distance=1), variant 1:\n{}".format(H1))
    print(
        "Estimated H based on X1 (DHE, distance=1), variant 2:\n{}".format(H2))
    print(
        "Estimated H based on X1 (DHE, distance=1), variant 3:\n{}".format(H3))
    print("Time for all three variants:{}".format(time_est))

    # --- LHE ---
    print("\nLHE: Estimate H based on X1 with f={}:".format(f))
    start = time.time()
    H1 = estimateH(X1, W, method='LHE')
    time_est = time.time() - start
    print("Estimated H based on X1 (LHE):\n{}".format(H1))
    print("Time for LHE:{}".format(time_est))

    # --- Baseline holdout method ---
    f2 = 0.5  # the holdout baseline needs a larger labeled fraction
    X2, ind2 = replace_fraction_of_rows(X0, 1 - f2)
    print("\nHoldout method: Estimate H based on X2 with f={}):".format(f2))
    start = time.time()
    H2 = estimateH_baseline_serial(X2=X2,
                                   ind=ind2,
                                   W=W,
                                   numberOfSplits=1,
                                   numMax=10)
    time_est = time.time() - start
    print("Estimated H based on X2 (Holdout method) with f={}:\n{}".format(
        f2, H2))
    print("Time for Holdout method:{}".format(
        time_est))  # TODO: result suggests this method does not work?
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False,
        show_fig=True):
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_PDF = show_pdf
    SHOW_FIG1 = show_fig
    SHOW_FIG2 = show_fig

    csv_filename = 'Fig_MHE_Optimal_ScalingFactor_f_lambda10_{}.csv'.format(
        CHOICE)
    header = [
        'currenttime',
        'option',  # one option corresponds to one choice of weight vector. In practice, one choice of scaling factor (for weight vector)
        'f',
        'scaling',
        'diff'
    ]  # L2 norm between H and estimate
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # -- Default Graph parameters
    rep = 100
    randomize = False
    initial_h0 = None  # initial vector to start finding optimal H
    distribution = 'powerlaw'
    exponent = -0.3
    rep_differentGraphs = 1
    EC = True
    f_vec = [0.9 * pow(0.1, 1 / 12)**x for x in range(42)]
    fraction_of_minimum = 1.1  # scaling parameters that lead to optimum except for this scaling factor are included
    ymin2 = 0.28
    ymax2 = 500
    xmin = 0.001
    # xmin = 0.0005
    xmax = None
    xtick_lab = [0.001, 0.01, 0.1, 1]
    # ytick_lab1 = np.arange(0, 1, 0.1)
    ytick_lab2 = [0.3, 1, 10, 100, 1000]
    ymax1 = 1.2
    ymin1 = 0.001
    # ytick_lab1 = [0.001, 0.01, 0.1, 1]
    k = 3
    a = 1
    stratified = True
    gradient = False
    n = 10000
    # color_vec = ['blue', 'orange', 'red']
    color_vec = ["#4C72B0", "#55A868", "#C44E52", "#CCB974", "#64B5CD"]
    color_vec = ["#4C72B0", "#8172B2", "#C44E52"]
    # label_vec = [r'$\tilde {\mathbf{H}}$', r'$\tilde{\mathbf{H}}^{(5)}_{\mathrm{NB}}$', r'$\tilde {\mathbf{H}}^{(5)}_{\mathrm{NB}}$ r']
    label_vec = ['MCE', 'DCE', 'DCEr']
    marker_vec = ['s', 'x', 'o']
    legendPosition = 'upper right'

    # -- Options
    if CHOICE == 11:
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 12:
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 13:
        h = 8
        d = 10
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 14:
        h = 3
        d = 10
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 15:
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    # elif CHOICE == 16:
    #     n = 10000
    #     h = 3
    #     d = 10
    #     option_vec = ['opt1', 'opt2', 'opt3']
    #     scaling_vec = [0, 50, 50]
    #     randomize_vec = [False, False, True]
    #     length_vec = [1, 5, 5]

    elif CHOICE == 17:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 18:
        n = 1000
        h = 3
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 10]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    # -- Options
    elif CHOICE == 19:
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]

    elif CHOICE == 20:
        h = 8
        d = 25
        option_vec = ['opt1', 'opt2', 'opt3']
        scaling_vec = [0, 10, 100]
        randomize_vec = [False, False, True]
        length_vec = [1, 5, 5]
        gradient = True
        legendPosition = 'center right'

    else:
        raise Warning("Incorrect choice!")

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(
        seed=RANDOMSEED
    )  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for rs in range(1, rep_differentGraphs + 1):
            # print('Graph {}'.format(rs))

            # -- Create graph
            W, Xd = planted_distribution_model_H(n,
                                                 alpha=alpha0,
                                                 H=H0,
                                                 d_out=d,
                                                 distribution=distribution,
                                                 exponent=exponent,
                                                 directed=False,
                                                 debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for r in range(1, rep + 1):
                # print('Repetition {}'.format(r))

                for f in f_vec:
                    # -- Sample labeled data
                    X1, ind = replace_fraction_of_rows(X0,
                                                       1 - f,
                                                       stratified=stratified)

                    # -- Calculate number of labeled neighbors
                    M_vec = M_observed(W, X1, distance=5, NB=True)
                    M = M_vec[1]
                    num_N = np.sum(M)
                    # print("f={:1.4f}, number labeled neighbors={}".format(f, num_N))
                    # print("M_vec:\n{}".format(M_vec))

                    # -- Create estimates and compare against GT
                    for option, scaling, randomize, length in zip(
                            option_vec, scaling_vec, randomize_vec,
                            length_vec):
                        H_est = estimateH(X1,
                                          W,
                                          method='DHE',
                                          variant=1,
                                          distance=length,
                                          EC=EC,
                                          weights=scaling,
                                          randomize=randomize,
                                          initial_H0=initial_h0,
                                          gradient=gradient)
                        diff = LA.norm(H_est - H0)

                        tuple = [str(datetime.datetime.now())]
                        text = [option, f, scaling, diff]
                        tuple.extend(text)
                        save_csv_record(join(data_directory, csv_filename),
                                        tuple)

                        # print("diff={:1.4f}, H_est:\n{}".format(diff, H_est))

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(15)))

    # Aggregate repetitions
    df2 = df1.groupby(['option', 'f']).agg \
        ({'diff': [np.mean, np.std, np.size],  # Multiple Aggregates
          })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values
                   ]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'diff_size': 'count'}, inplace=True)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15)))

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index=['f'],
                         columns=['option'],
                         values=['diff_mean', 'diff_std'])  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values
                   ]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # df2.rename(columns={'time_size': 'count'}, inplace=True)
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))

    # Extract values
    X_f = df3['f'].values  # plot x values
    Y = []
    Y_std = []
    for option in option_vec:
        Y.append(df3['diff_mean_{}'.format(option)].values)
        Y_std.append(df3['diff_std_{}'.format(option)].values)

    # print("X_f:\n", X_f)
    # print("Y:\n", Y)
    # print("Y_std:\n", Y_std)

    if SHOW_FIG1:
        # -- Setup figure
        fig_filename = 'Fig_MHE_Optimal_ScalingFactor_diff_f_lambda10_{}.pdf'.format(
            CHOICE)
        mpl.rcParams['backend'] = 'pdf'
        mpl.rcParams['lines.linewidth'] = 3
        mpl.rcParams['font.size'] = 14
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 16
        mpl.rcParams['axes.edgecolor'] = '111111'  # axes edge color
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['figure.figsize'] = [4, 4]
        mpl.rcParams[
            'xtick.major.pad'] = 4  # padding of tick labels: default = 4
        mpl.rcParams[
            'ytick.major.pad'] = 4  # padding of tick labels: default = 4
        fig = plt.figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Draw the plots
        for i, (color, marker) in enumerate(zip(color_vec, marker_vec)):
            p = ax.plot(X_f,
                        Y[i],
                        color=color,
                        linewidth=3,
                        label=label_vec[i],
                        marker=marker)
            if i != 1:
                ax.fill_between(X_f,
                                Y[i] + Y_std[i],
                                Y[i] - Y_std[i],
                                facecolor=color,
                                alpha=0.2,
                                edgecolor='none')
        plt.xscale('log')
        plt.yscale('log')

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        plt.title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, h\!=\!{}, d\!=\!{}{}'.format(
            int(n / 1000), h, d, distribution_label))
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            labels,
            loc=legendPosition,  # 'upper right'
            handlelength=1.5,
            labelspacing=0,  # distance between label entries
            handletextpad=
            0.3,  # distance between label and the line representation
            # title='Variants',
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.1,  # padding inside legend box
        )
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_lab)
        # plt.yticks(ytick_lab1, ytick_lab1)
        plt.grid(b=True,
                 which='minor',
                 axis='both',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.grid(b=True,
                 which='major',
                 axis='y',
                 alpha=0.2,
                 linestyle='solid',
                 linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        plt.xlabel(r'Label Sparsity $(f)$', labelpad=0)  # labelpad=0
        plt.ylabel(r'L2 norm', labelpad=-5)

        if xmin is None:
            xmin = plt.xlim()[0]
        if xmax is None:
            xmax = plt.xlim()[1]
        if ymin1 is None:
            ymin1 = plt.ylim()[1]
        if ymax1 is None:
            ymax1 = plt.ylim()[1]
        plt.xlim(xmin, xmax)
        plt.ylim(ymin1, ymax1)

        if CREATE_PDF:
            plt.savefig(join(figure_directory, fig_filename),
                        format='pdf',
                        dpi=None,
                        edgecolor='w',
                        orientation='portrait',
                        transparent=False,
                        bbox_inches='tight',
                        pad_inches=0.05,
                        frameon=None)

        if SHOW_FIG1:
            plt.show()
        if SHOW_PDF:
            os.system('{} "'.format(open_cmd[sys.platform]) +
                      join(figure_directory, fig_filename) +
                      '"')  # shows actually created PDF