def test_Diff_matrices():
    """Print two parameterized H matrices and the distance of their standardized forms.

    Builds a symmetric 3-class H with strength h and a second one with
    strength 4*h, centers both with `to_centering_beliefs`, rescales each by
    the inverse of its standard deviation, and prints the norm of the
    difference (the printed label states it is expected to be close to 0).
    Nothing is asserted; the result is only printed.
    """
    print(
        "\n-- test_Diff_matrices(): 'create_parameterized_H', 'to_centering_beliefs', 'np.std', 'LA.norm' --"
    )
    strength = 2

    # -- weaker potential
    weak = create_parameterized_H(3, strength, symmetric=True)
    print("H0:\n{}\n".format(weak))
    weak_centered = to_centering_beliefs(weak)
    print("H0c (centered):\n{}\n".format(weak_centered))
    weak_spread = np.std(weak)
    print("std(H0): {}".format(weak_spread))
    weak_centered_spread = np.std(weak_centered)
    print("std(H0c): {}\n".format(weak_centered_spread))
    weak_standardized = weak_centered.dot(1 / weak_centered_spread)
    print("H0c_s (standardized centered):\n{}\n".format(weak_standardized))

    # -- potential that is 4 times stronger
    strong = create_parameterized_H(3, strength * 4, symmetric=True)
    print("H1 (4 times stronger potential):\n{}\n".format(strong))
    strong_centered = to_centering_beliefs(strong)
    strong_standardized = strong_centered.dot(1 / np.std(strong_centered))
    print("H1c_s (standardized centered):\n{}\n".format(strong_standardized))

    # -- compare the two standardized matrices
    gap = LA.norm(weak_standardized - strong_standardized)
    print("LA.norm(H0c_s - H1c_s) is quasi 0:\n{}\n".format(gap))
def calculate_accuracy(H, X_train, X_test, train_ind, test_ind, W, return_output, s=0.5):
    """Propagate labels from X_train with LinBP and report the holdout accuracy.

    The accuracy is printed and pushed onto `return_output` (used to collect
    results when this function runs in parallel); nothing is returned.

    Args:
        H: compatibility matrix used for propagation.
        X_train: explicit beliefs that seed the propagation.
        X_test: reference beliefs the propagated result is compared against.
        train_ind: row indices ignored when computing the accuracy.
        test_ind: row indices whose propagated beliefs are kept; all other
            rows of the result are zeroed before the comparison.
        W: graph adjacency matrix.
        return_output: object with a `put` method that receives the accuracy.
        s: fraction of the convergence threshold `eps_max` used as scaling.

    Note:
        `alpha`, `beta`, `gamma`, `X2` and `numMax` are not parameters; they
        are read from the enclosing module scope.
    """
    # all that is needed to propagate
    H0c = to_centering_beliefs(H)
    eps_max = eps_convergence_linbp_parameterized(
        H0c, W, method='noecho', alpha=alpha, beta=beta, gamma=gamma, X=X2)
    eps = s * eps_max

    # NOTE(review): the threshold is computed from the centered H0c, but the
    # uncentered H is what gets scaled and propagated -- confirm intended.
    F, actualIt, actualPercentageConverged = linBP_symmetric_parameterized(
        X_train, W, H * eps,
        method='noecho', alpha=alpha, beta=beta, gamma=gamma,
        numMaxIt=numMax,
        convergencePercentage=0.99, convergenceThreshold=0.99,
        debug=2)

    n, k = F.shape
    # Build the membership set once; testing `i not in test_ind` against a
    # list/array for every row is O(n * len(test_ind)).
    test_rows = frozenset(test_ind)
    for i in range(n):
        if i not in test_rows:
            F[i] = np.zeros(k)

    accuracy_X = matrix_difference(X_test, F, ignore_rows=list(train_ind), similarity='accuracy')
    print("Holdout accuracy: {}".format(accuracy_X))
    return_output.put(accuracy_X)  ## For Parallel
def calculate_accuracy(H, X_train, X_test, train_ind, test_ind, W, s=0.5):
    """Propagates from X_train numMax times, calculates accuracy over X_test.

    Args:
        H: compatibility matrix used for propagation.
        X_train: explicit beliefs that seed the propagation.
        X_test: reference beliefs the propagated result is compared against.
        train_ind: row indices ignored when computing the accuracy.
        test_ind: row indices whose propagated beliefs are kept; all other
            rows of the result are zeroed before the comparison.
        W: graph adjacency matrix.
        s: fraction of the convergence threshold `eps_max` used as scaling.

    Returns:
        The value of `matrix_difference(..., similarity='accuracy')` between
        X_test and the propagated beliefs.

    Note:
        `alpha`, `beta`, `gamma`, `X2` and `numMax` are not parameters; they
        are read from the enclosing module scope.
    """
    H0c = to_centering_beliefs(H)
    # TODO: an optimized version could attempt to calculate the spectral radius
    # fewer times and re-use it for multiple splits
    eps_max = eps_convergence_linbp_parameterized(
        H0c, W, method='noecho', alpha=alpha, beta=beta, gamma=gamma, X=X2)
    eps = s * eps_max

    # NOTE(review): the threshold is computed from the centered H0c, but the
    # uncentered H is what gets scaled and propagated -- confirm intended.
    F, actualIt, actualPercentageConverged = linBP_symmetric_parameterized(
        X_train, W, H * eps,
        method='noecho', alpha=alpha, beta=beta, gamma=gamma,
        numMaxIt=numMax,
        convergencePercentage=0.99, convergenceThreshold=0.99,
        debug=2)

    n, k = F.shape
    # Build the membership set once; testing `i not in test_ind` against a
    # list/array for every row is O(n * len(test_ind)).
    test_rows = frozenset(test_ind)
    for i in range(n):
        if i not in test_rows:
            F[i] = np.zeros(k)

    # TODO For label imbalance, better to use CLASSWISE (macro-averaging) here
    accuracy_X = matrix_difference(X_test, F, ignore_rows=list(train_ind), similarity='accuracy')
    return accuracy_X
def _f_worker_(X0, W, f, f_index):
    """Run every configured method once for one label fraction and log results.

    A fraction `1 - f` of the rows of X0 is replaced (via
    `replace_fraction_of_rows`) and errors are introduced at rate `err`. Then,
    for each option in the module-level option vectors, a compatibility
    matrix is obtained (ground truth, fixed heuristic, holdout baseline, or
    `estimateH`), beliefs are propagated with LinBP, and one CSV record
    (timestamp, label, f, accuracy, precision, recall, learn time,
    propagation time) is appended to `csv_filename` in `data_directory`.

    All experiment settings (labels, *_vec lists, H0c, H_heuristic, EC, ...)
    are read from module-level globals.

    Args:
        X0: full belief matrix with ground-truth labels.
        W: graph adjacency matrix.
        f: fraction of labeled nodes for this run.
        f_index: position of `f` in `f_vec`; indexes `lambda_vec` when an
            option has `select_lambda` enabled.

    Returns:
        The string 'success'.

    Raises:
        ValueError: re-raised from propagation/evaluation after printing a
            diagnostic line.
    """
    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    # seeds the actually used numpy random generator; both are used and thus needed
    np.random.seed(seed=RANDOMSEED)

    X1, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W, stratified=stratified)
    X2 = introduce_errors(X1, ind, err)

    options = zip(labels, select_lambda_vec, learning_method_vec, alpha_vec,
                  beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec,
                  randomize_vec)
    for (label, select_lambda, learning_method, alpha, beta, gamma, s,
         numMaxIt, weights, randomize) in options:
        learn_time = -1  # stays -1 for methods whose learning is not timed

        # -- Learning
        if learning_method == 'GT':
            H2c = H0c
        elif learning_method == 'Heuristic':
            # NOTE(review): H_heuristic is used as-is, without
            # to_centering_beliefs, unlike the other branches -- confirm.
            H2c = H_heuristic
        elif learning_method == 'Holdout':
            H2 = estimateH_baseline_serial(
                X2, ind, W,
                numMax=numMaxIt,
                numberOfSplits=numberOfSplits,
                EC=EC,
                alpha=alpha, beta=beta, gamma=gamma,
                doubly_stochastic=doubly_stochastic)
            H2c = to_centering_beliefs(H2)
        else:
            if "DCEr" in learning_method:
                learning_method = "DCEr"
            elif "DCE" in learning_method:
                learning_method = "DCE"

            # -- choose optimal lambda: allows to specify different lambda for different f
            weight = lambda_vec[f_index] if select_lambda else weights

            # -- learn H
            learn_start = time.time()
            H2 = estimateH(X2, W, method=learning_method, variant=1,
                           distance=length, EC=EC, weights=weight,
                           randomrestarts=num_restarts, randomize=randomize,
                           constraints=constraints, gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            learn_time = time.time() - learn_start
            H2c = to_centering_beliefs(H2)

        # -- Propagation
        prop_start = time.time()
        eps_max = eps_convergence_linbp_parameterized(
            H2c, W, method='noecho', alpha=alpha, beta=beta, gamma=gamma, X=X2)
        eps = s * eps_max

        try:
            F, actualIt, actualPercentageConverged = linBP_symmetric_parameterized(
                X2, W, H2c * eps,
                method='noecho', alpha=alpha, beta=beta, gamma=gamma,
                numMaxIt=numMaxIt,
                convergencePercentage=convergencePercentage_W,
                debug=2)
            prop_time = time.time() - prop_start

            if Macro_Accuracy:
                accuracy_X = matrix_difference_classwise(X0, F, ignore_rows=ind)
                precision = matrix_difference_classwise(X0, F, similarity='precision', ignore_rows=ind)
                recall = matrix_difference_classwise(X0, F, similarity='recall', ignore_rows=ind)
            else:
                accuracy_X = matrix_difference(X0, F, ignore_rows=ind)
                precision = matrix_difference(X0, F, similarity='precision', ignore_rows=ind)
                recall = matrix_difference(X0, F, similarity='recall', ignore_rows=ind)

            result = [str(datetime.datetime.now())]
            result.extend([label, f, accuracy_X, precision, recall, learn_time, prop_time])
            save_csv_record(join(data_directory, csv_filename), result)

        except ValueError as e:
            # `h` is not among the globals configured by the experiment
            # setup; fall back to a placeholder so a NameError raised here
            # cannot mask the original ValueError.
            print("ERROR: {} with {}: d={}, h={}".format(
                e, learning_method, d, globals().get('h', 'n/a')))
            raise  # bare raise keeps the original traceback

    return 'success'
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False):
    """Run one end-to-end accuracy experiment on a real dataset and plot it.

    The experiment is configured through module-level globals (so that the
    worker processes can read them): defaults are set first, then the nested
    `choose` applies the overrides for `choice`. If data creation is
    requested, the graph is loaded, a reference compatibility matrix is
    estimated, and the work is distributed over a multiprocessing pool. In
    all cases the CSV for the experiment is then read and passed to
    `sslhv.plot`.

    Args:
        choice: integer id of the experiment configuration (3xx Prop37,
            4xx MovieLens, 5xx Yelp, 6xx Flickr, 7xx DBLP, 8xx Enron,
            9xx Cora, 1001 Citeseer, 11xx Hep-th, 1204 Pokec-Gender).
        create_data: if True, rewrite the CSV with only the header and then
            run the experiments.
        add_data: if True, run the experiments without rewriting the header
            (presumably appending to the existing CSV -- depends on
            `save_csv_record`'s default).
        show_plot: forwarded to `sslhv.plot` as `show_plot`.
        create_pdf: forwarded to `sslhv.plot` as `save`.
        show_pdf: forwarded to `sslhv.plot` as `show`.

    Raises:
        Warning: if `choice` (or a configuration it builds on) is unknown.
    """
    global n, d, rep_SameGraph, FILENAMEZ, csv_filename
    global initial_h0, exponent, length, variant
    global alpha_vec, beta_vec, gamma_vec, s_vec, clip_on_vec, numMaxIt_vec
    # Plotting Parameters
    global xtick_lab, xtick_labels, ytick_lab, xmax, xmin, ymin, ymax
    global labels, facecolor_vec, draw_std_vec, linestyle_vec, linewidth_vec
    global marker_vec, markersize_vec, legend_location
    global option_vec, learning_method_vec
    global Macro_Accuracy, EC, constraints, weight_vec, randomize_vec
    global k, err, avoidNeighbors, convergencePercentage_W, stratified
    global gradient, doubly_stochastic, num_restarts, numberOfSplits
    global H_heuristic, select_lambda_vec, lambda_vec, f_vec, H0c

    # -- Setup
    CHOICE = choice
    # 300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph
    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.001, 0.01, 0.1, 1]
    # Raw strings: '\%' is an invalid escape sequence in a normal literal;
    # the resulting value (backslash followed by %) is unchanged.
    xtick_labels = [r'0.1\%', r'1\%', r'10\%', r'100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None

    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        """Apply the global overrides for experiment `choice`.

        Derived configurations call `choose` recursively on the
        configuration they extend, so every branch must test the argument
        `choice`, not the id of the experiment being run.
        """
        global n, d, rep_SameGraph, FILENAMEZ
        global initial_h0, exponent, length, variant
        global alpha_vec, beta_vec, gamma_vec, s_vec, clip_on_vec, numMaxIt_vec
        # Plotting Parameters
        global xtick_lab, xtick_labels, ytick_lab, xmax, xmin, ymin, ymax
        global labels, facecolor_vec, draw_std_vec, linestyle_vec, linewidth_vec
        global marker_vec, markersize_vec, legend_location
        global option_vec, learning_method_vec
        global Macro_Accuracy, EC, constraints, weight_vec, randomize_vec
        global k, err, avoidNeighbors, convergencePercentage_W, stratified
        global gradient, doubly_stochastic, num_restarts, numberOfSplits
        global H_heuristic, select_lambda_vec, lambda_vec, f_vec

        # -- Default Graph parameters
        if choice == 0:
            pass

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]

        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)
            select_lambda_vec = [False] * 6

        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476],
                                    [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'
            n = 26850
            d = 25.0832029795

        elif choice == 402:
            choose(401)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 404:
            choose(401)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  # TODO

        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']

        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476],
                                    [.476, .0476, .476],
                                    [.476, .476, .0476]])

        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec

        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # DO NOT RUN WITH CREATE_DATA=True, if you do please restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2",
                "#C44E52", "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7',
                'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'
            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 4 + [True]
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 3 + [True] * 2

        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)
            lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10
            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7',
                'opt8'
            ]
            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]
            labels = ['GT'] + [
                i + ' {}'.format(weight_vec[ix])
                for ix, i in enumerate(['DCEr'] * 6)
            ]

        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476],
                                    [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 4 + [True] * 2
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)
            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(605)
            choose(801)

        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])

        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]
            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures
            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec

        # -- Cora dataset
        elif choice == 901:
            FILENAMEZ = 'cora'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.9
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Cora'
            legend_location = 'lower right'
            n = 2708
            d = 7.8

        # -- Citeseer dataset
        # (was `CHOICE == 1001`: must test the argument, see docstring)
        elif choice == 1001:
            FILENAMEZ = 'citeseer'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Citeseer'
            legend_location = 'lower right'
            n = 3312
            d = 5.6

        # (was `CHOICE == 1101`, which made choose(1101) from 1102 fall
        # through to the "Incorrect choice!" branch)
        elif choice == 1101:
            FILENAMEZ = 'hep-th'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.1
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Hep-th'
            legend_location = 'lower right'
            n = 27770
            d = 5.6

        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        # (was `CHOICE == 1204`)
        elif choice == 1204:
            FILENAMEZ = 'pokec-gender'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.000015
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [0, 3, 4, 4, 4, 4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Pokec-Gender'
            legend_location = 'lower right'
            n = 1632803
            d = 54.6

        else:
            raise Warning("Incorrect choice!")

    for choice in experiments:
        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)

        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            save_csv_record(join(data_directory, csv_filename), header, append=False)

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:
            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')
            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n
            k = len(X0[0])

            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                           weights=1, randomize=False, constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            print("H0 w/ constraints:\n", np.round(H0, 2))

            H2 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                           weights=1, randomize=False, constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC,
                           weights=2, randomize=False, gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC,
                           weights=2, randomize=False, constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC,
                           weights=10, randomize=False, gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC,
                           weights=10, randomize=False, constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/ constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/ constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 20 w/ constraints:\n", np.round(H7, 3))

            print()
            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:
            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')
            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n

            # --- Calculating True Compatibility matrix
            H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                           weights=1, randomize=False,
                           constraints=constraints, gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Starts a pool with at least 2 workers, and a lot more if you
            # happen to be on a supercomputer
            pool = multiprocessing.Pool(max(2, multiprocessing.cpu_count() - 4))

            # One work item per (f, repetition)
            results = [(X0, W, f, ix) for ix, f in enumerate(f_vec)] * rep_SameGraph

            try:
                # hacky fix due to a bug in 2.7 multiprocessing:
                # distribute work for evaluating accuracy over the pool
                # using map_async + get(timeout), because python 2.7
                # multiprocessing is not fully featured
                pool.map_async(multi_run_wrapper, results).get(num_results * 2)
            except multiprocessing.TimeoutError:
                # Timed out: skip reading/plotting for this experiment
                # (the finally clause below still runs first).
                continue
            finally:
                pool.close()
                pool.join()

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)

        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1, join(figure_directory, acc_filename),
                   n=n, d=d, k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
def test_approx_spectral_radius():
    """Time `approx_spectral_radius` variants on a large and on small matrices.

    First a graph is generated with `planted_distribution_model` and the
    Kronecker product of its transposed adjacency matrix with P is used as
    the large test matrix. Then small centered and uncentered parameterized
    H matrices are used. Timings and radii are only printed, not asserted.
    """
    def report(time_template, rho_template, matrix, **options):
        # Time one call and print the elapsed time followed by the result.
        began = time.time()
        rho = approx_spectral_radius(matrix, **options)
        print(time_template.format(time.time() - began))
        print(rho_template.format(rho))

    print("\n-- 'approx_spectral_radius' --")

    # --- Create the graph
    n = 1000
    h = 5
    P = np.array([[1, h, 1],
                  [h, 1, 1],
                  [1, 1, h]])
    graph_options = dict(
        alpha=[0.3334, 0.3333, 0.3333],
        P=P,
        m=10000,
        distribution='powerlaw',  # uniform powerlaw
        exponent=-0.3,
        backEdgesAllowed=True,
        sameInAsOutDegreeRanking=False,
        debug=False,
    )

    began = time.time()
    W, Xd = planted_distribution_model(n, **graph_options)
    print("n: {}".format(n))
    print("Time for graph generation: {}\n".format(time.time() - began))

    # --- Bigger graph with Kronecker
    M = kron(W.transpose(), P)

    # --- Time two variants
    report("Time for pyamg spectral radius: {}", "rho1: {}", M, pyamg=True)
    report("Time for scipy spectral radius: {}", "rho2: {}", M, pyamg=False)

    # --- 3 methods for spectral radius, including non-sparse matrices
    H = to_centering_beliefs(create_parameterized_H(20, 2, symmetric=True))
    print("\nH:\n{}".format(H))

    report("Time for pyamg spectral radius: {}", "rho1: {}", H, pyamg=True)
    report("Time for scipy spectral radius: {}", "rho2: {}", H, pyamg=False)
    report("Time for non-sparse numpy spectral radius: {}", "rho3: {}", H,
           pyamg=False, sparse=False)

    # --- For k=2, scipy made to default to non-sparse
    H = create_parameterized_H(2, 2, symmetric=True)
    print("\nH (k=2):\n{}".format(H))

    report("Time for scipy spectral radius: {}", "rho4: {}", H, pyamg=False)
def test_matrix_difference():
    """Print row-wise `matrix_difference` results for several input pairs.

    Covers integer matrices (cosine, cosine_ratio, l2), two belief matrices
    before and after `to_centering_beliefs`, and two pairs of 1-d vectors.
    Results are only printed, not asserted.
    """
    print(
        "\n-- 'matrix_difference' (cosine/cosine_ratio/l2), 'to_centering_beliefs' --"
    )

    # -- integer matrices, all three similarity measures
    left = np.array([
        [2, 0, 0],
        [2, 0, 2],
        [0, 1, 0],
        [0, 0, 3],
        [0, 0, 3],
        [1, 0, 2],
        [0, 3, 3],
        [0, 0, 0],
        [0, 1, 0],
        [0, 1, 0],
        [9, 9, 9],
        [9, 9, 9],
        [100, 100, 100],
    ])
    right = np.array([
        [1, 1, 2],
        [2, 1, 2],
        [3, 4, 0],
        [1, 1, 2],
        [2, 1, 1],
        [1, 2, 2],
        [1, 2, 3],
        [0, 0, 0],
        [1, 0, 0],
        [0, 2, 0],
        [9, 9, 9],
        [8, 9, 9],
        [100, 100, 101],
    ])
    print("X0:\n", left)
    print("X1:\n", right)

    for measure, caption in (('cosine', "cosine:\n"),
                             ('cosine_ratio', "cosine_ratio:\n"),
                             ('l2', "l2:\n")):
        print(caption, matrix_difference(left, right, similarity=measure, vector=True))

    # -- belief matrices
    left = np.array([[1., 0., 0.],
                     [0.30804075, 0.56206462, 0.12989463],
                     [0.32434628, 0.33782686, 0.33782686],
                     [0.30804075, 0.12989463, 0.56206462],
                     [0.14009173, 0.71981654, 0.14009173],
                     [0.32273419, 0.21860539, 0.45866042],
                     [0.33804084, 0.32391832, 0.33804084],
                     [0.45866042, 0.21860539, 0.32273419]])
    right = np.array([[1., 0., 0.],
                      [0.22382029, 0.45296374, 0.32321597],
                      [0.32434628, 0.33782686, 0.33782686],
                      [0.22382029, 0.32321597, 0.45296374],
                      [0.2466463, 0.5067074, 0.2466463],
                      [0.32273419, 0.21860539, 0.45866042],
                      [0.33804084, 0.32391832, 0.33804084],
                      [0.45866042, 0.21860539, 0.32273419]])
    print("\nX0:\n", left)
    print("X1:\n", right)
    print("cosine_ratio:\n",
          matrix_difference(left, right, similarity='cosine_ratio', vector=True))

    # -- same matrices after centering
    left_centered = to_centering_beliefs(left)
    right_centered = to_centering_beliefs(right)
    print("\nX0z:\n", left_centered)
    print("X1z:\n", right_centered)
    print("cosine_ratio zscores:\n",
          matrix_difference(left_centered, right_centered,
                            similarity='cosine_ratio', vector=True))

    # -- 1-d vectors
    left = np.array([1, 0, 0])
    right = np.array([1, 1, 0])
    print("\nX0:\n", left)
    print("X1:\n", right)
    print("cosine_ratio zscores:\n",
          matrix_difference(left, right, similarity='cosine_ratio', vector=True))

    left = np.array([-30, -15, 45])
    right = np.array([-15, -30, 45])
    print("\nX0:\n", left)
    print("X1:\n", right)
    print("cosine_ratio zscores:\n",
          matrix_difference(left, right, similarity='cosine_ratio', vector=True))
def test_transform_beliefs():
    """Exercise the belief checks and the centering / un-centering helpers.

    Asserts the outcome of 'check_normalized_beliefs' and 'check_centered_beliefs'
    on inputs that are off by 1e-4 (accepted) and by 1e-3 (rejected), and prints
    the results of 'to_centering_beliefs' / 'from_centering_beliefs' for small
    arrays and for the matrix loaded from 'Torus_X.csv'.
    """
    # -- Normalized beliefs: a deviation of 1e-4 passes the check, 1e-3 does not
    print("\n-- 'check_normalized_beliefs', 'to_centering_beliefs' --")
    near_one_2d = np.array([[1.0001, 0, 0]])
    print("X:", near_one_2d)
    assert check_normalized_beliefs(near_one_2d)
    print("X centered:", to_centering_beliefs(near_one_2d))

    near_one_1d = np.array([0.9999, 0, 0])
    print("Y:", near_one_1d)
    assert check_normalized_beliefs(near_one_1d)
    print("Y centered:", to_centering_beliefs(near_one_1d))

    too_large_2d = np.array([[1.001, 0, 0]])
    print("Z:", too_large_2d)
    assert not check_normalized_beliefs(too_large_2d)

    too_small_1d = np.array([0.999, 0, 0])
    print("W:", too_small_1d)
    assert not check_normalized_beliefs(too_small_1d)

    # -- Centered beliefs: same tolerances around a zero row sum
    print("\n-- 'check_centered_beliefs', 'from_centering_beliefs'")
    centered_ok_2d = np.array([[1.0001, -1, 0]])
    print("Xc: ", centered_ok_2d)
    assert check_centered_beliefs(centered_ok_2d)
    print("Xc uncentered: ", from_centering_beliefs(centered_ok_2d))

    centered_ok_1d = np.array([0.9999, -1, 0])
    print("Yc: ", centered_ok_1d)
    assert check_centered_beliefs(centered_ok_1d)
    print("Yc uncentered: ", from_centering_beliefs(centered_ok_1d))

    centered_bad_2d = np.array([[1.001, -1, 0]])
    print("Zc: ", centered_bad_2d)
    assert not check_centered_beliefs(centered_bad_2d)

    centered_bad_1d = np.array([0.999, -1, 0])
    print("Wc: ", centered_bad_1d)
    assert not check_centered_beliefs(centered_bad_1d)

    # -- Round trip on a matrix that also contains all-zero rows
    print("\n-- 'to_centering_beliefs', 'from_centering_beliefs' for matrices --")
    belief_matrix = np.array([
        [1, 0, 0],
        [0.8, 0.2, 0],
        [1. / 3, 1. / 3, 1. / 3],
        [0, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 0],
    ])
    print("X original:\n", belief_matrix)
    print("np.sum(X,1):\n", np.sum(belief_matrix, 1))
    print("X.sum(axis=1, keepdims=True):\n", belief_matrix.sum(axis=1, keepdims=True))
    print("X.shape:", belief_matrix.shape)
    print("len(X.shape): ", len(belief_matrix.shape))
    matrix_centered = to_centering_beliefs(belief_matrix, ignoreZeroRows=True)
    print("X centered (ignoringZeroRows=True):\n", matrix_centered)
    matrix_restored = from_centering_beliefs(matrix_centered)
    print("X again un-centered:\n", matrix_restored)

    # -- Torus example read from disk, scaled by 0.1 and then un-centered
    torus_path = join(data_directory, 'Torus_X.csv')
    torus, _, _ = load_X(torus_path, n=8, zeroindexing=False)
    torus = torus.dot(0.1)
    print("\nCentered X for Torus example as input\n", torus)
    torus_uncentered = from_centering_beliefs(torus)
    print("X un-centered:\n", torus_uncentered)

    # -- Round trip on a single-row matrix
    single_row = np.array([[1, 0, 0]])
    print("\nX original:\n", single_row)
    single_row_centered = to_centering_beliefs(single_row)
    print("X centered:\n", single_row_centered)
    single_row_restored = from_centering_beliefs(single_row_centered)
    print("X back non-centered:\n", single_row_restored)

    # -- Shape facts for a 1-D vector
    flat_vector = np.array([1, 0, 0])
    print("\nX original:\n", flat_vector)
    print("np.sum(X,0):", np.sum(flat_vector, 0))
    print("X.sum(axis=0, keepdims=True):", flat_vector.sum(axis=0, keepdims=True))
    print("X.shape: ", flat_vector.shape)
    print("len(X.shape): ", len(flat_vector.shape))
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False,
        shorten_length=False):
    """Create and/or plot the 'Fig_homophily_<choice>' experiment (accuracy vs. label sparsity f).

    The function has up to three phases:
      1. optional data creation: synthetic graphs are generated, labels are removed down to a
         fraction f, a compatibility matrix is chosen per option ('GT', 'DHE', 'Homophily'),
         LinBP is run and one CSV record (time, option, f, accuracy) is saved per run;
      2. the CSV is read back, aggregated per (option, f) and pivoted;
      3. optional plotting of mean accuracy with a +/- one std band per option.

    Args:
        choice: experiment id; only 101 is defined.
        create_data: if True, the CSV is first re-initialised with the header row
            (``append=False``) and new records are generated.
        add_data: if True, new records are generated without re-initialising the CSV.
        show_plot: if True, show the matplotlib figure interactively.
        create_pdf: if True, save the figure as PDF in ``figure_directory``.
        show_pdf: if True, open the saved PDF through ``showfig``.
        shorten_length: if True, only every second data point is kept for plotting.

    Raises:
        Warning: if ``choice`` is not a defined experiment id (exception type kept for callers).
    """
    # These three names are deliberately (re)bound at module level, as in the original code.
    global f_vec, labels, facecolor_vec

    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf
    STD_FILL = True
    # Fix: this flag was only defined in a comment, so reading it below raised NameError
    # (unless a module-level global of that name existed) and the parameter was ignored.
    SHORTEN_LENGTH = shorten_length

    fig_filename = 'Fig_homophily_{}.pdf'.format(CHOICE)
    csv_filename = 'Fig_homophily_{}.csv'.format(CHOICE)
    header = ['currenttime', 'option', 'f', 'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    # -- Default Graph parameters
    k = 3
    rep_DifferentGraphs = 1
    rep_SameGraph = 2
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    constraint = True
    EC = True  # Non-backtracking for learning
    s = 0.5
    err = 0
    numMaxIt = 10
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True

    # -- Default plot parameters
    clip_on_vec = [True] * 10
    ymin = 0.3
    ymax = 1
    xmin = 0.001
    xmax = 1
    xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    # Raw strings: '\%' is not a valid escape sequence in a normal string literal
    xtick_labels = [r'1e-5', r'0.01\%', r'0.1\%', r'1\%', r'10\%', r'100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [5, 2, 3, 3, 3, 3] + [3] * 10
    marker_vec = [None, '^', 'v', 'o', '^'] + [None] * 10
    markersize_vec = [0, 8, 8, 8, 6, 6] + [6] * 10
    facecolor_vec = ['black', "#C44E52", "#64B5CD"]

    # -- Options with propagation variants
    if CHOICE == 101:
        n = 10000
        h = 3
        d = 15
        f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
        option_vec = ['opt1', 'opt2', 'opt3']
        learning_method_vec = ['GT', 'DHE', 'Homophily']
        weight_vec = [None] + [10] + [None]
        randomize_vec = [None] + [True] + [None]
        xmin = 0.001
        ymin = 0.3
        ymax = 1
        labels = ['GS', 'DCEr', 'Homophily']
    else:
        raise Warning("Incorrect choice!")

    a = 1
    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                 distribution=distribution, exponent=exponent,
                                                 directed=False, debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for j in range(rep_SameGraph):  # repeat several times for same graph
                ind = None  # handed back into replace_fraction_of_rows as ind_prior for the next f
                for f in f_vec:
                    X1, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W,
                                                       ind_prior=ind, stratified=stratified)
                    X2 = introduce_errors(X1, ind, err)

                    for option_index, (option, learning_method, weights, randomize) in \
                            enumerate(zip(option_vec, learning_method_vec, weight_vec, randomize_vec)):

                        # -- Learning
                        if learning_method == 'GT':
                            H2 = H0
                        elif learning_method == 'Homophily':
                            H2 = np.identity(k)
                        elif learning_method == 'DHE':
                            H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length,
                                           EC=EC, weights=weights, randomize=randomize,
                                           constraints=constraint)

                        # -- Propagation
                        H2c = to_centering_beliefs(H2)
                        # NOTE(review): X2c is not used below (propagation starts from X2); the call
                        # is kept only so that this fix does not change which helpers are invoked.
                        X2c = to_centering_beliefs(X2, ignoreZeroRows=True)

                        try:
                            eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho', X=X2)
                            eps = s * eps_max
                            F, actualIt, actualPercentageConverged = \
                                linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                              method='noecho',
                                                              numMaxIt=numMaxIt,
                                                              convergencePercentage=convergencePercentage_W,
                                                              debug=2)
                        except ValueError as e:
                            print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))
                        else:
                            accuracy_X = matrix_difference_classwise(X0, F, ignore_rows=ind)

                            # 'record' instead of 'tuple', which shadowed the builtin
                            record = [str(datetime.datetime.now())]
                            record.extend([option_vec[option_index], f, accuracy_X])
                            save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    desired_decimals = 7
    df1['f'] = df1['f'].apply(lambda x: round(x, desired_decimals))  # rounding due to different starting points

    # Aggregate repetitions
    df2 = df1.groupby(['option', 'f']).agg(
        {'accuracy': [np.mean, np.std, np.size],  # Multiple Aggregates
         })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)

    # Pivot table
    df3 = pd.pivot_table(df2, index=['f'], columns=['option'], values=['accuracy_mean', 'accuracy_std'])
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy

    # Extract values
    X_f = df3['f'].values  # plot x values
    Y = []
    Y_std = []
    for option in option_vec:
        Y.append(df3['accuracy_mean_{}'.format(option)].values)
        if STD_FILL:
            Y_std.append(df3['accuracy_std_{}'.format(option)].values)

    if SHORTEN_LENGTH:
        SHORT_FACTOR = 2  # KEEP EVERY Nth ELEMENT
        X_f = X_f[::SHORT_FACTOR].copy()
        for i in range(len(Y)):
            Y[i] = Y[i][::SHORT_FACTOR].copy()
            if STD_FILL:
                Y_std[i] = Y_std[i][::SHORT_FACTOR].copy()

    if CREATE_PDF or SHOW_PLOT or SHOW_PDF:

        # -- Setup figure
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['font.size'] = 16
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Drawing
        # 'idx' instead of 'choice' as loop variable, which shadowed the function parameter
        if STD_FILL:
            for idx, (_, facecolor) in enumerate(zip(option_vec, facecolor_vec)):
                upper = Y[idx] + Y_std[idx]
                lower = Y[idx] - Y_std[idx]
                ax.fill_between(X_f, upper, lower,
                                facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, upper, linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, lower, linewidth=0.5, color='0.8', linestyle='solid')

        for idx, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec,
                              linestyle_vec, marker_vec, markersize_vec)):
            ax.plot(X_f, Y[idx], linewidth=linewidth, color=color, linestyle=linestyle, label=label,
                    zorder=4, marker=marker, markersize=markersize, markeredgewidth=1,
                    clip_on=clip_on, markeredgecolor='black')

        plt.xscale('log')

        # -- Title and legend
        distribution_label = '$'
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        n_label = '{}k'.format(int(n / 1000))
        if n < 1000:
            n_label = '{}'.format(n)
        a_label = ''
        if a != 1:
            a_label = r', a\!=\!{}'.format(a)

        titleString = r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}{}{}'.format(n_label, d, h, a_label,
                                                                        distribution_label)
        plt.title(titleString)

        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(handles, labels,
                            loc='upper left',  # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)

        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        # Visibility is passed positionally: the keyword was renamed from 'b' to 'visible'
        # in newer matplotlib, the positional form works with both.
        grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
        grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
        xlabel(r'Label Sparsity $(f)$', labelpad=0)
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            # 'frameon=None' (the default) dropped: the keyword no longer exists in newer matplotlib
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05)
        if SHOW_PLOT:
            plt.show()
        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False,
        shorten_length=False):
    """Create and/or plot the 'Fig_End-to-End_accuracy_VaryK_<choice>' experiment (accuracy vs. k).

    The function has up to three phases:
      1. optional data creation: for every number of classes k in ``k_vec`` synthetic graphs are
         generated, labels are removed down to a fraction f, a compatibility matrix is obtained
         per option, LinBP is run and one CSV record (time, option, k, f, accuracy) is saved;
      2. the CSV is read back, aggregated per (option, k, f) and pivoted;
      3. optional plotting of mean accuracy with a +/- one std band per (option, f) pair.

    Args:
        choice: experiment id, one of 500..508.
        create_data: if True, the CSV is first re-initialised with the header row
            (``append=False``) and new records are generated.
        add_data: if True, new records are generated without re-initialising the CSV.
        show_plot: if True, show the matplotlib figure interactively.
        create_pdf: if True, save the figure as PDF in ``figure_directory``.
        show_pdf: if True, open the saved PDF through ``showfig``.
        shorten_length: accepted for signature compatibility; not used by this figure.

    Raises:
        Warning: if ``choice`` is not a defined experiment id (exception type kept for callers).
    """
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    STD_FILL = True

    csv_filename = 'Fig_End-to-End_accuracy_VaryK_{}.csv'.format(CHOICE)
    header = ['currenttime', 'option', 'k', 'f', 'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph
    rep_DifferentGraphs = 10  # iterations on different graphs
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    EC = True  # Non-backtracking for learning
    err = 0
    avoidNeighbors = False
    gradient = False
    pruneRandom = False
    convergencePercentage_W = None
    stratified = True
    numberOfSplits = 1
    f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
    k_vec = [3, 4, 5]

    # -- Default plot parameters
    ymin = 0.3
    ymax = 1
    xmax = 8
    xtick_lab = [2, 3, 4, 5, 6, 7, 8]
    xtick_labels = ['2', '3', '4', '5', '6', '7', '8']
    ytick_lab = np.arange(0, 1.1, 0.1)
    label_vec = ['*'] * 10
    clip_on_vec = [False] * 10
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [5, 4, 3, 3] + [3] * 10
    marker_vec = [None, None, 'o', 'x', 'o', '^', 'o', 'x', 'o', '^', 'o', 'x', 'o', '^']
    markersize_vec = [0, 0, 4, 8] + [6] * 10

    # -- Values that every defined CHOICE sets identically (hoisted out of the branches).
    # weight_vec was [10] * 3 for CHOICE 500/501; zip() below truncates to the number of
    # learning methods anyway, so the longer list is equivalent.
    weight_vec = [10] * 10
    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    numMaxIt_vec = [10] * 10
    facecolor_vec = ['black'] + ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"] * 3

    # -- Building blocks shared by the six-option experiments
    three_options = ['opt1', 'opt2', 'opt3']
    six_options = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    three_randomize = [False] * 2 + [True]
    six_randomize = [False] * 4 + [True] + [False]
    gt_methods = ['GT', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
    gt_labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
    gs_methods = ['GS', 'LHE', 'MHE', 'DHE', 'DHE', 'Holdout']
    gs_labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']

    # -- Options with propagation variants
    if CHOICE == 500:  # 1k nodes
        n, h, d = 1000, 8, 25
        option_vec = three_options
        learning_method_vec = ['GS', 'MHE', 'DHE']
        label_vec = ['GS', 'MCE', 'DCEr']
        randomize_vec = three_randomize
        xmin = 3.
        ymin = 0.
        ymax = 1.
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5, 6]

    elif CHOICE == 501:  # 10k nodes
        n, h, d = 10000, 8, 25
        option_vec = three_options
        learning_method_vec = ['GT', 'MHE', 'DHE']
        label_vec = ['GT', 'MCE', 'DCEr']
        randomize_vec = three_randomize
        xmin = 2.
        ymin = 0.
        ymax = 1.
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [2, 3, 4, 5]

    elif CHOICE == 502:  # 10k nodes
        n, h, d = 10000, 8, 25
        option_vec = six_options
        learning_method_vec = gt_methods
        label_vec = gt_labels
        randomize_vec = six_randomize
        xmin = 2
        ymin = 0.6
        ymax = 1.
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]

    elif CHOICE == 503:  # 10k nodes
        n, h, d = 10000, 3, 25
        option_vec = six_options
        learning_method_vec = gt_methods
        label_vec = gt_labels
        randomize_vec = six_randomize
        xmin = 2
        ymin = 0.3
        ymax = 0.9
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10

    elif CHOICE == 504:  # 10k nodes
        n, h, d = 10000, 3, 25
        option_vec = six_options
        learning_method_vec = gt_methods
        label_vec = gt_labels
        randomize_vec = six_randomize
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        f_vec = [0.01]
        k_vec = [7]
        clip_on_vec = [True] * 10

    elif CHOICE == 505:  # 10k nodes with f = 0.005
        n, h, d = 10000, 3, 25
        option_vec = six_options
        learning_method_vec = gt_methods
        label_vec = gt_labels
        randomize_vec = six_randomize
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        f_vec = [0.005]
        k_vec = [2, 3, 4, 5, 6, 7]
        clip_on_vec = [True] * 10

    elif CHOICE == 506:  # 10k nodes, without the 'Holdout' option
        n, h, d = 10000, 8, 25
        # Only five options; label_vec / randomize_vec keep six entries, the extra one is unused
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
        label_vec = gt_labels
        randomize_vec = six_randomize
        xmin = 2
        xmax = 7
        ymin = 0.2
        ymax = 0.9
        f_vec = [0.005]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        rep_SameGraph = 1
        rep_DifferentGraphs = 1

    elif CHOICE == 507:  # 10k nodes with gradient and PruneRandom
        n, h, d = 10000, 3, 25
        option_vec = six_options
        learning_method_vec = gs_methods
        label_vec = gs_labels
        randomize_vec = six_randomize
        xmin = 2
        ymin = 0.1
        ymax = 0.9
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True

    elif CHOICE == 508:  # 1k nodes with gradient and PruneRandom
        n, h, d = 1000, 3, 10
        option_vec = six_options
        learning_method_vec = gs_methods
        label_vec = gs_labels
        randomize_vec = six_randomize
        xmin = 2
        ymin = 0.1
        ymax = 0.9
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True
        rep_DifferentGraphs = 1
        rep_SameGraph = 1

    else:
        raise Warning("Incorrect choice!")

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            for k in k_vec:
                H0 = create_parameterized_H(k, h, symmetric=True)
                H0c = to_centering_beliefs(H0)

                # uniform class prior over the k classes
                alpha0 = np.array([1.] * k)
                alpha0 = alpha0 / np.sum(alpha0)

                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                     distribution=distribution, exponent=exponent,
                                                     directed=False, debug=False)
                X0 = from_dictionary_beliefs(Xd)

                for j in range(rep_SameGraph):  # repeat several times for same graph
                    ind = None  # handed back into replace_fraction_of_rows as ind_prior for the next f
                    for f in f_vec:
                        # Remove fraction (1-f) of rows from X0
                        X1, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors,
                                                           W=W, ind_prior=ind, stratified=stratified)
                        X2 = introduce_errors(X1, ind, err)

                        for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt,
                                           weights, randomize) in \
                                enumerate(zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec,
                                              s_vec, numMaxIt_vec, weight_vec, randomize_vec)):

                            # -- Learning
                            # Fix: 'GS' (CHOICE 500/507/508) is handled like 'GT', i.e. the planted
                            # matrix is used. Before, 'GS' fell through to estimateH(method='GS').
                            # NOTE(review): confirm estimateH has no 'GS' method of its own.
                            if learning_method in ('GT', 'GS'):
                                H2c = H0c
                            elif learning_method == 'Holdout':
                                H2 = estimateH_baseline_serial(X2, ind, W, numMax=numMaxIt,
                                                               numberOfSplits=numberOfSplits,
                                                               EC=EC,
                                                               alpha=alpha, beta=beta, gamma=gamma)
                                H2c = to_centering_beliefs(H2)
                            elif learning_method != 'DHE':
                                H2 = estimateH(X2, W, method=learning_method, variant=1,
                                               distance=length, EC=EC, weights=weights,
                                               randomize=randomize)
                                H2c = to_centering_beliefs(H2)
                            else:
                                # only 'DHE' receives the gradient / random-restart switches
                                H2 = estimateH(X2, W, method=learning_method, variant=1,
                                               distance=length, EC=EC, weights=weights,
                                               randomize=randomize, gradient=gradient,
                                               randomrestarts=pruneRandom)
                                H2c = to_centering_beliefs(H2)

                            # -- Propagation
                            # NOTE(review): X2c is not used below (propagation starts from X2); the
                            # call is kept only so that this fix does not change which helpers run.
                            X2c = to_centering_beliefs(X2, ignoreZeroRows=True)

                            eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho',
                                                                          alpha=alpha, beta=beta,
                                                                          gamma=gamma, X=X2)
                            eps = s * eps_max
                            try:
                                F, actualIt, actualPercentageConverged = \
                                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                                  method='noecho',
                                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                                  numMaxIt=numMaxIt,
                                                                  convergencePercentage=convergencePercentage_W,
                                                                  debug=2)
                            except ValueError as e:
                                print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))
                            else:
                                accuracy_X = matrix_difference(X0, F, ignore_rows=ind)

                                # 'record' instead of 'tuple', which shadowed the builtin
                                record = [str(datetime.datetime.now())]
                                record.extend([option_vec[option_index], k, f, accuracy_X])
                                save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))

    # -- Aggregate repetitions
    df2 = df1.groupby(['option', 'k', 'f']).agg(
        {'accuracy': [np.mean, np.std, np.size, np.median],  # Multiple Aggregates
         })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)

    # -- Pivot table
    df3 = pd.pivot_table(df2, index=['f', 'k'], columns=['option'],
                         values=['accuracy_mean', 'accuracy_std'])
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy

    X_f = df3['k'].values  # k of every (f, k) row in the pivot table

    # Fix: x values are now collected per f, like the y values. Before, all rows of df3 were
    # used as x while y was filtered by f, which only matched when df3 held a single f.
    X_hash = {}
    Y_hash = defaultdict(dict)
    Y_hash_std = defaultdict(dict)
    for f in f_vec:
        rows_f = df3.loc[df3['f'] == f]
        X_hash[f] = rows_f['k'].values
        for option in option_vec:
            Y_hash[f][option] = rows_f['accuracy_mean_{}'.format(option)].values
            Y_hash_std[f][option] = rows_f['accuracy_std_{}'.format(option)].values

    if CREATE_PDF or SHOW_PLOT or SHOW_PDF:

        # -- Setup figure
        fig_filename = 'Fig_End-to-End_accuracy_varyK_{}.pdf'.format(CHOICE)
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Drawing: one curve per (option, f) pair
        opt_f_vecs = [(option, f) for option in option_vec for f in f_vec]

        for ((option, f), color, linewidth, clip_on, linestyle, marker, markersize) in \
                zip(opt_f_vecs, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec,
                    marker_vec, markersize_vec):

            label = label_vec[option_vec.index(option)]
            x_vals = X_hash[f]
            mean = Y_hash[f][option]

            if STD_FILL:
                upper = mean + Y_hash_std[f][option]
                lower = mean - Y_hash_std[f][option]
                ax.fill_between(x_vals, upper, lower,
                                facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(x_vals, upper, linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(x_vals, lower, linewidth=0.5, color='0.8', linestyle='solid')

            ax.plot(x_vals, mean, linewidth=linewidth, color=color, linestyle=linestyle,
                    label=label, zorder=4, marker=marker, markersize=markersize,
                    markeredgewidth=1, markeredgecolor='black', clip_on=clip_on)

        if CHOICE == 507:
            # reference curve 1/k, labelled 'Random' in the legend
            Y_f = [1 / float(i) for i in X_f]
            ax.plot(X_f, Y_f, linewidth=2, color='black', linestyle='dashed', label='Random',
                    zorder=4, marker='x', markersize=8, markeredgewidth=1,
                    markeredgecolor='black', clip_on=clip_on)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        # 'f' is the value left over from the loops above (the last f that was iterated)
        title(r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(n_label, d, h, f,
                                                                       distribution_label))
        handles, label_vec = ax.get_legend_handles_labels()
        legend = plt.legend(handles, label_vec,
                            loc='upper right',  # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')

        # Visibility is passed positionally: the keyword was renamed from 'b' to 'visible'
        # in newer matplotlib, the positional form works with both.
        grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
        grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
        xlabel(r'Number of Classes $(k)$', labelpad=0)
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            # 'frameon=None' (the default) dropped: the keyword no longer exists in newer matplotlib
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05)
        if SHOW_PLOT:
            plt.show()
        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False,
        shorten_length=False, show_arrows=False):
    """Create (optionally) and plot accuracy versus median estimation time.

    For the selected preset a synthetic graph is generated, a compatibility
    matrix is estimated with each configured learning method (timed), labels
    are propagated with the estimate, and one CSV record per estimate is
    written. The CSV is then aggregated and drawn with time on a log x-axis
    and accuracy on the y-axis.

    Args:
        choice: Preset number (1-7) selecting graph size and method set.
        create_data: Overwrite the CSV with a fresh header, then generate data.
        add_data: Generate data and append it to the existing CSV.
        show_plot: Call ``plt.show()`` at the end.
        create_pdf: Save the figure as PDF into ``figure_directory``.
        show_pdf: Open the saved PDF via ``showfig``.
        shorten_length: Unused; kept for signature compatibility.
        show_arrows: Stored, but currently overridden before plotting (see
            the NOTE in the plotting section).

    Raises:
        Warning: If ``choice`` is not one of the known presets.
    """
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf
    SHOW_STD = True  # False draws plain scatter points instead of error bars / bands
    SHOW_ARROWS = show_arrows

    # -- Default graph parameters
    rep_SameGraph = 1  # iterations on same graph
    rep_DifferentGraphs = 1  # iterations on different graphs
    distribution = 'powerlaw'
    exponent = -0.3
    EC = False
    ymin = 0.3
    ymax = 1
    xmin = 1e-3
    xmax = 1e3
    xtick_lab = [1e-3, 0.01, 0.1, 1, 10, 100, 1000]
    xtick_labels = [r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$10^{2}$', r'$10^{3}$']
    ytick_lab = np.arange(0, 1.1, 0.1)
    k = 3
    a = 1
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = 0.99
    facecolor_vec = ["#4C72B0", "#55A868", "#8172B2", "#C44E52", "#CCB974", "#64B5CD"]
    linewidth_vec = [4, 3, 1, 2, 2, 1]
    FILEZNAME = 'Fig_timing_accuracy_learning'
    marker_vec = ['s', '^', 'v', 'o', 'x', '+', 'None']
    length_vec = [5]
    stratified = True
    f = 0.01
    numMaxIt_vec = [10] * 7
    alpha_vec = [0] * 7
    beta_vec = [0] * 7  # TODO: LinBP does not use beta. Also SSLH uses alpha, but not beta for W^row! Now fixed
    gamma_vec = [0] * 7
    s_vec = [0.5] * 7

    # -- Main options
    if CHOICE in (1, 4, 5, 6, 7):
        # Six series including the Holdout baseline and the gold standard (GS).
        n, h, d = 1000, 3, 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
        learning_method_vec = ['MHE', 'LHE', 'DHE', 'DHE', 'Holdout', 'GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'Holdout', 'GS']
        randomize_vec = [False, False, False, True, None, None]
        scaling_vec = [None, None, 10, 100, None, None]
        splits_vec = [1, 2, 4, 8]
        if CHOICE == 4:  # TODO: Overnight Wolfgang
            splits_vec = [1, 2, 4, 8, 16]
        elif CHOICE == 5:  # Toy graph with 100 nodes
            n, d, f = 100, 8, 0.05
        elif CHOICE == 6:  # To be run by Prakhar on Cluster
            n = 10000
            label_vec[3] = 'DCEr'
            f = 0.003
            xmin, xmax = 1e-2, 1e3
            ymin, ymax = 0.2, 0.9
        elif CHOICE == 7:
            splits_vec = [1, 2, 4, 8, 16]
            f = 0.009
    elif CHOICE in (2, 3):
        # Five series without the Holdout baseline.
        n, h, d = 1000, 3, 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
        learning_method_vec = ['MHE', 'LHE', 'DHE', 'DHE', 'GS']
        label_vec = ['MCE', 'LCE', 'DCE', 'DCE r', 'GS']
        randomize_vec = [False, False, False, True, None]
        scaling_vec = [None, None, 10, 100, None]
        if CHOICE == 3:
            f = 0.02
    else:
        raise Warning("Incorrect choice!")

    csv_filename = '{}_{}.csv'.format(FILEZNAME, CHOICE)
    header = ['currenttime', 'option', 'lensplit', 'f', 'accuracy', 'timetaken']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    alpha0 = np.array([a, 1., 1.])
    alpha0 = alpha0 / np.sum(alpha0)
    H0 = create_parameterized_H(k, h, symmetric=True)
    H0c = to_centering_beliefs(H0)  # result unused below; call kept unchanged

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                 distribution=distribution, exponent=exponent,
                                                 directed=False, debug=False)
            X0 = from_dictionary_beliefs(Xd)

            for j in range(rep_SameGraph):  # repeat several times for same graph
                ind = None
                X1, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W,
                                                   ind_prior=ind, stratified=stratified)
                X2 = introduce_errors(X1, ind, err)

                per_option = zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec,
                                 numMaxIt_vec, scaling_vec, randomize_vec, option_vec)
                for (learning_method, alpha, beta, gamma, s, numMaxIt,
                     weight, randomize, option) in per_option:
                    H_est_dict = {}  # estimate per split count / path length / 'GS'
                    timeTaken_dict = {}  # wall-clock seconds for the same keys

                    # -- Learning
                    if learning_method == 'Holdout':
                        for numberOfSplits in splits_vec:
                            prev_time = time.time()
                            H_est_dict[numberOfSplits] = estimateH_baseline_serial(
                                X2, ind, W, numMax=numMaxIt,
                                numberOfSplits=numberOfSplits,
                                EC=EC,
                                weights=weight,
                                alpha=alpha, beta=beta, gamma=gamma)
                            timeTaken_dict[numberOfSplits] = time.time() - prev_time

                    elif learning_method in ['LHE', 'MHE', 'DHE']:
                        # TODO: no smartInit, just randomization as option
                        for length in length_vec:
                            prev_time = time.time()
                            H_est_dict[length] = estimateH(X2, W, method=learning_method, variant=1,
                                                           randomize=randomize, distance=length,
                                                           EC=EC, weights=weight)
                            timeTaken_dict[length] = time.time() - prev_time

                    elif learning_method == 'GS':
                        H_est_dict['GS'] = H0  # gold standard: propagate with the true H

                    # -- Propagation with every estimate obtained above
                    for key in H_est_dict:
                        H_est = H_est_dict[key]
                        H2c = to_centering_beliefs(H_est)
                        X2c = to_centering_beliefs(X2, ignoreZeroRows=True)  # try without (result unused)
                        eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho',
                                                                      alpha=alpha, beta=beta,
                                                                      gamma=gamma, X=X2)
                        eps = s * eps_max

                        try:
                            F, actualIt, actualPercentageConverged = linBP_symmetric_parameterized(
                                X2, W, H2c * eps,
                                method='noecho',
                                alpha=alpha, beta=beta, gamma=gamma,
                                numMaxIt=numMaxIt,
                                convergencePercentage=convergencePercentage_W,
                                convergenceThreshold=0.99,
                                debug=2)
                        except ValueError as e:
                            print("ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))
                        else:
                            accuracy_X = matrix_difference(X0, F, ignore_rows=ind)

                            # 'record' instead of 'tuple' so the builtin is not shadowed
                            record = [str(datetime.datetime.now())]
                            if learning_method == 'Holdout':
                                text = [option, "split{}".format(key), f, accuracy_X, timeTaken_dict[key]]
                            elif learning_method in ['MHE', 'DHE', 'LHE']:
                                text = [option, "len{}".format(key), f, accuracy_X, timeTaken_dict[key]]
                            elif learning_method == 'GS':
                                text = [option, 0, f, accuracy_X, 0]
                            record.extend(text)
                            save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))

    # Aggregate repetitions
    df2 = df1.groupby(['option', 'lensplit', 'f']).agg({'accuracy': [np.mean, np.std, np.size]})
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)

    df3 = df1.groupby(['option', 'lensplit', 'f']).agg({'timetaken': [np.median]})
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy

    X_time_median_dict = {}
    Y_acc_dict = {}
    Y_std_dict = {}
    for option in option_vec:
        Y_acc_dict[option] = df2.loc[(df2['option'] == option), "accuracy_mean"].values
        Y_std_dict[option] = df2.loc[(df2['option'] == option), "accuracy_std"].values
        X_time_median_dict[option] = df3.loc[(df3['option'] == option), "timetaken_median"].values

    # -- Setup figure
    fig_filename = '{}_{}.pdf'.format(FILEZNAME, CHOICE)
    mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
    mpl.rcParams['axes.labelsize'] = 18
    mpl.rcParams['xtick.labelsize'] = 16
    mpl.rcParams['ytick.labelsize'] = 16
    mpl.rcParams['axes.titlesize'] = 16
    mpl.rcParams['legend.fontsize'] = 14
    mpl.rcParams['grid.color'] = '777777'  # grid color
    mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
    mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
    mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
    mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
    mpl.rcParams['figure.figsize'] = [4, 4]
    fig = figure()
    ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

    # NOTE(review): this unconditionally overrides the `show_arrows` argument.
    # Kept as-is because existing figures rely on it; confirm whether the
    # parameter should win instead.
    SHOW_ARROWS = True

    series = zip(option_vec, facecolor_vec, learning_method_vec, label_vec, linewidth_vec, marker_vec)
    for opt, color, learning_method, label, linewidth, marker in series:
        if learning_method == 'Holdout':
            # One point per split count, ordered by increasing median time.
            X1 = X_time_median_dict[opt]
            order = X1.argsort()
            X1 = X1[order]
            Y1 = Y_acc_dict[opt][order]
            Y2 = Y_std_dict[opt][order]

            if SHOW_STD:
                ax.fill_between(X1, Y1 + Y2, Y1 - Y2, facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X1, Y1 + Y2, linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X1, Y1 - Y2, linewidth=0.5, color='0.8', linestyle='solid')
                ax.set_ylim(bottom=ymin)

                ax.plot(X1, Y1, linewidth=linewidth, color=color, linestyle='solid', label=label,
                        zorder=20, marker='x', markersize=linewidth + 5, markeredgewidth=1)
                ax.annotate(np.round(X1[1], decimals=1), xy=(X1[1], Y1[1] - 0.05), color=color,
                            va='center', annotation_clip=False, zorder=5)
            else:
                ax.scatter(list(X1), list(Y1), color=color, label=label, marker='x', s=42)

        elif learning_method == 'GS':
            # Gold standard is drawn as a horizontal reference line.
            ax.plot([1e-4, 1e4], [Y_acc_dict[opt], Y_acc_dict[opt]],
                    linewidth=1, color='black', linestyle='dashed', zorder=0, marker=None, label=label)

        else:  # For all other
            if SHOW_STD:
                ax.errorbar(list(X_time_median_dict[opt]), list(Y_acc_dict[opt]), yerr=Y_std_dict[opt],
                            fmt='-o', linewidth=2, color=color, label=label, marker=marker, markersize=8)
                ax.annotate(np.round(X_time_median_dict[opt], decimals=2),
                            xy=(X_time_median_dict[opt], Y_acc_dict[opt] - 0.05),
                            color=color, va='center', annotation_clip=False, zorder=5)
            else:
                ax.scatter(list(X_time_median_dict[opt]), list(Y_acc_dict[opt]),
                           color=color, label=label, marker=marker, s=42)

    dce_opt = 'opt4'
    holdout_opt = 'opt5'
    # The arrow compares the third Holdout timing with the 'opt4' timing, so it
    # is only drawn when that point exists (presets without a Holdout series
    # previously failed here with an IndexError).
    if (SHOW_ARROWS and dce_opt in X_time_median_dict
            and len(X_time_median_dict.get(holdout_opt, ())) > 2):
        # Empty text is passed positionally: the keyword was renamed from `s`
        # to `text` in newer matplotlib releases.
        ax.annotate('',
                    xy=(X_time_median_dict[dce_opt], Y_acc_dict[dce_opt] - 0.3),
                    xytext=(X_time_median_dict[holdout_opt][2] + 0.02, Y_acc_dict[dce_opt] - 0.3),
                    arrowprops=dict(arrowstyle='<->'))
        ax.annotate(str(int(np.round(X_time_median_dict[holdout_opt][2] / X_time_median_dict[dce_opt]))) + 'x',
                    xy=((X_time_median_dict[dce_opt] + X_time_median_dict[holdout_opt][2]) / 100,
                        Y_acc_dict[dce_opt] - 0.28),
                    color='black', va='center',
                    annotation_clip=False, zorder=5)

    # -- Title and legend
    title(r'$\!\!\!n\!=\!{}\mathrm{{k}}, d\!=\!{}, h\!=\!{}, f\!=\!{}$'.format(int(n / 1000), d, h, f))
    handles, label_vec = ax.get_legend_handles_labels()

    # Remove error bars in legend: containers are replaced by their first
    # artist. `collections.Container` no longer exists on Python >= 3.10.
    from collections.abc import Container
    handles = [handle[0] if isinstance(handle, Container) else handle for handle in handles]

    # The original built a fontsize-12 legend and then always replaced it with
    # this one; only the final legend is created now.
    legend = plt.legend(handles, label_vec,
                        loc='upper right',
                        handlelength=2,
                        fontsize=10,
                        labelspacing=0.2,  # distance between label entries
                        handletextpad=0.3,  # distance between label and the line representation
                        borderaxespad=0.2,  # distance between legend and the outer axes
                        borderpad=0.3,  # padding inside legend box
                        numpoints=1,  # put the marker only once
                        scatterpoints=1  # display only one-scatter point in legend
                        )
    frame = legend.get_frame()
    frame.set_linewidth(0.0)
    frame.set_alpha(0.9)

    # -- Figure settings and save
    plt.xscale('log')
    plt.xticks(xtick_lab, xtick_labels)
    plt.yticks(ytick_lab, ytick_lab)
    ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    ax.set_ylim(bottom=ymin)

    # Visibility is passed positionally: the keyword was renamed from `b` to
    # `visible` in newer matplotlib releases.
    grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
    grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)

    xlim(xmin, xmax)
    ylim(ymin, ymax)

    xlabel(r'Time Median (sec)', labelpad=0)
    ylabel(r'Accuracy', labelpad=0)
    if CREATE_PDF:
        # `frameon=None` (the old default) is omitted; newer matplotlib no
        # longer accepts that keyword.
        savefig(join(figure_directory, fig_filename), format='pdf',
                dpi=None,
                edgecolor='w',
                orientation='portrait',
                transparent=False,
                bbox_inches='tight',
                pad_inches=0.05)
    if SHOW_PDF:
        showfig(join(figure_directory, fig_filename))
    if SHOW_PLOT:
        plt.show()
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False,
        shorten_length=False):
    """Create (optionally) and plot estimation time versus number of classes k.

    For the selected preset, synthetic graphs are generated for every ``k`` in
    the preset's ``k_vec``, each configured learning method is timed, and one
    CSV record per timing is written. The CSV is then aggregated and the mean
    time per method is drawn against ``k`` on a log y-axis.

    Args:
        choice: Preset number (600-608).
        create_data: Overwrite the CSV with a fresh header, then generate data.
        add_data: Generate data and append it to the existing CSV.
        show_plot: Call ``plt.show()`` at the end.
        create_pdf: Save the figure as PDF into ``figure_directory``.
        show_pdf: Open the saved PDF via ``showfig``.
        shorten_length: Unused; kept for signature compatibility.

    Raises:
        Warning: If ``choice`` is not one of the known presets.
    """
    # -- Setup
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf
    SHOW_ARROWS = False
    STD_FILL = False

    csv_filename = 'Fig_timing_VaryK_{}.csv'.format(CHOICE)
    header = ['currenttime', 'option', 'k', 'f', 'time']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    # -- Default graph parameters
    rep_SameGraph = 2  # iterations on same graph
    rep_DifferentGraphs = 1000  # iterations on different graphs
    distribution = 'powerlaw'
    exponent = -0.3
    length = 5
    EC = True  # Non-backtracking for learning
    ymin = 0.0
    ymax = 1
    xmin = 2
    xmax = 7.5
    xtick_lab = [2, 3, 4, 5, 6, 7, 8]
    xtick_labels = ['2', '3', '4', '5', '6', '7', '8']
    ytick_lab = [1e-3, 1e-2, 1e-1, 1, 10, 50]
    ytick_labels = [r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$50$']
    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    k_vec = [3, 4, 5]
    err = 0
    avoidNeighbors = False
    gradient = False
    # Was only assigned by presets 606-608, so presets that enable `gradient`
    # without it (604, 605) failed with UnboundLocalError during data creation.
    pruneRandom = False
    stratified = True
    label_vec = ['*'] * 10
    clip_on_vec = [True] * 15
    numberOfSplits = 1
    linestyle_vec = ['solid'] * 15
    linewidth_vec = [3, 2, 4, 2, 3, 2] + [3] * 15
    marker_vec = ['^', 's', 'o', 'x', 'o', '+', 's'] * 3
    markersize_vec = [8, 7, 8, 10, 7, 6] + [10] * 10
    facecolor_vec = ["#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#64B5CD"]
    legend_location = 'upper right'

    # Per-option parameters. Every preset used the same values (only the list
    # lengths differed, all at least the number of options), so they are
    # defined once; zip() below stops at the number of learning methods.
    weight_vec = [10] * 20
    alpha_vec = [0] * 20
    beta_vec = [0] * 20
    gamma_vec = [0] * 20
    s_vec = [0.5] * 20
    numMaxIt_vec = [10] * 20

    # Building blocks shared by several presets (never mutated).
    greens_first = ["#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974"]
    yellow_first = ["#CCB974", "#55A868", "#4C72B0", "#8172B2", "#C44E52"]
    yticks_500 = [1e-3, 1e-2, 1e-1, 1, 10, 100, 500]
    ylabels_500 = [r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$100$', r'$500$']
    holdout_first_options = ['opt5', 'opt6', 'opt2', 'opt3', 'opt4']
    holdout_first_methods = ['Holdout', 'LHE', 'MHE', 'DHE', 'DHE']
    holdout_last_options = ['opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    holdout_last_methods = ['LHE', 'MHE', 'DHE', 'DHE', 'Holdout']

    # -- Options with propagation variants
    if CHOICE == 600:  # 1k nodes
        n, h, d = 1000, 8, 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['GT', 'MHE', 'DHE', 'Holdout']
        randomize_vec = [False] * 4 + [True]
        xmin, xmax = 3., 10.
        ymin, ymax = 0., 50.
        label_vec = ['GT', 'MCE', 'DCE', 'Holdout']
        facecolor_vec = ['black'] + greens_first * 4
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5, 6]
        ytick_lab = [0, 1e-3, 1e-2, 1e-1, 1, 10, 50]
        ytick_labels = [r'$0$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$50$']

    elif CHOICE == 601:  # 10k nodes
        n, h, d = 10000, 8, 25
        option_vec = ['opt1', 'opt2', 'opt3', 'opt4']
        learning_method_vec = ['GT', 'MHE', 'DHE', 'Holdout']
        randomize_vec = [False] * 15 + [True]
        xmin, xmax = 3., 8.
        ymin, ymax = 0., 500.
        label_vec = ['GT', 'MCE', 'DCE', 'Holdout']
        facecolor_vec = ['black'] + greens_first * 4
        f_vec = [0.03, 0.01, 0.001]
        k_vec = [3, 4, 5]
        ytick_lab = [0, 1e-3, 1e-2, 1e-1, 1, 10, 100, 300]
        ytick_labels = [r'$0$', r'$10^{-3}$', r'$10^{-2}$', r'$10^{-1}$', r'$1$', r'$10$', r'$100$', r'$300$']

    elif CHOICE == 602:  # 10k nodes
        n, h, d = 10000, 8, 25
        randomize_vec = [False] * 3 + [True] + [False]
        ymin, ymax = 0.01, 500
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DHEr']
        facecolor_vec = greens_first * 4
        f_vec = [0.01]
        ytick_lab, ytick_labels = yticks_500, ylabels_500
        option_vec = holdout_first_options
        learning_method_vec = holdout_first_methods
        k_vec = [2, 3, 4, 5, 6, 7, 8]

    elif CHOICE == 603:  # 10k nodes
        n, h, d = 10000, 3, 25
        randomize_vec = [False] * 4 + [True]
        xmin, xmax = 1.8, 8.2
        ymin, ymax = 0.01, 500
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = yellow_first * 4
        f_vec = [0.01]
        ytick_lab, ytick_labels = yticks_500, ylabels_500
        option_vec = holdout_first_options
        learning_method_vec = holdout_first_methods
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        legend_location = 'upper right'

    elif CHOICE == 604:  # 10k nodes with Gradient
        n, h, d = 10000, 3, 25
        randomize_vec = [False] * 4 + [True]
        ymin, ymax = 0.00, 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = yellow_first * 4
        f_vec = [0.01]
        ytick_lab, ytick_labels = yticks_500, ylabels_500
        option_vec = holdout_first_options
        learning_method_vec = holdout_first_methods
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        gradient = True
        legend_location = 'center right'

    elif CHOICE == 605:  # 10k nodes with Gradient with f = 0.005
        n, h, d = 10000, 3, 25
        randomize_vec = [False] * 4 + [True]
        ymin, ymax = 0.00, 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = yellow_first * 4
        f_vec = [0.005]
        ytick_lab, ytick_labels = yticks_500, ylabels_500
        option_vec = holdout_first_options
        learning_method_vec = holdout_first_methods
        k_vec = [2, 3, 4, 5, 6, 7]
        gradient = True
        legend_location = 'center right'

    elif CHOICE == 606:  # 10k nodes with f = 0.005, Gradient and PruneRandom
        n, h, d = 10000, 3, 25
        randomize_vec = [False] * 4 + [True]
        xmin, xmax = 1.8, 7.2
        ymin, ymax = 0.01, 800
        label_vec = ['Holdout', 'LCE', 'MCE', 'DCE', 'DCEr']
        facecolor_vec = yellow_first * 4
        f_vec = [0.005]
        ytick_lab, ytick_labels = yticks_500, ylabels_500
        option_vec = holdout_first_options
        learning_method_vec = holdout_first_methods
        k_vec = [2, 3, 4, 5, 6, 7]
        gradient = True
        pruneRandom = True
        legend_location = 'upper right'

    elif CHOICE in (607, 608):  # 10k nodes with gradient and PruneRandom
        n, h, d = 10000, 3, 25
        option_vec = holdout_last_options
        learning_method_vec = holdout_last_methods
        randomize_vec = [False] * 3 + [True] + [False]
        xmin = 1.8
        xmax = 7. if CHOICE == 607 else 7.2
        ymin, ymax = 0.01, 800
        label_vec = ['LCE', 'MCE', 'DCE', 'DCEr', 'Holdout']
        facecolor_vec = greens_first * 4
        legend_location = 'upper left'
        marker_vec = [None, 's', 'x', 'o', '^', '+'] * 3
        markersize_vec = [8, 7, 10, 8, 7, 6] + [10] * 10
        f_vec = [0.01]
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        clip_on_vec = [True] * 10
        gradient = True
        pruneRandom = True
        ytick_lab, ytick_labels = yticks_500, ylabels_500
        if CHOICE == 608:
            rep_DifferentGraphs = 10

    else:
        raise Warning("Incorrect choice!")

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for i in range(rep_DifferentGraphs):  # create several graphs with same parameters
            for k in k_vec:
                H0 = create_parameterized_H(k, h, symmetric=True)
                H0c = to_centering_beliefs(H0)  # result unused below; call kept unchanged

                # Uniform class prior over the k classes.
                alpha0 = np.array([1.] * k)
                alpha0 = alpha0 / np.sum(alpha0)

                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d,
                                                     distribution=distribution, exponent=exponent,
                                                     directed=False, debug=False)
                X0 = from_dictionary_beliefs(Xd)

                for j in range(rep_SameGraph):  # repeat several times for same graph
                    ind = None
                    for f in f_vec:
                        # Remove fraction (1-f) of rows from X0; `ind` is fed
                        # back as `ind_prior` for the next f.
                        X1, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors,
                                                           W=W, ind_prior=ind, stratified=stratified)
                        X2 = introduce_errors(X1, ind, err)

                        per_option = zip(learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec,
                                         numMaxIt_vec, weight_vec, randomize_vec)
                        for option_index, (learning_method, alpha, beta, gamma, s, numMaxIt,
                                           weights, randomize) in enumerate(per_option):

                            # -- Learning (only the elapsed time is recorded)
                            if learning_method == 'GT':
                                timeTaken = 0.0

                            elif learning_method == 'Holdout':
                                prev_time = time.time()
                                H2 = estimateH_baseline_serial(X2, ind, W, numMax=numMaxIt,
                                                               numberOfSplits=numberOfSplits,
                                                               EC=EC,
                                                               alpha=alpha, beta=beta, gamma=gamma)
                                timeTaken = time.time() - prev_time

                            else:
                                prev_time = time.time()
                                # NOTE(review): `gradient` is forwarded only
                                # when `pruneRandom` is set as well, so presets
                                # 604/605 take the plain path - confirm intent.
                                if gradient and pruneRandom:
                                    H2 = estimateH(X2, W, method=learning_method, variant=1,
                                                   distance=length, EC=EC, weights=weights,
                                                   randomize=randomize, gradient=gradient)
                                else:
                                    H2 = estimateH(X2, W, method=learning_method, variant=1,
                                                   distance=length, EC=EC, weights=weights,
                                                   randomize=randomize)
                                timeTaken = time.time() - prev_time

                            # 'record' instead of 'tuple' so the builtin is not shadowed
                            record = [str(datetime.datetime.now())]
                            record.extend([option_vec[option_index], k, f, timeTaken])
                            save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))

    # -- Aggregate repetitions
    df2 = df1.groupby(['option', 'k', 'f']).agg({'time': [np.mean, np.std, np.size, np.median]})
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'time_size': 'count'}, inplace=True)

    # -- Pivot table
    df3 = pd.pivot_table(df2, index=['f', 'k'], columns=['option'],
                         values=['time_mean', 'time_std', 'time_median'])
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy

    # One x/y series per (f, option). The k values are read per f so that x
    # and y always have the same length, also when the CSV holds several f.
    X_hash = {}
    Y_hash = defaultdict(dict)
    Y_hash_std = defaultdict(dict)
    for f in f_vec:
        rows_f = df3.loc[df3['f'] == f]
        X_hash[f] = rows_f['k'].values  # read k from values instead of k_vec
        for option in option_vec:
            Y_hash[f][option] = rows_f['time_mean_{}'.format(option)].values  # mean
            Y_hash_std[f][option] = rows_f['time_std_{}'.format(option)].values

    if SHOW_PLOT or SHOW_PDF or CREATE_PDF:

        # -- Setup figure
        fig_filename = 'Fig_Time_varyK_{}.pdf'.format(CHOICE)
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        opt_f_vecs = [(option, f) for option in option_vec for f in f_vec]

        # `f` is deliberately the loop variable: the title and the arrow
        # annotations below use the f of the last plotted series.
        for ((option, f), color, linewidth, clip_on, linestyle, marker, markersize) in \
                zip(opt_f_vecs, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec,
                    marker_vec, markersize_vec):

            label = label_vec[option_vec.index(option)]
            X_f = X_hash[f]
            Y_mean = Y_hash[f][option]

            if STD_FILL:
                Y_std = Y_hash_std[f][option]
                ax.fill_between(X_f, Y_mean + Y_std, Y_mean - Y_std,
                                facecolor=color, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y_mean + Y_std, linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y_mean - Y_std, linewidth=0.5, color='0.8', linestyle='solid')

            ax.plot(X_f, Y_mean, linewidth=linewidth, color=color, linestyle=linestyle,
                    label=label, zorder=4, marker=marker, markersize=markersize,
                    markeredgecolor='black', markeredgewidth=1, clip_on=clip_on)

        if SHOW_ARROWS:
            X_f = X_hash[f]
            for indx in [2, 3]:
                # Empty text is passed positionally: the keyword was renamed
                # from `s` to `text` in newer matplotlib releases.
                ax.annotate('',
                            xy=(X_f[indx] - 0.05, Y_hash[f]['opt4'][indx]),
                            xytext=(X_f[indx] - 0.05, Y_hash[f]['opt5'][indx]),
                            arrowprops=dict(facecolor='blue', arrowstyle='<->'))
                ax.annotate(str(int(np.round(Y_hash[f]['opt5'][indx] / Y_hash[f]['opt4'][indx]))) + 'x',
                            xy=(X_f[indx] - 0.4,
                                (Y_hash[f]['opt5'][indx] + Y_hash[f]['opt4'][indx]) / 10),
                            color='black', va='center',
                            annotation_clip=False, zorder=5)

        # -- Title and legend
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        else:
            distribution_label = '$'
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{}{}'.format(n_label, d, h, f, distribution_label))
        handles, label_vec = ax.get_legend_handles_labels()
        legend = plt.legend(handles, label_vec,
                            loc=legend_location,
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            )
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)

        # -- Figure settings and save
        plt.yscale('log')
        plt.xticks(xtick_lab, xtick_labels)
        # NOTE(review): `ytick_labels` is defined by every preset but the tick
        # positions are used as labels here; kept unchanged - confirm intent.
        plt.yticks(ytick_lab, ytick_lab)

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)

        # Visibility is passed positionally: the keyword was renamed from `b`
        # to `visible` in newer matplotlib releases.
        grid(True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
        grid(True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)
        xlabel(r'Number of Classes $(k)$', labelpad=0)
        ylabel(r'Time [sec]', labelpad=0)

        if CREATE_PDF:
            # `frameon=None` (the old default) is omitted; newer matplotlib no
            # longer accepts that keyword.
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05)

        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
def test_planted_distribution_model():
    """Exercise the planted-distribution graph generators and report on the result.

    A hard-coded ``CHOICE`` selects one parameter set.  Depending on
    ``VERSION_P`` the graph is created with ``planted_distribution_model``
    (edge-count matrix ``P`` and ``m`` edges) or with
    ``planted_distribution_model_H`` (compatibility matrix ``H0`` and
    out-degree ``d_vec``).  The function then prints statistics computed from
    the generated graph (class-to-class edge counts, class sizes, average
    degrees, degree distributions, number of weakly connected components and
    the LinBP convergence thresholds) and writes two PDF figures under
    ``figs/``: the ranked degree distributions and the class-blocked
    adjacency matrix.  Each PDF is opened afterwards with the ``open``
    shell command.

    Returns:
        None.  All results are printed or written to disk.
    """
    print("\n--- 'planted_distribution_model_H', 'planted_distribution_model_P', 'number_of_connectedComponents', 'create_blocked_matrix_from_graph' --")
    CHOICE = 21
    print("CHOICE:", CHOICE)
    debug = 0

    # Explicit default so that the directed choices (1-5, 12-14), which never
    # assign `directed`, do not fail with a NameError at the generator call.
    # !!! TODO: not yet clear what undirected means here, only P accepts directed
    directed = True
    backEdgesAllowed = True             # ??? should be enforced in code
    sameInAsOutDegreeRanking = False
    distribution = 'powerlaw'
    exponent = -0.3
    VERSION_P = True

    def degree_table(degree_counts):
        """Return a 2 x N array: row 0 holds the degrees, row 1 the number of nodes."""
        # list(...) is required: in Python 3 keys()/values() are views and
        # np.array would wrap them in an object array instead of a table.
        return np.array([list(degree_counts.keys()), list(degree_counts.values())])

    def ranked_degrees(degree_counts):
        """Expand a {degree: count} mapping into one degree per node, sorted descending."""
        degrees = np.repeat(list(degree_counts.keys()), list(degree_counts.values()))
        return -np.sort(-degrees)

    # --- AAAI figures ---
    if CHOICE in [1, 2, 3, 4, 5, 6]:
        n = 120
        alpha0 = [1/6, 1/3, 1/2]
        h = 8
        P = np.array([[1, h, 1],
                      [1, 1, h],
                      [h, 1, 1]])

        if CHOICE == 1:                     # P (equivalent to 2), AAAI 2
            m = 1080

        elif CHOICE == 2:                   # H (equivalent to 1)
            H0 = row_normalize_matrix(P)
            d_vec = [18, 9, 6]
            VERSION_P = False

        elif CHOICE == 3:                   # H (equivalent to 4), AAAI 3
            H0 = row_normalize_matrix(P)
            d_vec = 9
            VERSION_P = False

        elif CHOICE == 4:                   # P (equivalent to 3)
            P = np.array([[1, h, 1],
                          [2, 2, 2*h],
                          [3*h, 3, 3]])
            m = 1080

        elif CHOICE == 5:                   # H (equivalent to 2), but backedges=False
            H0 = row_normalize_matrix(P)
            d_vec = [18, 9, 6]
            VERSION_P = False
            backEdgesAllowed = False

        elif CHOICE == 6:                   # P undirected, AAAI 4
            P = np.array([[1, h, 1],
                          [h, 1, 1],
                          [1, 1, h]])
            directed = False
            backEdgesAllowed = False
            m = 540

    # --- AGAIN DIRECTED ---
    if CHOICE == 12:
        n = 1001
        alpha0 = [0.6, 0.2, 0.2]
        P = np.array([[0.1, 0.8, 0.1],
                      [0.8, 0.1, 0.1],
                      [0.1, 0.1, 0.8]])
        m = 3000
        distribution = 'uniform'        # uniform powerlaw
        exponent = None
        backEdgesAllowed = False        # ??? should be enforced in code

    if CHOICE == 13:                    # Nice for block matrix visualization
        n = 1000
        alpha0 = [0.334, 0.333, 0.333]
        h = 2
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        m = 2000
        distribution = 'uniform'        # uniform powerlaw
        exponent = None
        backEdgesAllowed = False        # ??? should be enforced in code

    if CHOICE == 14:
        n = 1000
        alpha0 = [0.3334, 0.3333, 0.3333]
        h = 10
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        m = 10000
        exponent = -0.55

    # --- UNDIRECTED ---
    if CHOICE == 20:
        n = 100
        alpha0 = [0.6, 0.2, 0.2]
        h = 1.4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        H0 = row_normalize_matrix(P)
        d_vec = 5
        directed = False
        exponent = -0.3
        VERSION_P = False

    elif CHOICE == 21:
        n = 1001
        alpha0 = [0.6, 0.2, 0.2]
        h = 4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        H0 = row_normalize_matrix(P)
        d_vec = 3.4                     # don't specify vector for undirected
        distribution = 'uniform'        # uniform powerlaw
        exponent = -0.5
        directed = False
        backEdgesAllowed = True             # ignored in code for undirected
        VERSION_P = False
        sameInAsOutDegreeRanking = True     # ignored in code for undirected

    elif CHOICE == 22:
        n = 1000
        m = 3000
        alpha0 = [0.6, 0.2, 0.2]
        h = 4
        P = np.array([[1, 3*h, 1],
                      [2*h, 1, 1],
                      [1, 1, h]])
        distribution = 'uniform'        # uniform powerlaw
        exponent = -0.5
        directed = False
        backEdgesAllowed = False            # ignored in code for undirected
        sameInAsOutDegreeRanking = True     # ignored in code for undirected
        debug = 0
        VERSION_P = True
        H0 = row_normalize_matrix(P)

    # --- Create the graph
    start = time.time()
    if VERSION_P:
        W, Xd = planted_distribution_model(n, alpha=alpha0, P=P, m=m,
                                           distribution=distribution, exponent=exponent,
                                           directed=directed,
                                           backEdgesAllowed=backEdgesAllowed,
                                           sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
                                           debug=debug)
    else:
        W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d_vec,
                                             distribution=distribution, exponent=exponent,
                                             directed=directed,
                                             backEdgesAllowed=backEdgesAllowed,
                                             sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
                                             debug=debug)
    time_est = time.time() - start
    print("Time for graph generation: {}".format(time_est))

    # - Undirected degrees: In + Out
    # Element-wise product of W with its transpose is non-zero only where an
    # edge exists in both directions (possible if backEdgesAllowed).
    W_und = W.multiply(W.transpose())
    # W_und.data[:] = np.sign(W_und.data)  # W contains weighted edges -> unweighted before counting edges with Ptot
    print("Fraction of edges that go in both directions: {}".format(np.sum(W_und.data) / np.sum(W.data)))

    # --- Statistics on created graph
    print("\n- 'calculate_Ptot_from_graph':")
    P_tot = calculate_Ptot_from_graph(W, Xd)
    print("P_tot:\n{}".format(P_tot))
    print("sum(P_tot): {}".format(np.sum(P_tot)))
    print("P (normalized to sum=1):\n{}".format(1. * P_tot / np.sum(P_tot)))    # Potential: normalized sum = 1
    H = row_normalize_matrix(P_tot)
    print("H (row-normalized):\n{}".format(H))

    print("\n- 'calculate_nVec_from_Xd':")
    n_vec = calculate_nVec_from_Xd(Xd)
    print("n_vec: {}".format(n_vec))
    print("alpha: {}".format(1.*n_vec / sum(n_vec)))

    print("\n- Average Out/Indegree 'calculate_average_outdegree_from_graph' (assumes directed for total; for undirected the totals are incorrect):")
    print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W)))
    print("Average indegree: {}".format(calculate_average_outdegree_from_graph(W.transpose())))
    print("Average total degree: {}".format(calculate_average_outdegree_from_graph(W + W.transpose())))
    print("Average outdegree per class: {}".format(calculate_average_outdegree_from_graph(W, Xd)))
    print("Average indegree per class: {}".format(calculate_average_outdegree_from_graph(W.transpose(), Xd)))
    print("Average total degree per class: {}".format(calculate_average_outdegree_from_graph(W + W.transpose(), Xd)))

    # - Overall degree distribution: In / out
    print("\n- Overall Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':")
    print("Overall Out and Indegree distribution:")
    d_out_vec_tot = calculate_outdegree_distribution_from_graph(W, Xd=None)
    d_in_vec_tot = calculate_outdegree_distribution_from_graph(W.transpose(), Xd=None)
    print("Outdegree distribution (degree / number):\n{}".format(degree_table(d_out_vec_tot)))
    print("Indegree distribution (degree / number):\n{}".format(degree_table(d_in_vec_tot)))

    # - Overall degree distribution: In + Out
    d_tot_vec_tot = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd=None)
    print("Total degree distribution (degree / number):\n{}".format(degree_table(d_tot_vec_tot)))

    # - Per-class degree distribution: In / out
    print("\n- Per-class Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':")
    print("\nOutdegree distribution per class:")
    d_out_vec = calculate_outdegree_distribution_from_graph(W, Xd)
    for i, class_distribution in enumerate(d_out_vec):
        print("Class {}:".format(i))
        print(degree_table(class_distribution))

    print("Indegree distribution per class:")
    d_in_vec = calculate_outdegree_distribution_from_graph(W.transpose(), Xd)
    for i, class_distribution in enumerate(d_in_vec):
        print("Class {}:".format(i))
        print(degree_table(class_distribution))

    # - per-class degree distribution: In + out
    print("\nTotal degree distribution per class:")
    d_vec_und = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd)
    for i, class_distribution in enumerate(d_vec_und):
        print("Class {}:".format(i))
        print(degree_table(class_distribution))

    print("\n- number of weakly connected components':")
    print("Number of weakly connected components: {}".format(
        connected_components(W, directed=True, connection='weak', return_labels=False)))

    # --- convergence boundary
    # print("\n- '_out_eps_convergence_directed_linbp', 'eps_convergence_linbp'")
    # if directed:
    #     eps_noEcho = _out_eps_convergence_directed_linbp(P, W, echo=False)
    #     eps_Echo = _out_eps_convergence_directed_linbp(P, W, echo=True)
    # else:
    Hc = to_centering_beliefs(H)
    eps_noEcho = eps_convergence_linbp(Hc, W, echo=False)
    eps_Echo = eps_convergence_linbp(Hc, W, echo=True)
    print("Eps (w/ echo): {}".format(eps_Echo))
    print("Eps (no echo): {}".format(eps_noEcho))

    # --- Fig1: Draw edge distributions
    print("\n- Fig1: Draw degree distributions")
    params = {'backend': 'pdf',
              'lines.linewidth': 4,
              'font.size': 10,
              'axes.labelsize': 24,  # fontsize for x and y labels (was 10)
              'axes.titlesize': 22,
              'xtick.labelsize': 20,
              'ytick.labelsize': 20,
              'legend.fontsize': 8,
              'figure.figsize': [5, 4],
              'font.family': 'sans-serif'
              }
    mpl.rcdefaults()
    mpl.rcParams.update(params)
    fig = plt.figure(1)
    ax = fig.add_axes([0.15, 0.15, 0.8, 0.8])  # main axes
    ax.xaxis.labelpad = -12
    ax.yaxis.labelpad = -12

    # Colour / line style for the first three classes, labelled A, B, C.
    class_styles = [('A', 'orange', '-'), ('B', 'blue', '--'), ('C', 'green', ':')]

    # A: Draw directed degree distribution
    y_vec = [ranked_degrees(class_distribution) for class_distribution in d_out_vec]
    y_tot = ranked_degrees(d_out_vec_tot)  # total outdegree
    # x starts at rank 1: a log axis cannot show the default index 0
    for (class_name, color, linestyle), y in zip(class_styles, y_vec):
        plt.loglog(range(1, len(y) + 1), y, lw=4, color=color,
                   label="{} out".format(class_name), linestyle=linestyle)
    plt.loglog(range(1, len(y_tot) + 1), y_tot, lw=1, color='black', label=r"tot out", linestyle='-')

    # B: Draw second edge distribution of undirected degree distribution
    y_vec = [ranked_degrees(class_distribution) for class_distribution in d_vec_und]
    y_tot = ranked_degrees(d_tot_vec_tot)  # total degree (in + out)
    for (class_name, color, linestyle), y in zip(class_styles, y_vec):
        plt.loglog(range(1, len(y) + 1), y, lw=4, color=color,
                   label=class_name, linestyle=linestyle)
    plt.loglog(range(1, len(y_tot) + 1), y_tot, lw=1, color='black', label=r"tot", linestyle='-')

    plt.legend(loc='upper right', labelspacing=0)
    filename = 'figs/Fig_test_planted_distribution_model1_{}.pdf'.format(CHOICE)
    # `papertype` (PostScript-only) and `frameon` (deprecated) are deliberately
    # not passed: they have no effect on PDF output and newer Matplotlib
    # versions reject them.
    plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=True, bbox_inches='tight', pad_inches=0.1)
    os.system("open " + filename)  # relies on an `open` command being available in the shell

    # --- Fig2: Draw block matrix
    print("\n- Fig2: 'create_blocked_matrix_from_graph'")
    W_new, Xd_new = create_blocked_matrix_from_graph(W, Xd)

    fig = plt.figure(2)
    row, col = W_new.nonzero()  # transform the sparse W back to row col format
    # Notice (col, row) because first axis is vertical in matrices
    plt.plot(col, row, 'o', color='r', markersize=2, markeredgewidth=2, lw=0, zorder=3)
    # plt.matshow(W_new.todense(), cmap=plt.cm.Greys)  # cmap=plt.cm.gray / Blues   # alternative that does not work as well
    plt.gca().invert_yaxis()  # invert the y-axis to start on top and go down

    # Show quadrants: tick positions at the cumulative class boundaries
    d1 = alpha0[0] * n
    d2 = (alpha0[0] + alpha0[1]) * n
    plt.grid(which='major', color='0.7', linestyle='-', linewidth=1)
    plt.xticks([0, d1, d2, n])
    plt.yticks([0, d1, d2, n])
    plt.xlabel('to', labelpad=-1)
    plt.ylabel('from', rotation=90, labelpad=0)

    frame = plt.gca()
    # frame.axes.xaxis.set_ticklabels([])  # would hide the labels
    # frame.axes.yaxis.set_ticklabels([])
    frame.tick_params(direction='inout', width=1, length=10)

    filename = 'figs/Fig_test_planted_distribution_model2_{}.pdf'.format(CHOICE)
    plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=True, bbox_inches='tight', pad_inches=0.1)
    os.system("open " + filename)
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False):
    """Run one end-to-end accuracy experiment on a real data set and plot it.

    ``choice`` selects a preset (see ``choose`` below) that fixes the data set
    (``FILENAMEZ``), the list of compared methods and the plot styling.  When
    data creation is requested, the graph is loaded from
    ``<FILENAMEZ>-classes.csv`` / ``<FILENAMEZ>-neighbors.csv`` in
    ``realDataDir``; for every label fraction ``f`` in ``f_vec`` (repeated
    ``rep_SameGraph`` times) each method estimates a compatibility matrix,
    labels are propagated with LinBP, and one record per method is appended to
    the experiment CSV.  Finally the CSV is read back and handed to
    ``sslhv.plot``.

    Args:
        choice: numeric id of the experiment preset
            (300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron, ...).
        create_data: if true, overwrite the CSV with a fresh header and create new records.
        add_data: if true, append new records to the existing CSV.
        show_plot: forwarded to ``sslhv.plot`` as ``show_plot``.
        create_pdf: forwarded to ``sslhv.plot`` as ``save``.
        show_pdf: forwarded to ``sslhv.plot`` as ``show``.

    Raises:
        Warning: if ``choice`` is not a known preset.
    """
    # -- Setup
    CHOICE = choice
    # 300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf

    SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF
    STD_FILL = True
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # NOTE: every name listed in the `nonlocal` statements of choose() must
    # stay bound somewhere in this function, even if it looks unused here.

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph
    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1

    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.001, 0.01, 0.1, 1]
    # Raw strings: '\%' is not a valid escape sequence in a normal literal.
    xtick_labels = [r'0.1\%', r'1\%', r'10\%', r'100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']

    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None

    raw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        """Overwrite the defaults of run() with the preset for `choice`.

        Presets may build on each other by calling choose() recursively before
        applying their own overrides, so the order of the calls matters.
        """
        # -- Default Graph parameters
        nonlocal n, d
        nonlocal rep_SameGraph, FILENAMEZ, initial_h0, exponent, length, variant
        nonlocal alpha_vec, beta_vec, gamma_vec, s_vec, clip_on_vec, numMaxIt_vec
        # Plotting Parameters
        nonlocal xtick_lab, xtick_labels, ytick_lab, xmax, xmin, ymin, ymax
        nonlocal labels, facecolor_vec, draw_std_vec, linestyle_vec, linewidth_vec
        nonlocal marker_vec, markersize_vec, legend_location, fig_label
        nonlocal option_vec, learning_method_vec
        nonlocal Macro_Accuracy, EC, constraints, weight_vec, randomize_vec
        nonlocal k, err, avoidNeighbors, convergencePercentage_W, stratified
        nonlocal gradient, doubly_stochastic, num_restarts, numberOfSplits
        nonlocal H_heuristic, select_lambda_vec, lambda_vec, f_vec

        if choice == 0:
            pass  # keep all defaults

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]

        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)
            select_lambda_vec = [False] * 6

        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476],
                                    [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'
            n = 26850
            d = 25.0832029795

        elif choice == 402:
            choose(401)
            select_lambda_vec = [False] * 3 + [True] * 3  # allow to choose lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')

        elif choice == 404:
            choose(401)
            select_lambda_vec = [True] * 3  # allow to choose lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  #TODO

        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']

        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476],
                                    [.476, .0476, .476],
                                    [.476, .476, .0476]])

        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec

        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # DO NOT RUN WITH CREATE_DATA=True, if you do please restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2", "#C44E52",
                "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'
            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            ymin = 0.3
            ymax = 0.7
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)
            select_lambda_vec = [False] * 4 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)
            select_lambda_vec = [False] * 3 + [True] * 2  # allow to choose lambda for different f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            choose(603)
            lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
                "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10
            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]
            # One 'DCEr <weight>' label for each of the first six weights.
            labels = ['GT'] + [
                i + ' {}'.format(weight_vec[ix])
                for ix, i in enumerate(['DCEr'] * 6)
            ]

        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476],
                                    [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)
            select_lambda_vec = [False] * 4 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)
            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(605)
            choose(801)
            #learning_method_vec += ['Holdout']
            #labels += ['Holdout']

        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])

        # MASC Dataset
        elif choice == 901:
            FILENAMEZ = 'masc'
            Macro_Accuracy = False
            fig_label = 'MASC'
            legend_location = 'lower right'
            n = 0
            d = 0
            ymin = 0
            num_restarts = 100
            select_lambda_vec = [False] * 4 + [True]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        # MASC collapsed Dataset
        elif choice == 1001:
            FILENAMEZ = 'masc-collapsed'
            fig_label = 'MASC Collapsed'
            legend_location = 'lower right'
            n = 43724
            d = 7.2
            ymin = 0
            num_restarts = 20
            select_lambda_vec = [False] * 4 + [True]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 1002:
            choose(1001)
            Macro_Accuracy = True

        # MASC Reduced dataset
        elif choice == 1101:
            FILENAMEZ = 'masc-reduced'
            fig_label = 'MASC Reduced'
            legend_location = 'lower right'
            n = 31000
            d = 8.3
            ymin = 0
            select_lambda_vec = [False] * 4 + [True]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        else:
            raise Warning("Incorrect choice!")

    def _f_worker_(X0, W, f, f_index):
        """Evaluate every configured method for one label fraction `f`.

        Hides a fraction ``1 - f`` of the rows of ``X0``, estimates (or looks
        up) a compatibility matrix per method, propagates with LinBP and
        appends one CSV record per method.  ``f_index`` is the position of
        ``f`` in ``f_vec`` and selects the entry of ``lambda_vec`` when a
        method has ``select_lambda`` enabled.  Returns the string 'success'.
        """
        RANDOMSEED = None  # For repeatability
        random.seed(RANDOMSEED)  # seeds some other python random generator
        # seeds the actually used numpy random generator; both are used and thus needed
        np.random.seed(seed=RANDOMSEED)

        X1, ind = replace_fraction_of_rows(X0, 1 - f,
                                           avoidNeighbors=avoidNeighbors,
                                           W=W,
                                           stratified=stratified)
        X2 = introduce_errors(X1, ind, err)

        # zip() stops at the shortest vector, so the number of evaluated
        # options is bounded by the shortest of these per-option lists.
        for option_index, (label, select_lambda, learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \
                enumerate(zip(labels, select_lambda_vec, learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)):
            learn_time = -1  # stays -1 for methods without a timed learning step

            # -- Learning
            if learning_method == 'GT':
                H2c = H0c
            elif learning_method == 'Heuristic':
                # print('Heuristic')
                H2c = H_heuristic
            elif learning_method == 'Holdout':
                # print('Holdout')
                H2 = estimateH_baseline_serial(
                    X2, ind, W,
                    numMax=numMaxIt,
                    # ignore_rows=ind,
                    numberOfSplits=numberOfSplits,
                    # method=learning_method, variant=1,
                    # distance=length,
                    EC=EC,
                    alpha=alpha,
                    beta=beta,
                    gamma=gamma,
                    doubly_stochastic=doubly_stochastic)
                H2c = to_centering_beliefs(H2)
            else:
                if "DCEr" in learning_method:
                    learning_method = "DCEr"
                elif "DCE" in learning_method:
                    learning_method = "DCE"

                # -- choose optimal lambda: allows to specify different lambda for different f
                # print("option: ", option_index)
                if select_lambda:
                    weight = lambda_vec[f_index]
                    # print("weight : ", weight)
                else:
                    weight = weights

                # -- learn H
                learn_start = time.time()
                H2 = estimateH(X2, W,
                               method=learning_method,
                               variant=1,
                               distance=length,
                               EC=EC,
                               weights=weight,
                               randomrestarts=num_restarts,
                               randomize=randomize,
                               constraints=constraints,
                               gradient=gradient,
                               doubly_stochastic=doubly_stochastic)
                learn_time = time.time() - learn_start
                H2c = to_centering_beliefs(H2)

            # if learning_method not in ['GT', 'GS']:
            #     print(FILENAMEZ, f, learning_method)
            #     print(H2c)

            # -- Propagation
            prop_start = time.time()
            # X2c = to_centering_beliefs(X2, ignoreZeroRows=True)       # try without
            eps_max = eps_convergence_linbp_parameterized(H2c, W,
                                                          method='noecho',
                                                          alpha=alpha,
                                                          beta=beta,
                                                          gamma=gamma,
                                                          X=X2)
            eps = s * eps_max
            # print("Max eps: {}, eps: {}".format(eps_max, eps))
            # eps = 1

            try:
                F, actualIt, actualPercentageConverged = \
                    linBP_symmetric_parameterized(X2, W, H2c * eps,
                                                  method='noecho',
                                                  alpha=alpha, beta=beta, gamma=gamma,
                                                  numMaxIt=numMaxIt,
                                                  convergencePercentage=convergencePercentage_W,
                                                  debug=2)
                prop_time = time.time() - prop_start

                if Macro_Accuracy:
                    accuracy_X = matrix_difference_classwise(X0, F, ignore_rows=ind)
                    precision = matrix_difference_classwise(X0, F, similarity='precision', ignore_rows=ind)
                    recall = matrix_difference_classwise(X0, F, similarity='recall', ignore_rows=ind)
                else:
                    accuracy_X = matrix_difference(X0, F, ignore_rows=ind)
                    precision = matrix_difference(X0, F, similarity='precision', ignore_rows=ind)
                    recall = matrix_difference(X0, F, similarity='recall', ignore_rows=ind)

                result = [str(datetime.datetime.now())]
                text = [
                    label, f, accuracy_X, precision, recall, learn_time,
                    prop_time
                ]
                result.extend(text)
                # print("method: {}, f: {}, actualIt: {}, accuracy: {}, precision:{}, recall: {}, learning time: {}, propagation time: {}".format(label, f, actualIt, accuracy_X, precision, recall, learn_time, prop_time))
                save_csv_record(join(data_directory, csv_filename), result)

            except ValueError as e:
                # NOTE(review): `h` is not assigned anywhere in run(); unless a
                # module-level `h` exists this line raises NameError and hides
                # the original ValueError -- confirm.
                print("ERROR: {} with {}: d={}, h={}".format(
                    e, learning_method, d, h))
                raise

        return 'success'

    def multi_run_wrapper(args):
        """Wrapper to unpack arguments passed to the pool worker.

        NOTE: This method could be removed by upgrading to Python>=3.3, which
        includes the multiprocessing.starmap_async() function, which allows
        multiple arguments to be passed to the map function.
        """
        return _f_worker_(*args)

    for choice in experiments:
        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)
        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            # append=False: start a fresh file that contains only the header
            save_csv_record(join(data_directory, csv_filename), header, append=False)

        # print("choice: {}".format(choice))

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:
            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n
            k = len(X0[0])

            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                           weights=1, randomize=False, constraints=True,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)
            print("H0 w/  constraints:\n", np.round(H0, 2))
            #raw_input() # Why?

            H2 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                           weights=1, randomize=False, constraints=True,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC,
                           weights=2, randomize=False,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC,
                           weights=2, randomize=False, constraints=True,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC,
                           weights=10, randomize=False,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC,
                           weights=10, randomize=False, constraints=True,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/  constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/  constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 20 w/  constraints:\n", np.round(H7, 3))

            print()
            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:
            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n
            #print(n)
            #print(d)
            #print("contraint = {}".format(constraints))
            #print('select lambda: {}'.format(len(select_lambda_vec)))
            #print('learning method: {}'.format(len(learning_method_vec)))
            #print('alpha: {}'.format(len(alpha_vec)))
            #print('beta: {}'.format(len(beta_vec)))
            #print('gamma: {}'.format(len(gamma_vec)))
            #print('s: {}'.format(len(s_vec)))
            #print('maxit: {}'.format(len(numMaxIt_vec)))
            #print('weight: {}'.format(len(weight_vec)))
            #print('randomize: {}'.format(len(randomize_vec)))

            # --- Calculating True Compatibility matrix
            H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                           weights=1, randomize=False, constraints=constraints,
                           gradient=gradient, doubly_stochastic=doubly_stochastic)
            # print(H0)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Starts a process pool with 10 fewer workers than the number of
            # CPUs your computer has available - this is meant for a
            # supercomputer.
            #pool = multiprocessing.Pool(int(multiprocessing.cpu_count()-10))

            # Use this for a reasonably powerful home computer
            #pool = multiprocessing.Pool(int(multiprocessing.cpu_count()/2))

            # Use this for anything else
            pool = multiprocessing.Pool(2)

            # One task per (fraction, repetition); ix is the index into f_vec.
            results = [(X0, W, f, ix)
                       for ix, f in enumerate(f_vec)] * rep_SameGraph
            # print('Expected results: {}'.format(num_results))

            # NOTE(review): multi_run_wrapper is nested inside run();
            # multiprocessing.Pool pickles the callable it is given and nested
            # functions cannot be pickled by the standard pickler -- confirm
            # this path works, otherwise the worker has to move to module level.
            try:
                # Distribute the work over the pool.  map_async(...).get(timeout)
                # is used (a workaround carried over from Python 2.7) so that
                # the wait is bounded by `num_results * 2` seconds.
                pool.map_async(multi_run_wrapper, results).get(num_results * 2)
            except multiprocessing.TimeoutError:
                # Skip plotting for this experiment; the pool is still closed below.
                continue
            finally:
                pool.close()
                pool.join()

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)

        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1,
                   join(figure_directory, acc_filename),
                   n=n,
                   d=d,
                   k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin,
                   ymin=ymin,
                   xmax=xmax,
                   ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False):
    """Run the "restarts vs. number of classes" accuracy experiment and plot it.

    The function has two phases:

    1. Data creation (only if ``create_data`` or ``add_data``): for every
       repetition and every ``k`` in ``k_vec`` a synthetic graph is generated,
       a compatibility matrix is estimated from several random starting
       points, labels are propagated with LinBP, and one CSV record
       ``[timestamp, k, restarts, accuracy]`` is written per restart setting.
    2. Aggregation and plotting: the CSV is read back, accuracies are averaged
       per ``(k, restarts)``, normalized by the row-wise maximum ("relative
       accuracy"), and optionally drawn / saved / shown.

    Entries of ``number_of_restarts`` double as method codes:
    ``100`` = use the planted matrix ``H0`` directly, ``99`` = estimation
    initialized with the gold-standard vector, ``1`` = estimation initialized
    with the uninformative vector ``h0``, ``50`` = minimum-energy result among
    all restarts plus the ``99`` result; every other value ``r`` means "best of
    ``r`` randomly sampled restarts".

    Parameters
    ----------
    choice : int
        Experiment preset (101-108 are defined below); any other value raises
        ``Warning``.
    create_data : bool
        If true, the CSV file is first re-initialized with the header
        (``append=False``) and new records are generated.
    add_data : bool
        If true, new records are generated and written with
        ``save_csv_record`` (presumably appended -- confirm against helper).
    show_plot : bool
        If true, ``plt.show()`` is called after drawing.
    create_pdf : bool
        If true, the figure is saved as PDF into ``figure_directory``.
    show_pdf : bool
        If true, ``showfig`` is called on the PDF path.
    shorten_length : bool
        Not referenced anywhere in this function body.

    Notes
    -----
    ``f_vec``, ``labels``, ``facecolor_vec`` and ``number_of_restarts`` are
    declared ``global``; ``labels``, ``facecolor_vec`` and
    ``number_of_restarts`` are assigned here, and ``labels`` is overwritten
    again by the legend handles/labels lookup in the plotting section.
    """
    verbose = False
    repeat_diffGraph = 1000
    SUBSET = True
    NOGT = False        ## Not draw Ground Truth Comparison
    CHOICE = choice
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PLOT = show_plot
    SHOW_PDF = show_pdf
    CREATE_PDF = create_pdf

    STD_FILL = False

    csv_filename = 'Fig_fast_optimal_restarts_Accv2_{}.csv'.format(CHOICE)
    fig_filename = 'Fig_fast_optimal_restarts_Accv2_{}.pdf'.format(CHOICE)
    header = ['currenttime',
              'k',
              'restarts',
              'accuracy']
    if CREATE_DATA:
        # start a fresh CSV containing only the header row
        save_csv_record(join(data_directory, csv_filename), header, append=False)

    # -- Default Graph parameters
    global f_vec, labels, facecolor_vec
    global number_of_restarts

    initial_h0 = None
    distribution = 'powerlaw'
    exponent = -0.3  # for powerlaw
    length = 4  # path length
    constraint = True
    gradient = True
    variant = 1
    EC = True
    delta = 0.001
    numMaxIt = 10
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    learning_method = 'DHE'
    weights = 10
    randomize = True
    return_min_energy = True
    number_of_restarts = [8, 6, 5, 4]

    # -- Default plotting parameters (several are overridden per CHOICE below)
    clip_on_vec = [True] * 20
    draw_std_vec = range(10)
    ymin = 0.3
    ymax = 1
    xmin = 0.001
    xmax = 1
    xtick_lab = []
    xtick_labels = []
    ytick_lab = np.arange(0, 1.1, 0.1)
    linestyle_vec = ['solid','solid','solid'] * 20
    linewidth_vec = [4,4,4,4]*10
    marker_vec = ['x', 'v', '^', '+', '>', '<'] *10
    markersize_vec = [10, 8, 8, 8 ,8 ,8 ,8 ]*10
    facecolor_vec = ["#C44E52", "#4C72B0", "#8172B2", "#CCB974", "#55A868", "#64B5CD"]*5

    # -- Options mainly change k
    if CHOICE == 101:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 10, 13, 16, 18, 20]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [30, 20, 10, 7, 5, 4, 3, 2, 1, 50, 99, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 102:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        # number_of_restarts = [30, 20, 10, 7, 5, 4, 3, 2, 1, 50, 99, 100]
        number_of_restarts = [20, 10, 5, 4, 3, 2]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 103:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 99]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative
        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 104:
        n = 10000
        h = 8
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 99]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative
        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 105:
        n = 10000
        h = 8
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative
        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 106:
        n = 10000
        h = 3
        d = 15
        k_vec = [3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative
        marker_vec = ['o', 'x', 'v', '^', '+', 's', None] * 10
        markersize_vec = [6, 10, 6, 6, 10, 6] * 10

        labels = ['r' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 107:
        n = 10000
        h = 8
        d = 15
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [10, 5, 4, 3, 2, 99]
        # number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative
        marker_vec = ['x', 'v', '^', 's', 'o', 's', None] * 10
        markersize_vec = [10, 6, 6, 6, 6, 6, 6] * 10

        labels = [r'$r=$' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]

    elif CHOICE == 108:
        n = 10000
        h = 8
        d = 15
        k_vec = [2, 3, 4, 5, 6, 7, 8]
        # k_vec = [4, 5, 7, 10]
        f = 0.09
        distribution = 'uniform'

        # Write in DESCENDING ORDER
        number_of_restarts = [10, 5, 4, 3, 2, 99]
        # number_of_restarts = [20, 10, 5, 4, 3, 2, 100]
        ### 100:GT 99:GTr
        ### 50:min{30,GTr} 1:uninformative
        marker_vec = ['x', 'v', '^', 's', 'o', 's', None] * 10
        markersize_vec = [10, 6, 6, 6, 6, 6, 6] * 10

        labels = [r'$r=$' + str(a1) for a1 in number_of_restarts]
        xtick_lab = k_vec
        xtick_labels = [str(a1) for a1 in k_vec]
        repeat_diffGraph = 10

    else:
        raise Warning("Incorrect choice!")

    RANDOMSEED = None  # For repeatability
    random.seed(RANDOMSEED)  # seeds some other python random generator
    np.random.seed(seed=RANDOMSEED)  # seeds the actually used numpy random generator; both are used and thus needed
    # print("CHOICE: {}".format(CHOICE))

    # -- Create data
    if CREATE_DATA or ADD_DATA:
        for _ in range(repeat_diffGraph):

            for k in k_vec:
                a = [1.] * k
                k_star = int(k * (k - 1) / 2)  # length of the perturbation / starting vectors used below
                alpha0 = np.array(a)
                alpha0 = alpha0 / np.sum(alpha0)  # uniform class prior

                # Generate Graph
                # print("Generating Graph: n={} h={} d={} k={}".format(n, h, d, k))
                H0 = create_parameterized_H(k, h, symmetric=True)
                W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d, distribution=distribution,
                                                     exponent=exponent, directed=False, debug=False)
                H0_vec = transform_HToh(H0)
                # print("\nGold standard {}".format(np.round(H0_vec, decimals=3)))

                X0 = from_dictionary_beliefs(Xd)
                # X2 is X0 with a fraction (1 - f) of rows replaced; `ind` is reused below as ignore_rows
                X2, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W,
                                                   ind_prior=None, stratified=stratified)

                # uninformative starting vector: every entry equals 1/k
                h0 = [1.] * int(k_star)
                h0 = np.array(h0)
                h0 = h0 / k

                delta = 1 / (3 * k)
                # print("delta: ", delta)

                # Collect distinct +/-delta perturbation vectors: at most
                # number_of_restarts[0] of them, and never more than the 2**k_star that exist.
                perm = []
                while len(perm) < number_of_restarts[0]:
                    temp = []
                    for _ in range(k_star):
                        temp.append(random.choice([-delta, delta]))
                    if temp not in perm:
                        perm.append(temp)
                    if len(perm) >= 2 ** (k_star):
                        break

                E_list = []  ## format = [[energy, H_vec], []..]
                for vec in perm:
                    H2_vec, energy = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC,
                                               weights=weights, randomize=False, constraints=constraint,
                                               gradient=gradient, return_min_energy=True, verbose=verbose,
                                               initial_h0=h0 + np.array(vec))
                    E_list.append([energy, list(H2_vec)])

                # print("All Optimizaed vector:")
                # [print(i) for i in E_list ]
                # print("Outside Energy:{} optimized vec:{} \n".format(min_energy_vec[0], optimized_Hvec))

                # min_energy_vec = min(E_list)
                # optimized_Hvec = min_energy_vec[1]
                #
                # print("\nEnergy:{} optimized vec:{} \n\n".format(min_energy_vec[0],optimized_Hvec))
                #
                #
                # Reference run initialized at the gold-standard vector (method code 99)
                GTr_optimized_Hvec, GTr_energy = estimateH(X2, W, method=learning_method, variant=1, distance=length,
                                                           EC=EC, weights=weights, randomize=False,
                                                           constraints=constraint, gradient=gradient,
                                                           return_min_energy=True, verbose=verbose,
                                                           initial_h0=H0_vec)

                # Reference run initialized at the unperturbed h0 (method code 1)
                uninformative_optimized_Hvec, uninformative_energy = estimateH(X2, W, method=learning_method,
                                                                               variant=1, distance=length, EC=EC,
                                                                               weights=weights, randomize=False,
                                                                               constraints=constraint,
                                                                               gradient=gradient,
                                                                               return_min_energy=True,
                                                                               verbose=verbose, initial_h0=h0)

                iterative_permutations = list(E_list)
                for restartz in number_of_restarts:
                    # NOTE(review): parsed as `k==2 or (k==3 and ...)`, i.e. every setting is skipped for k == 2
                    if k==2 or k == 3 and restartz > 8 and restartz<99:
                        continue

                    # Shrink the candidate pool to `restartz` entries; because the pool is
                    # re-assigned, later (smaller) settings sample from the already reduced pool.
                    if restartz <= number_of_restarts[0]:
                        iterative_permutations = random.sample(iterative_permutations, restartz)
                    # print("For restart:{}, we have vectors:\n".format(restartz))
                    # [print(i) for i in iterative_permutations]

                    if restartz == 100:  ## for GT
                        H2c = to_centering_beliefs(H0)
                        # print("\nGT: ", transform_HToh(H0,k))

                    elif restartz == 99:  ## for DCEr init with GT
                        H2c = to_centering_beliefs(transform_hToH(GTr_optimized_Hvec, k))
                        # print("\nGTr: ", GTr_optimized_Hvec)

                    elif restartz == 1:  ## for DCEr with uninformative initial
                        H2c = to_centering_beliefs(transform_hToH(uninformative_optimized_Hvec, k))
                        # print("\nUninformative: ", uninformative_optimized_Hvec)

                    elif restartz == 50:  ## for min{DCEr , GTr}
                        # print("Length:",len(E_list))
                        # [print(i) for i in E_list]
                        mod_E_list = list(E_list)+[[GTr_energy , list(GTr_optimized_Hvec)]]  #Add GTr to list and take min
                        # print("Mod Length:", len(mod_E_list))
                        # [print(i) for i in mod_E_list]
                        min_energy_vec = min(mod_E_list)  # list comparison: lowest energy first
                        # print("\nSelected for 50:",min_energy_vec)
                        optimized_Hvec = min_energy_vec[1]

                        H2c = to_centering_beliefs(transform_hToH(optimized_Hvec, k))

                    else:
                        min_energy_vec = min(iterative_permutations)  # list comparison: lowest energy first
                        optimized_Hvec = min_energy_vec[1]
                        H2c = to_centering_beliefs(transform_hToH(optimized_Hvec, k))

                    # print("Inside Chosen Energy:{} optimized vec:{} \n".format(min_energy_vec[0], optimized_Hvec))

                    try:
                        eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho', X=X2)
                        s = 0.5
                        eps = s * eps_max  # scale H2c by half of eps_max before propagating

                        F, actualIt, actualPercentageConverged = \
                            linBP_symmetric_parameterized(X2, W, H2c * eps, method='noecho', numMaxIt=numMaxIt,
                                                          convergencePercentage=convergencePercentage_W, debug=2)
                    except ValueError as e:
                        # a failed propagation is reported and produces no CSV record
                        print(
                            "ERROR: {} with {}: d={}, h={}".format(e, learning_method, d, h))

                    else:
                        acc = matrix_difference_classwise(X0, F, ignore_rows=ind)

                        # NOTE(review): the local name `tuple` shadows the builtin and actually holds a list
                        tuple = [str(datetime.datetime.now())]
                        text = [k, restartz, acc]
                        tuple.extend(text)
                        if verbose:
                            print("\nGold standard {}".format(np.round(H0_vec, decimals=3)))
                        # print("k:{} Restart:{} OptimizedVec:{} Energy:{} Accuracy:{}".format(k, restartz, np.round(min_energy_vec[1], decimals=3), min_energy_vec[0], acc ))
                        # print("k:{} Restart:{} Accuracy:{}".format(k, 1, L2_dist))
                        save_csv_record(join(data_directory, csv_filename), tuple)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))
    # print("\n-- df1 (length {}):\n{}".format(len(df1.index), df1.head(20)))

    # Aggregate repetitions
    df2 = df1.groupby(['k', 'restarts']).agg \
        ({'accuracy': [np.mean, np.std, np.size], })
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]  # flatten the column hierarchy
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)
    df2['restarts'] = df2['restarts'].astype(str)
    # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(20)))

    # Pivot table
    df3 = pd.pivot_table(df2, index=['k'], columns=['restarts'], values=['accuracy_mean', 'accuracy_std'] )  # Pivot
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30)))
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]  # flatten the column hierarchy
    df3.reset_index(inplace=True)  # remove the index hierarchy
    # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(10)))

    df4 = df3.drop('k', axis=1)
    if NOGT:
        df4 = df3.drop(['k', 'accuracy_mean_0', 'accuracy_mean_1', 'accuracy_std_0', 'accuracy_std_1'], axis=1)

    # df4 = df3.drop(['k', 'accuracy_mean_100', 'accuracy_std_100'], axis=1)

    # Normalize each row (one k) by its largest entry; note that df4 still contains
    # the accuracy_std_* columns, which are divided by the same row maximum.
    df5 = df4.div(df4.max(axis=1), axis=0)
    df5['k'] = df3['k']
    # print("\n-- df5 (length {}):\n{}".format(len(df5.index), df5.head(100)))

    # df5 = df3   ## for normalization

    X_f = df5['k'].values  # read k from values instead
    Y=[]
    Y_std=[]
    for rez in number_of_restarts:
        if NOGT:
            if rez == 100 or rez==99:
                continue
        Y.append(df5['accuracy_mean_{}'.format(rez)].values)
        if STD_FILL:
            Y_std.append(df5['accuracy_std_{}'.format(rez)].values)

    if CREATE_PDF or SHOW_PDF or SHOW_PLOT:

        # -- Setup figure
        mpl.rc('font', **{'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans']})
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['font.size'] = 16
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Drawing
        if STD_FILL:
            # shaded +/- one-std band per series (disabled while STD_FILL is False)
            for choice, (option, facecolor) in enumerate(zip(number_of_restarts, facecolor_vec)):
                if option == 100:  ## GT
                    if NOGT:
                        continue
                    facecolor = 'black'
                elif option == 99:  ## GT-r
                    if NOGT:
                        continue
                    facecolor = 'black'

                ax.fill_between(X_f, Y[choice] + Y_std[choice], Y[choice] - Y_std[choice],
                                facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0)
                ax.plot(X_f, Y[choice] + Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')
                ax.plot(X_f, Y[choice] - Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid')

        # one line per entry of number_of_restarts; the special codes get fixed styling and labels
        for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \
                enumerate(zip(number_of_restarts, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)):

            if option == 100:  ## GT
                if NOGT:
                    continue
                linestyle='dashed'
                linewidth=3
                color='black'
                label='GS'
                marker='x'
                markersize=6
            elif option == 99:  ## GT-r
                if NOGT:
                    continue
                linestyle='dashed'
                linewidth=2
                color='black'
                label='Global Minima'
                marker = None
                markersize = 6
            elif option == 1:  ## uninformative initialization
                color="#CCB974"
                linewidth = 2
                label='Uninfo'
            elif option == 50:  ## min over restarts and GT-r
                label='min{30,GTr}'

            P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label,
                        zorder=4, marker=marker, markersize=markersize, markeredgecolor='black',
                        markeredgewidth=1, clip_on=clip_on)

        # plt.xscale('log')

        # -- Title and legend
        distribution_label = '$'
        if distribution == 'uniform':
            distribution_label = ',$uniform'
        n_label = '{}k'.format(int(n / 1000))
        if n < 1000:
            n_label='{}'.format(n)

        titleString = r'$\!\!\!n\!=\!{}, d\!=\!{}, h\!=\!{}, f\!=\!{} $'.format(n_label, d, h, f)
        title(titleString)

        # NOTE: `labels` is declared global above, so this assignment overwrites the module-level name
        handles, labels = ax.get_legend_handles_labels()
        legend = plt.legend(handles, labels,
                            loc='lower left',  # 'upper right'
                            handlelength=2,
                            labelspacing=0,  # distance between label entries
                            handletextpad=0.3,  # distance between label and the line representation
                            borderaxespad=0.2,  # distance between legend and the outer axes
                            borderpad=0.3,  # padding inside legend box
                            numpoints=1,  # put the marker only once
                            # bbox_to_anchor=(1.1, 0)
                            )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xticks(xtick_lab, xtick_labels)
        # plt.yticks(ytick_lab, ytick_lab)

        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.2f'))
        # ax.xaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.0f'))

        # NOTE(review): `b=` in grid() and `frameon=` in savefig() are rejected by recent
        # Matplotlib releases -- confirm against the pinned Matplotlib version.
        grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Number of Classes $(k)$', labelpad=0)  # labelpad=0
        ylabel(r'Relative Accuracy', labelpad=0)
        xlim(2.9, 7.1)  # fixed x-range regardless of k_vec
        # ylim(0.65, 1.015)

        if CREATE_PDF:
            savefig(join(figure_directory, fig_filename), format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)

        if SHOW_PLOT:
            plt.show()

        if SHOW_PDF:
            showfig(join(figure_directory, fig_filename))  # shows actually created PDF
def test_PaperExample():
    """Print compatibility estimates for the 7-node example graph from the paper.

    Builds a small undirected graph with partially labeled nodes, then runs
    ``estimateH`` with 'MHE', 'LHE', 'LHE' with constraints, and 'LHE' on two
    centered versions of the label matrix, printing each estimate together
    with its wall-clock time. Nothing is asserted; the output is meant to be
    inspected manually.
    """
    print("\n-- 'estimateH': Example graph for MHE vs LHE paper example --")

    # Available label matrices (one row per node, one column per class).
    label_sets = {
        1: [[0, 1], [0, 1], [1, 0], [1, 0], [0, 0], [0, 0], [0, 0]],   # graph example
        2: [[0, 1], [0, 1], [1, 0], [1, 0], [0, 1], [1, 0], [0, 1]],   # full graph
        3: [[0, 1], [0, 0], [1, 0], [1, 0], [0, 0], [0, 0], [0, 0]],   # no neighbors connected
    }
    selected = 1
    X = np.array(label_sets[selected])

    Xb = to_explicit_bool_vector(X)     # only needed by the (disabled) graph visualization
    X2c = to_centering_beliefs(X, ignoreZeroRows=True)
    X2cf = to_centering_beliefs(X, ignoreZeroRows=False)

    # Undirected edges, each listed once and then mirrored to obtain a symmetric matrix.
    edges = [(0, 1), (0, 4), (0, 5), (1, 2), (1, 3),
             (1, 5), (1, 6), (2, 4), (2, 6), (3, 6)]
    sources = [u for (u, _) in edges]
    targets = [v for (_, v) in edges]
    rows = sources + targets
    cols = targets + sources
    Ws = sparse.csr_matrix(([1] * len(rows), (rows, cols)), shape=(7, 7))
    # _out_visualize_Graph(Ws, X, Xb=Xb, colorDisplay='explicit')

    print("W:\n{}".format(Ws.todense()))
    print("X:\n{}\n".format(X))

    def timed_estimate(caption, beliefs, **options):
        """Run estimateH on `beliefs`, then print the result under `caption` and the elapsed time."""
        tic = time.time()
        estimate = estimateH(beliefs, Ws, **options)
        elapsed = time.time() - tic
        print(caption.format(estimate))
        print("Time :{}\n".format(elapsed))

    timed_estimate("Estimated H (MHE):\n{}", X, method='MHE')
    timed_estimate("Estimated H (LHE):\n{}", X, method='LHE')
    timed_estimate("Estimated H (LHE) with constraints:\n{}", X, method='LHE', constraints=True)

    # The 'LHEregular' and 'LHE2' variants are intentionally not run here.

    print("= Variants with centered X -- ")
    timed_estimate("Estimated H (LHE) with centering (while ignoring zero rows):\n{}",
                   X2c, method='LHE')
    timed_estimate("Estimated H (LHE) with centering (and NOT ignoring zero rows):\n{}",
                   X2cf, method='LHE')
def beliefPropagation(X, W, P,
                      numMaxIt=10,
                      convergencePercentage=None, convergenceThreshold=0.9961947,
                      debug=1, damping=1, clamping=False):
    """Standard belief propagation assuming a directed graph with two variants:
        V1: one directed potential across edge direction: P is one potential, and W contains the weights of edges
        V2: a set of potentials on different edges: P is a tensor, and W indexes the potentials
    Dimensions of P (2 or 3) determines variant.
    Uses message-passing with division: see [Koller,Friedman 2009] Section 10.3.1.
    Uses damping: see [Koller,Friedman 2009] Section 11.1.
    Can be run either with given number of maximal iterations or until specified percentage of nodes have converged.
    Convergence of a node is determined by (variant of) cosine similarity between *centered beliefs* from two iterations.
    If convergence criterion is reached, the iterations will stop before maximal iterations.
    Parameter "debug" allows alternative, more detailed outputs, e.g., to get intermediate belief values.
    If debug >= 1, checks that every entry in X and P is >= 0.
    Can model undirected graphs by (1) specifying every edge only for one direction, and (2) using symmetric potentials.
    Nodes without any incident edge are supported: their beliefs stay at their (normalized) prior.

    TODO: also implement version without message passing with division
    TODO: future variant with non-constant k and different potential dimensions
    TODO: future variant without echo cancellation
    TODO: alternative convergence condition: if np.allclose(x, x_new, atol=1e-10): break
    TODO: clamping not necessary: all depends on relative strength of prior beliefs

    Parameters
    ----------
    X : [n x k] np array
        prior (explicit) belief matrix.
        Rows do not have to be row-normalized.
        Rows can be all 0, which get later replaced by undefined prior belief.
    W : [n x n] sparse.csr_matrix
        directed sparse weighted adjacency matrix (thus a directed graph is assumed)
        Also allows undirected graph by simply specifying only symmetric potentials
        V1: weight determines the actual edge weight
        V2: weight determines the (1-based) index of a potential (from potential tensor P)
    P : V1: [k x k]
        any directed potential (no requirement for normalization or identical row or column sums)
        V2: [num_pot_P x k x k] np array
        set of potentials (as tensor)
    numMaxIt : int (Default = 10)
        number of maximal iterations to perform
    convergencePercentage : float (Default = None)
        percentage of nodes that need to have converged in order to interrupt the iterations.
        Notice that a node with undefined beliefs does not count as converged if it does not change anymore
        (in order to avoid counting nodes without explicit beliefs as converged in first few rounds).
        If None, then runs until numMaxIt
    convergenceThreshold : float (Default = 0.9961947)
        cosine similarity (actually, the "cosine_ratio" similarity) between two belief vectors in order to deem them
        as identical (thus converged).
        In case both vectors have the same length, then: cos(5 deg) = 0.996194698092. cos(1 deg) = 0.999847695156
    debug : int (Default = 1)
        0 : no debugging and just returns F
        1 : tests for correct input, and just returns F
        2 : tests for correct input, and returns (F, actualNumIt, convergenceRatios)
        3 : tests for correct input, and returns (list of F, actualNumIt, list of convergenceRatios)
    damping : float (Default = 1)
        fraction of message values that come from new iteration (if 1, then no re-use of prior iteration)
    clamping : Boolean (Default = False)
        whether or not the explicit beliefs in X should be clamped to the nodes or not.
        If True, only nodes without explicit beliefs are updated.

    Returns (if debug == 0 or debug == 1)
    -------------------------------------
    F : [n x k] np array
        final belief matrix, each row normalized to form a label distribution

    Returns (if debug == 2 )
    ------------------------
    F : [n x k] np array
        final belief matrix, each row normalized to form a label distribution
    actualNumIt : int
        actual number of iterations performed
    actualPercentageConverged : float
        percentage of nodes that converged

    Returns (if debug == 3 )
    ------------------------
    List of F : [(actualNumIt+1) x n x k] np array
        list of final belief matrices for each iteration, represented as 3-dimensional numpy array
        Also includes the original beliefs as first entry (0th iteration). Thus has (actualNumIt + 1) entries
    actualNumIt : int
        actual number of iterations performed (not counting the first pass = 0th iteration for initializing)
    List of actualPercentageConverged : list of float (with length actualNumIt)
        list of percentages of nodes that converged in each iteration > 0. Thus has actualNumIt entries
    """

    # --- create variables for convergence checking and debugging
    n, k = X.shape
    dim_pot = len(P.shape)  # dimensions 2 or 3: determines V1 or V2
    Pot = P                 # for case of dim_pot = 2
    if debug >= 1:
        assert (X >= 0).all(), "All explicit beliefs need to be >=0 "
        assert(issparse(W)), "W needs to be sparse"
        n2, n3 = W.shape
        assert type(P).__module__ == "numpy", "P needs to be numpy array (and not a matrix)"
        assert dim_pot in [2, 3], "Input Potentials need to be 2-dimensional or 3-dimensional"
        if dim_pot == 2:
            assert (P >= 0).all(), "All entries in the potentials need to be >=0 "
            k2, k3 = P.shape
        else:
            num_pot_P, k2, k3 = P.shape
            for P_entry in P:
                assert (P_entry >= 0).all(), "All entries in each potential need to be >=0 "
            assert W.dtype == int, "Entries of weight matrix need to be integers to reference index of the potential"
            # Indices stored in W are 1-based (see `- 1` when the potential is looked up),
            # hence the largest one may equal the number of potentials.
            max_pot_W = max(set(W.data))
            assert max_pot_W <= num_pot_P, "Indices in W refering to P need to be smaller than the number of potentials"
        # chained comparisons instead of the former `==` / `&` mix that only worked by operator precedence
        assert n == n2 == n3, "X and W need to have compatible dimensions"
        assert k == k2 == k3, "X and P need to have compatible dimensions"
    if debug >= 3:
        listF = []          # store the belief matrices for each iteration
        listConverged = []  # store all L2 norms to previous iteration

    # --- create edge dictionaries
    row, col = W.nonzero()
    dict_edges_out = {}     # dictionary: i to all nodes j with edge (i->j)
    dict_edges_in = {}      # dictionary: i to all nodes j with edge (i<-j)
    for (i, j) in zip(row, col):
        dict_edges_out.setdefault(i, set()).add(j)
        dict_edges_in.setdefault(j, set()).add(i)

    if dim_pot == 3:
        # Needed independently of `debug`.
        # NOTE(review): assumes W.data is aligned with W.nonzero(), i.e. W stores no explicit zeros.
        weight = W.data
        dict_edges_pot = {}  # Dictionary: for each directed edge (i,j) -> index of the potential in P[index, :, :]
        for (i, j, d) in zip(row, col, weight):
            dict_edges_pot[(i, j)] = d

    # --- X -> X0: replace all-0-rows with all 1s (no need to normalize initial beliefs)
    implicitVector = 1-1*to_explicit_bool_vector(X)             # indicator numpy array with 1s for rows with only 0s
    implicitVectorT = np.array([implicitVector]).transpose()    # vertical 1 vector for implicit nodes
    X0 = X + implicitVectorT    # X0: prior beliefs: addition of [n x k] matrix with [n x 1] vector is ok

    F1 = X0                 # old F: only for checking convergence (either because convergencePercentage not None or debug >= 2)
    F2 = X0.astype(float)   # new F: copy is necessary as to not change original X0 matrix when F2 is changed

    # --- Actual loop: each loop calculates (a) the new messages (with damping) and (b) the new beliefs
    converged = False
    actualNumIt = -1    # iterations start with 0th iteration
    while actualNumIt < numMaxIt and not converged:
        actualNumIt += 1

        # --- (a) calculate messages
        if actualNumIt == 0:
            # --- first pass (counts as 0th iteration): create message dictionaries and initialize messages with ones
            dict_messages_along_1 = {}      # dictionary: messages for each edge (i->j) in direction i->j
            dict_messages_against_1 = {}    # dictionary: messages for each edge (i<-j) in direction i->j
            default = np.ones(k)            # first message vector: all 1s (shared object, never modified in place)
            for (i, j) in zip(row, col):
                dict_messages_along_1[(i, j)] = default
                dict_messages_against_1[(j, i)] = default
        else:
            # --- other iterations: calculate "messages_new" using message-passing with division (from F and messages)
            dict_messages_along_2 = {}      # new dictionary: messages for each edge (i->j) in direction i->j
            dict_messages_against_2 = {}    # new dictionary: messages for each edge (i<-j) in direction i->j
            for (i, j) in dict_messages_along_1.keys():  # also includes following case: "for (j,i) in dict_messages_against_1.keys()"
                if dim_pot == 3:    # need to reference the correct potential in case dim_pot == 3
                    Pot = P[dict_edges_pot[(i, j)]-1, :, :]
                dict_messages_along_2[(i, j)] = (F2[i] / dict_messages_against_1[(j, i)]).dot(Pot)  # entry-wise division
                dict_messages_against_2[(j, i)] = (F2[j] / dict_messages_along_1[(i, j)]).dot(Pot.transpose())
            # TODO above two lines can contain errors

            # --- assign new to old message dictionaries, and optionally damp messages
            if damping == 1:
                dict_messages_along_1 = dict_messages_along_2.copy()    # requires shallow copy because of later division
                dict_messages_against_1 = dict_messages_against_2.copy()
            else:
                for (i, j) in dict_messages_along_1.keys():
                    dict_messages_along_1[(i, j)] = damping*dict_messages_along_2[(i, j)] + \
                                                    (1-damping)*dict_messages_along_1[(i, j)]
                for (i, j) in dict_messages_against_1.keys():
                    dict_messages_against_1[(i, j)] = damping*dict_messages_against_2[(i, j)] + \
                                                      (1-damping)*dict_messages_against_1[(i, j)]

        # --- (b) create new beliefs by multiplying prior beliefs with all incoming messages (pointing in both directions)
        for i in range(n):
            # With clamping, explicit nodes (implicitVector == 0) keep their prior; only implicit nodes are updated.
            if not clamping or implicitVector[i] == 1:
                F2[i] = X0[i]   # need to start multiplying from explicit beliefs, referencing the row with separate variable did not work out
                for j in dict_edges_out.get(i, ()):     # edges pointing away (none for isolated nodes)
                    F2[i] *= dict_messages_against_1[(j, i)]
                for j in dict_edges_in.get(i, ()):      # edges pointing inwards (none for isolated nodes)
                    F2[i] *= dict_messages_along_1[(j, i)]
                    # TODO line can contain errors

        # --- normalize beliefs [TODO: perhaps remove later to optimize except in last round]
        F2 = row_normalize_matrix(F2, norm='l1')

        # --- check convergence and store information if debug
        if convergencePercentage is not None or debug >= 2:
            F1z = to_centering_beliefs(F1)
            F2z = to_centering_beliefs(F2)
            actualPercentageConverged = matrix_convergence_percentage(F1z, F2z, threshold=convergenceThreshold)
            if convergencePercentage is not None \
                    and actualPercentageConverged >= convergencePercentage\
                    and actualNumIt > 0:  # end the loop early
                converged = True
            F1 = F2.copy()  # save for comparing in *next* iteration, make copy since F entries get changed

        if debug >= 3:
            listF.append(F2.copy())  # stores (actualNumIt+1) values (copy is important as F2 is later overwritten)
            if actualNumIt > 0:
                listConverged.append(actualPercentageConverged)  # stores actualNumIt values

    # --- Various return formats
    if debug <= 1:
        return F2
    elif debug == 2:
        return F2, actualNumIt, actualPercentageConverged
    else:
        return np.array(listF), actualNumIt, listConverged
def run(choice,
        create_data=False,
        add_data=False,
        show_plot=False,
        create_pdf=False,
        show_pdf=False):
    """Create data for and/or plot the end-to-end accuracy figure of one real dataset.

    The function first resets the module-level experiment and plotting
    configuration to its defaults, then applies the preset selected by
    ``choice`` (see the nested ``choose``). Afterwards it optionally creates
    new accuracy records with ``graph_worker`` sub-processes, reads the CSV
    file ``Fig_End-to-End_accuracy_<choice>_<dataset>.csv`` from
    ``data_directory``, aggregates accuracy (mean / std / count) per method
    and label fraction ``f``, and optionally draws the figure.

    Args:
        choice: Integer id of the preset (comment in the original code:
            500 Yelp, 600 Flickr, 700 DBLP, 800 Enron).
        create_data: If true, write a fresh CSV header (``append=False``)
            and then generate records.
        add_data: If true, generate records without rewriting the header.
        show_plot: If true, call ``plt.show()`` after drawing.
        create_pdf: If true, save the figure as PDF into ``figure_directory``.
        show_pdf: If true, open the PDF with ``showfig``.

    Raises:
        Warning: if ``choice`` is not one of the known presets.

    Note:
        All configuration is communicated through module-level globals
        (they are presumably read by ``graph_worker``; verify there).
    """
    global n, d, rep_SameGraph, FILENAMEZ, initial_h0, H0c
    global exponent, length, variant
    global alpha_vec, beta_vec, gamma_vec, s_vec, clip_on_vec, numMaxIt_vec
    # Plotting Parameters
    global xtick_lab, xtick_labels, ytick_lab, xmax, xmin, ymin, ymax
    global labels, facecolor_vec, draw_std_vec, linestyle_vec, linewidth_vec
    global marker_vec, markersize_vec, legend_location
    global option_vec, learning_method_vec, Macro_Accuracy, EC, constraints
    global weight_vec, randomize_vec, k, fig_label, err, avoidNeighbors
    global convergencePercentage_W, stratified, gradient, doubly_stochastic
    global numberOfSplits, select_lambda_vec, lambda_vec, f_vec
    global exp_backoff

    # -- Setup
    CHOICE = choice  # 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    STD_FILL = True
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 3  # iterations on same graph
    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1
    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters
    xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
    # raw strings: same runtime value as '\%' but without the invalid-escape warning
    xtick_labels = [r'0.001\%', r'0.01\%', r'0.1\%', r'1\%', r'10\%', r'100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]
    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10]
    randomize_vec = [False] * 4 + [True]
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    # effective default (an earlier assignment of [0, 3, 4, 4, 4, 4] was
    # immediately overwritten by this one and has been dropped)
    draw_std_vec = range(10)
    numberOfSplits = 1
    select_lambda_vec = [False] * 20
    lambda_vec = None
    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    # Join timeouts in seconds: 64, 128, ..., 2048. The comprehension variable
    # is deliberately not called `n`, so that it can never touch the global `n`.
    exp_backoff = [2**e for e in range(6, 12)]

    def choose(choice):
        """Apply the preset ``choice`` on top of the defaults set by ``run``."""
        # -- Default Graph parameters
        global n, d, rep_SameGraph, FILENAMEZ, initial_h0
        global exponent, length, variant
        global alpha_vec, beta_vec, gamma_vec, s_vec, clip_on_vec, numMaxIt_vec
        # Plotting Parameters
        global xtick_lab, xtick_labels, ytick_lab, xmax, xmin, ymin, ymax
        global labels, facecolor_vec, draw_std_vec, linestyle_vec, linewidth_vec
        global marker_vec, markersize_vec, legend_location
        global option_vec, learning_method_vec, Macro_Accuracy, EC, constraints
        global weight_vec, randomize_vec, k, fig_label, err, avoidNeighbors
        global convergencePercentage_W, stratified, gradient, doubly_stochastic
        global numberOfSplits, select_lambda_vec, lambda_vec, f_vec

        # Presets that share every setting except the seven values below.
        # choice: (FILENAMEZ, xmin, ymax, draw_std_vec, fig_label, n, d)
        shared_presets = {
            # -- Cora dataset
            901: ('cora', 0.001, 0.9, [4], 'Cora', 2708, 7.8),
            # -- Citeseer dataset
            1001: ('citeseer', 0.001, 0.75, [4], 'Citeseer', 3312, 5.6),
            1101: ('hep-th', 0.0001, 0.1, [4], 'Hep-th', 27770, 5.6),
            1204: ('pokec-gender', 0.000015, 0.75, [0, 3, 4, 4, 4, 4],
                   'Pokec-Gender', 1632803, 54.6),
        }

        if choice == 0:
            pass

        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            # select_lambda_vec = [False] * 3 + [True] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            # lambda_vec = [0.5] * 21  # same length as f_vec

        elif choice == 305:  # Test row stochastic cases
            choose(304)
            doubly_stochastic = False

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'
            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1

        elif choice == 602:  ## with varying weights
            choose(601)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 4 + [True]
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 603:  ## with varying weights
            choose(602)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 3 + [True] * 2
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec

        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)
            lambda_vec = [0.5] * 21  # same length as f_vec

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp.txt'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures

        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)
            # allow to choose lambda for different f in f_vec
            select_lambda_vec = [False] * 4 + [True]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec

        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)
            # same length as f_vec
            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6

        elif choice == 804:
            choose(803)

        elif choice == 805:
            choose(801)
            doubly_stochastic = False

        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]
            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures
            # NOTE(review): the following names are not declared global, so
            # they are local to `choose` and only feed `captionText`, which
            # itself is not used afterwards.
            alpha = 0.0
            beta = 0.0
            gamma = 0.0
            s = 0.5
            numMaxIt = 10
            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec
            captionText = "DCE weight=[0.2*13] [10*8], s={}, numMaxIt={}".format(
                s, numMaxIt)

        elif choice in shared_presets:
            # Compare against the parameter `choice` (not the enclosing
            # CHOICE) so that these presets behave like all others.
            (FILENAMEZ, xmin, ymax, draw_std_vec, fig_label, n,
             d) = shared_presets[choice]
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            ymin = 0.0
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            legend_location = 'lower right'

        else:
            raise Warning("Incorrect choice!")

    choose(CHOICE)

    csv_filename = 'Fig_End-to-End_accuracy_{}_{}.csv'.format(
        CHOICE, FILENAMEZ)
    header = ['currenttime', 'method', 'f', 'precision', 'recall', 'accuracy']
    if CREATE_DATA:
        save_csv_record(join(data_directory, csv_filename),
                        header,
                        append=False)

    # --- print data statistics (disabled by the constant above)
    if CALCULATE_DATA_STATISTICS:
        Xd, W = load_Xd_W_from_csv(
            join(realDataDir, FILENAMEZ) + '-classes.csv',
            join(realDataDir, FILENAMEZ) + '-neighbors.csv')
        X0 = from_dictionary_beliefs(Xd)
        n = len(Xd.keys())
        d = (len(W.nonzero()[0]) * 2) / n
        print("FILENAMEZ:", FILENAMEZ)
        print("n:", n)
        print("d:", d)

        # -- Graph statistics
        n_vec = calculate_nVec_from_Xd(Xd)
        print("n_vec:\n", n_vec)
        d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
        print("d_vec:\n", d_vec)
        P = calculate_Ptot_from_graph(W, Xd)
        print("P:\n", P)

        # -- Various compatibilities
        H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                       weights=1, randomize=False, constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        print("H0 w/ constraints:\n", np.round(H0, 2))
        # Pause until the user presses enter; works on Python 2 and 3.
        try:
            wait_for_enter = raw_input
        except NameError:
            wait_for_enter = input
        wait_for_enter()

        H2 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                       weights=1, randomize=False, constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H4 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC,
                       weights=2, randomize=False, gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H5 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC,
                       weights=2, randomize=False, constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H6 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC,
                       weights=10, randomize=False, gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H7 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC,
                       weights=10, randomize=False, constraints=True,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)

        # print("H MCE w/o constraints:\n", np.round(H0, 3))
        print("H MCE w/ constraints:\n", np.round(H2, 3))
        # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
        print("H DCE 2 w/ constraints:\n", np.round(H5, 3))
        # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
        print("H DCE 20 w/ constraints:\n", np.round(H7, 3))

        H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
        print("H_est_1:\n", np.round(H_row_vec[0], 3))
        print("H_est_2:\n", np.round(H_row_vec[1], 3))
        print("H_est_3:\n", np.round(H_row_vec[2], 3))

    # --- Create data
    if CREATE_DATA or ADD_DATA:
        Xd, W = load_Xd_W_from_csv(
            join(realDataDir, FILENAMEZ) + '-classes.csv',
            join(realDataDir, FILENAMEZ) + '-neighbors.csv')
        X0 = from_dictionary_beliefs(Xd)
        n = len(Xd.keys())  ## number of nodes in graph
        d = (len(W.nonzero()[0]) * 2) / n

        # --- Calculating True Compatibility matrix
        H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC,
                       weights=1, randomize=False, constraints=constraints,
                       gradient=gradient,
                       doubly_stochastic=doubly_stochastic)
        H0c = to_centering_beliefs(H0)

        # One worker process per repetition on the same graph; each worker
        # receives the shared result queue `gq`.
        gq = multiprocessing.Queue()
        graph_workers = [
            multiprocessing.Process(target=graph_worker, args=(X0, W, gq))
            for _ in range(rep_SameGraph)
        ]
        for gw in graph_workers:
            gw.start()

        # Join every worker with exponentially growing timeouts.
        # NOTE(review): the queue is only drained after the joins; a worker
        # with buffered queue items may not exit until they are consumed
        # (see the multiprocessing docs on joining processes that use
        # queues) -- confirm this ordering is intended.
        for gw in graph_workers:
            for t in exp_backoff:
                gw.join(t)
                if gw.exitcode is not None:
                    break  # worker has terminated, stop retrying
                print(
                    "failed to join graph worker {} after {} seconds, retrying"
                    .format(gw, t))
            else:
                # only reached when no attempt succeeded
                print("Failed to join graph worker {}.".format(gw))

        # The sentinel marks the end of the results that are on the queue.
        gq.put('STOP')
        for record in iter(gq.get, 'STOP'):
            save_csv_record(join(data_directory, csv_filename), record)

    # -- Read, aggregate, and pivot data for all options
    df1 = pd.read_csv(join(data_directory, csv_filename))

    # Aggregate repetitions
    if "option" in df1.columns.values:
        pivot_col = "option"
        pivot_vec = option_vec
    else:
        pivot_col = "method"
        pivot_vec = learning_method_vec

    df2 = df1.groupby([pivot_col, 'f']).agg(
        {'accuracy': [np.mean, np.std, np.size]})  # Multiple Aggregates
    # flatten the column hierarchy
    df2.columns = ['_'.join(col).strip() for col in df2.columns.values]
    df2.reset_index(inplace=True)  # remove the index hierarchy
    df2.rename(columns={'accuracy_size': 'count'}, inplace=True)

    # Pivot table
    df3 = pd.pivot_table(df2,
                         index='f',
                         columns=pivot_col,
                         values=['accuracy_mean', 'accuracy_std'])
    # flatten the column hierarchy
    df3.columns = ['_'.join(col).strip() for col in df3.columns.values]
    df3.reset_index(inplace=True)  # remove the index hierarchy

    # Extract values
    X_f = df3['f'].values  # plot x values
    Y = []
    Y_std = []
    for val in pivot_vec:
        Y.append(df3['accuracy_mean_{}'.format(val)].values)
        if STD_FILL:
            Y_std.append(df3['accuracy_std_{}'.format(val)].values)

    if CREATE_PDF or SHOW_PDF or SHOW_PLOT:
        print("Setting up figure...")

        # -- Setup figure
        fig_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format(
            CHOICE, FILENAMEZ)
        mpl.rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': [u'Arial', u'Liberation Sans']
            })
        mpl.rcParams['axes.labelsize'] = 20
        mpl.rcParams['xtick.labelsize'] = 16
        mpl.rcParams['ytick.labelsize'] = 16
        mpl.rcParams['legend.fontsize'] = 14  # 6
        mpl.rcParams['grid.color'] = '777777'  # grid color
        mpl.rcParams['xtick.major.pad'] = 2  # padding of tick labels: default = 4
        mpl.rcParams['ytick.major.pad'] = 1  # padding of tick labels: default = 4
        mpl.rcParams['xtick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['ytick.direction'] = 'out'  # default: 'in'
        mpl.rcParams['axes.titlesize'] = 16
        mpl.rcParams['figure.figsize'] = [4, 4]
        fig = figure()
        ax = fig.add_axes([0.13, 0.17, 0.8, 0.8])

        # -- Drawing
        # Loop indices are named `idx` so they do not shadow the `choice`
        # parameter of this function.
        if STD_FILL:
            for idx, (option, facecolor) in enumerate(
                    zip(option_vec, facecolor_vec)):
                if idx in draw_std_vec:
                    upper = Y[idx] + Y_std[idx]
                    lower = Y[idx] - Y_std[idx]
                    ax.fill_between(X_f, upper, lower,
                                    facecolor=facecolor,
                                    alpha=0.2,
                                    edgecolor=None,
                                    linewidth=0)
                    ax.plot(X_f, upper, linewidth=0.5, color='0.8',
                            linestyle='solid')
                    ax.plot(X_f, lower, linewidth=0.5, color='0.8',
                            linestyle='solid')

        for idx, (option, label, color, linewidth, clip_on, linestyle, marker,
                  markersize) in enumerate(
                      zip(option_vec, labels, facecolor_vec, linewidth_vec,
                          clip_on_vec, linestyle_vec, marker_vec,
                          markersize_vec)):
            ax.plot(X_f, Y[idx],
                    linewidth=linewidth,
                    color=color,
                    linestyle=linestyle,
                    label=label,
                    zorder=4,
                    marker=marker,
                    markersize=markersize,
                    markeredgewidth=1,
                    clip_on=clip_on)

        # -- Title and legend
        if n < 1000:
            n_label = '{}'.format(n)
        else:
            n_label = '{}k'.format(int(n / 1000))

        title(r'$\!\!\!\!\!\!\!${}: $n={}, d={}$'.format(
            fig_label, n_label, np.round(d, 1)))
        # Local name on purpose: `labels` is a module-level configuration
        # list and must not be overwritten by the legend entries.
        handles, legend_labels = ax.get_legend_handles_labels()
        legend = plt.legend(
            handles,
            legend_labels,
            loc=legend_location,  # 'upper right'
            handlelength=2,
            labelspacing=0,  # distance between label entries
            handletextpad=0.3,  # distance between label and the line representation
            # title='Variants',
            borderaxespad=0.2,  # distance between legend and the outer axes
            borderpad=0.3,  # padding inside legend box
            numpoints=1,  # put the marker only once
        )
        # # legend.set_zorder(1)
        frame = legend.get_frame()
        frame.set_linewidth(0.0)
        frame.set_alpha(0.9)  # 0.8

        plt.xscale('log')

        # -- Figure settings and save
        plt.xticks(xtick_lab, xtick_labels)
        plt.yticks(ytick_lab, ytick_lab)

        # Only show ticks on the left and bottom spines
        ax.yaxis.set_ticks_position('left')
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f'))

        grid(b=True, which='major', axis='both', alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        grid(b=True, which='minor', axis='both', alpha=0.2,
             linestyle='solid',
             linewidth=0.5)  # linestyle='dashed', which='minor', axis='y',
        xlabel(r'Label Sparsity $(f)$', labelpad=0)  # labelpad=0
        ylabel(r'Accuracy', labelpad=0)

        xlim(xmin, xmax)
        ylim(ymin, ymax)

        if CREATE_PDF:
            print("saving PDF of figure...")
            savefig(join(figure_directory, fig_filename),
                    format='pdf',
                    dpi=None,
                    edgecolor='w',
                    orientation='portrait',
                    transparent=False,
                    bbox_inches='tight',
                    pad_inches=0.05,
                    frameon=None)

        if SHOW_PLOT:
            print("Showing plot...")
            plt.show()

        if SHOW_PDF:
            print("Showing pdf...")
            showfig(join(figure_directory,
                         fig_filename))  # shows actually created PDF
def run(choice, variant, create_data=False, add_data=False, create_graph=False, create_fig=True, show_plot=False, create_pdf=False, show_pdf=False, shorten_length=False, show_arrows=True): """main parameterized method to produce all figures. Can be run from external jupyther notebook or method to produce all figures in PDF """ # -- Setup CHOICE = choice # determines the CSV data file to use VARIANT = variant # determines the variant of how the figures are plotted CREATE_DATA = create_data # starts new CSV file and stores experimental timing results ADD_DATA = add_data # adds data to existing file CREATE_GRAPH = create_graph # creates the actual graph for experiments (stores W and X in CSV files) SHOW_PDF = show_pdf SHOW_PLOT = show_plot CREATE_FIG = create_fig CREATE_PDF = create_pdf SHORTEN_LENGTH = shorten_length # to prune certain fraction of data to plot SHOW_SCALING_LABELS = True # first entry in the legend is for the dashed line of scalability SHOW_TITLE = True # show parameters in title of plot SHOW_DCER_WITH_BOX = True # show DCER value in a extra box LABEL_FONTSIZE = 16 # size of number labels in figure SHOW_LINEAR = True # show dashed line for linear scaling SHOW_ARROWS = show_arrows # show extra visual comparison of speed-up csv_filename = 'Fig_Timing_{}.csv'.format( CHOICE) # CSV filename includes CHOICE filename = 'Fig_Timing_{}-{}'.format( CHOICE, VARIANT) # PDF filename includes CHOICE and VARIANT header = ['n', 'type', 'time'] if CREATE_DATA: save_csv_record(join(data_directory, csv_filename), header, append=False) # -- Default Graph parameters distribution = 'powerlaw' exponent = -0.3 k = 3 a = 1 # this value was erroneously set to 5 previously!!! TODO: fix everywhere else # err = 0 avoidNeighbors = False f = 0.1 est_EC = True # !!! 
TODO: for graph estimation weights = 10 pyamg = False convergencePercentage_W = None alpha = 0 beta = 0 gamma = 0 s = 0.5 numMaxIt = 10 xtick_lab = [0.001, 0.01, 0.1, 1] ytick_lab = np.arange(0, 1, 0.1) xmin = 1e2 xmax = 1e8 # xmax = 1e6 ymin = 1e-3 ymax = 5e3 color_vec = [ "#4C72B0", "#55A868", "#8172B2", "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black" ] marker_vec = ['s', '^', 'x', 'o', 'None', 'None', 'None', 'None'] linestyle_vec = ['solid'] * 6 + ['dashed'] linewidth_vec = [3] * 3 + [4, 3, 4] + [3] * 7 SHOWMAXNUMBER = True show_num_vec = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max'] # %% -- Main Options if CHOICE == 3: n_vec = [ 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400, 204800, 409600, 819200, 1638400, 3276800, 6553600 ] # # n_vec = [1638400] # graph: 12021 sec = 3.4h, 18600 sec = 5h, 21824 sec (34000 sec old laptop) # # n_vec = [3276800] # graph: 49481 sec = 13.8h, 68145 sec (125233 sec old laptop) # # n_vec = [6553600] # graph: 145020 sec = 40h h = 8 d = 5 repeat_vec_vec = [[ 50, 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 5, 5, 3, 3, 3, 3 ], [5, 5, 5, 5, 3, 3, 3, 3, 3, 1, 1], [20, 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 3, 3, 1, 1, 1, 1]] method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']] if VARIANT == 1: method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop'] label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop'] show_num_vec = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop'] if VARIANT == 2: # version used for main paper figure method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop'] label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop'] linestyle_vec = ['solid'] * 5 + ['dashed'] SHOW_ARROWS = False if VARIANT == 3: # version used for main paper figure method_vec_fig = ['DHEr', 'Holdout', 'prop'] label_vec = [ 'DCEr', 'Holdout', 'Propagation', '$\epsilon_{\mathrm{max}}$' ] linestyle_vec = ['solid'] * 2 + ['dashed'] color_vec = [ "#C44E52", "#CCB974", 'black', 
'black', "#64B5CD", "black" ] marker_vec = ['o', 'x', 'None', 'None', 'None'] linestyle_vec = ['solid'] * 3 + ['dashed'] linewidth_vec = [4, 3, 4] + [3] * 7 ymin = 1e-2 SHOW_ARROWS = True if VARIANT == 4: # figure used in slides method_vec_fig = ['prop'] label_vec = ['Propagation'] color_vec = ['black'] marker_vec = ['None'] linestyle_vec = ['solid'] * 1 linewidth_vec = [2] ymin = 1e-2 SHOW_ARROWS = False SHOW_SCALING_LABELS = False SHOW_TITLE = False SHOW_DCER_WITH_BOX = False LABEL_FONTSIZE = 20 SHOW_LINEAR = False if VARIANT == 5: # figure used in slides method_vec_fig = ['prop', 'Holdout'] label_vec = ['Propagation', 'Baseline'] color_vec = ['black', "#CCB974"] marker_vec = ['None', '^'] linestyle_vec = ['solid'] * 2 linewidth_vec = [2, 4] ymin = 1e-2 SHOW_ARROWS = True SHOW_SCALING_LABELS = False SHOW_TITLE = False SHOW_DCER_WITH_BOX = False LABEL_FONTSIZE = 20 SHOW_LINEAR = False if VARIANT == 6: # figure used in slides method_vec_fig = ['prop', 'Holdout', 'DHEr'] label_vec = ['Propagation', 'Baseline', 'Our method'] color_vec = ['black', "#CCB974", "#C44E52"] marker_vec = ['None', '^', 'o', 'None', 'None'] linestyle_vec = ['solid'] + ['solid'] * 2 linewidth_vec = [2, 4, 4] ymin = 1e-2 SHOW_ARROWS = True SHOW_SCALING_LABELS = False SHOW_TITLE = True SHOW_DCER_WITH_BOX = False LABEL_FONTSIZE = 20 SHOW_LINEAR = False graph_cvs = 'Fig_Timing_SSLH_1' # re-use existing large graphs elif CHOICE == 4: n_vec = [ 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400, 204800, 409600, 819200, ] # n_vec = [819200] # graph: 47905 sec = 13.3h. 
90562 sec = 25h (180527 sec old laptop) h = 3 d = 25 repeat_vec_vec = [[ 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 3, 3, 3, ], [5, 5, 5, 3, 1, 1, 1, 1, 1], [ 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 1, 1, 1, ]] method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']] VARIANT = 2 if VARIANT == 1: method_vec_fig = [ 'MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max' ] label_vec = [ 'MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop', '$\epsilon_{\mathrm{max}}$' ] show_num_vec = [ 'MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop', 'eps_max' ] if VARIANT == 2: method_vec_fig = ['MHE', 'LHE', 'DHE', 'DHEr', 'Holdout', 'prop'] label_vec = ['MCE', 'LCE', 'DCE', 'DCEr', 'Holdout', 'prop'] linestyle_vec = ['solid'] * 5 + ['dashed'] if VARIANT == 3: method_vec_fig = ['DHEr', 'Holdout', 'prop'] label_vec = [ 'DCEr', 'Holdout', 'Propagation', '$\epsilon_{\mathrm{max}}$' ] linestyle_vec = ['solid'] * 2 + ['dashed'] color_vec = [ "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black" ] marker_vec = ['o', 'x', 'None', 'None', 'None'] linestyle_vec = ['solid'] * 3 + ['dashed'] linewidth_vec = [4, 3, 4] + [3] * 7 ymin = 1e-2 graph_cvs = 'Fig_Timing_SSLH_2' # re-use existing large graphs xmin = 1e3 xmax = 5e7 ymax = 1e3 elif CHOICE == 2: # rep_Estimation = 10 # n_vec = [200, 400, 800, 1600, 3200, 6400, 12800, # 25600, 51200, 102400, 204800, 409600, 819200] # repeat_vec = [20, 20, 20, 20, 20, 10, 10, # 10, 10, 10, 5, 5, 1] # n_vec = [819200] # graph: 47905 sec = 13.3h. 90562 sec = 25h (180527 sec old laptop) n_vec = [1638400] # !!! 
not done yet repeat_vec = [1] h = 3 d = 25 xmax = 5e7 graph_cvs = 'Fig_Timing_SSLH_2' elif CHOICE == 10: # same as 3 but with difference bars n_vec = [ 100, 200, 400, 800, 1600, 3200, 6400, 12800, 25600, 51200, 102400, 204800, 409600, 819200, 1638400, 3276800, 6553600 ] # # n_vec = [1638400] # graph: 12021 sec = 3.4h, 18600 sec = 5h, 21824 sec (34000 sec old laptop) # # n_vec = [3276800] # graph: 49481 sec = 13.8h, 68145 sec (125233 sec old laptop) # # n_vec = [6553600] # graph: 145020 sec = 40h h = 8 d = 5 repeat_vec_vec = [[ 50, 50, 50, 50, 50, 50, 50, 20, 10, 10, 5, 5, 5, 3, 3, 3, 3 ], [5, 5, 5, 5, 3, 3, 3, 3, 3, 1, 1], [20, 20, 20, 10, 10, 10, 10, 10, 5, 5, 5, 3, 3, 1, 1, 1, 1]] method_vec_vec = [['MHE', 'DHE', 'DHEr', 'LHE'], ['Holdout'], ['prop']] method_vec_fig = ['DHEr', 'Holdout', 'prop'] label_vec = [ 'DCEr', 'Holdout', 'Propagation', '$\epsilon_{\mathrm{max}}$' ] linestyle_vec = ['solid'] * 2 + ['dashed'] color_vec = [ "#C44E52", "#CCB974", 'black', 'black', "#64B5CD", "black" ] marker_vec = ['o', 'x', 'None', 'None', 'None'] linestyle_vec = ['solid'] * 3 + ['dashed'] linewidth_vec = [4, 3, 4] + [3] * 7 ymin = 1e-2 graph_cvs = 'Fig_Timing_SSLH_1' # re-use existing large graphs else: raise Warning("Incorrect choice!") # %% -- Common options alpha0 = np.array([a, 1., 1.]) alpha0 = alpha0 / np.sum(alpha0) H0 = create_parameterized_H(k, h, symmetric=True) H0c = to_centering_beliefs(H0) RANDOMSEED = None # For repeatability random.seed(RANDOMSEED) # seeds some other python random generator np.random.seed( seed=RANDOMSEED ) # seeds the actually used numpy random generator; both are used and thus needed # print("CHOICE: {}".format(CHOICE)) def save_tuple(n, label, time): tuple = [str(datetime.datetime.now())] text = [n, label, time] tuple.extend(text) print("time potential {}: {}".format(label, time)) save_csv_record(join(data_directory, csv_filename), tuple) # %% -- Create data if CREATE_DATA or ADD_DATA: for repeat_vec, method_vec in zip(repeat_vec_vec, 
method_vec_vec): for n, repeat in zip(n_vec, repeat_vec): print("\nn: {}".format(n)) # repeat = repeat_vec[j] # -- Graph if CREATE_GRAPH: start = time.time() W, Xd = planted_distribution_model( n, alpha=alpha0, P=H0, m=d * n, distribution=distribution, exponent=exponent, directed=False, debug=False) X0 = from_dictionary_beliefs(Xd) time_graph = time.time() - start save_W(join(data_directory, '{}_{}_W.csv'.format(graph_cvs, n)), W, saveWeights=False) save_X( join(data_directory, '{}_{}_X.csv'.format(graph_cvs, n)), X0) save_tuple(n, 'graph', time_graph) else: W, _ = load_W(join(data_directory, '{}_{}_W.csv'.format(graph_cvs, n)), skiprows=1, zeroindexing=True, n=None, doubleUndirected=False) X0, _, _ = load_X(join(data_directory, '{}_{}_X.csv'.format(graph_cvs, n)), n=None, k=None, skiprows=1, zeroindexing=True) # -- Repeat loop for i in range(repeat): print("\n repeat: {}".format(i)) X2, ind = replace_fraction_of_rows( X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W) for method in method_vec: if method == 'DHE': start = time.time() H2 = estimateH(X2, W, method='DHE', variant=1, distance=5, EC=est_EC, weights=weights) time_est = time.time() - start save_tuple(n, 'DHE', time_est) elif method == 'DHEr': start = time.time() H2 = estimateH(X2, W, method='DHE', variant=1, distance=5, EC=est_EC, weights=weights, randomize=True) time_est = time.time() - start save_tuple(n, 'DHEr', time_est) elif method == 'MHE': start = time.time() H2 = estimateH(X2, W, method='MHE', variant=1, distance=1, EC=est_EC, weights=None) time_est = time.time() - start save_tuple(n, 'MHE', time_est) elif method == 'LHE': start = time.time() H2 = estimateH(X2, W, method='LHE', variant=1, distance=1, EC=est_EC, weights=None) time_est = time.time() - start save_tuple(n, 'LHE', time_est) elif method == 'Holdout': start = time.time() H2 = estimateH_baseline_serial( X2, ind, W, numMax=numMaxIt, numberOfSplits=1, # EC=EC, # weights=weight, alpha=alpha, beta=beta, gamma=gamma) time_est = time.time() - start 
save_tuple(n, 'Holdout', time_est) elif method == 'prop': H2c = to_centering_beliefs(H0) X2c = to_centering_beliefs( X2, ignoreZeroRows=True) # try without start = time.time() eps_max = eps_convergence_linbp_parameterized( H2c, W, method='noecho', alpha=alpha, beta=beta, gamma=gamma, X=X2, pyamg=pyamg) time_eps_max = time.time() - start save_tuple(n, 'eps_max', time_eps_max) # -- Propagate eps = s * eps_max try: start = time.time() F, actualIt, actualPercentageConverged = \ linBP_symmetric_parameterized(X2, W, H2c * eps, method='noecho', alpha=alpha, beta=beta, gamma=gamma, numMaxIt=numMaxIt, convergencePercentage=convergencePercentage_W, debug=2) time_prop = time.time() - start except ValueError as e: print("ERROR: {}: d={}, h={}".format(e, d, h)) else: save_tuple(n, 'prop', time_prop) else: raise Warning("Incorrect choice!") # %% -- Read, aggregate, and pivot data for all options df1 = pd.read_csv(join(data_directory, csv_filename)) # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(50))) # Aggregate repetitions df2 = df1.groupby(['n', 'type']).agg \ ({'time': [np.mean, np.median, np.std, np.size], # Multiple Aggregates }) df2.columns = ['_'.join(col).strip() for col in df2.columns.values ] # flatten the column hierarchy df2.reset_index(inplace=True) # remove the index hierarchy df2.rename(columns={'time_size': 'count'}, inplace=True) # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(15))) # Pivot table df3 = pd.pivot_table(df2, index=['n'], columns=['type'], values=['time_mean', 'time_median']) # Pivot # df3 = pd.pivot_table(df2, index=['n'], columns=['type'], values=['time_mean', 'time_median', 'time_std'] ) # Pivot # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30))) df3.columns = ['_'.join(col).strip() for col in df3.columns.values ] # flatten the column hierarchy df3.reset_index(inplace=True) # remove the index hierarchy # df2.rename(columns={'time_size': 'count'}, inplace=True) # print("\n-- df3 
(length {}):\n{}".format(len(df3.index), df3.head(30))) # Extract values X = df3['n'].values # plot x values X = X * d / 2 # calculate edges (!!! notice dividing by 2 as one edge appears twice in symmetric adjacency matrix) Y = {} for method in method_vec_fig: # Y[method] = df3['time_mean_{}'.format(method)].values Y[method] = df3['time_median_{}'.format(method)].values if SHORTEN_LENGTH: SHORT_FACTOR = 4 ## KEEP EVERY Nth ELEMENT X = np.copy(X[list(range(0, len(X), SHORT_FACTOR)), ]) for method in method_vec_fig: Y[method] = np.copy( Y[method][list(range(0, len(Y[method]), SHORT_FACTOR)), ]) # %% -- Figure if CREATE_FIG: fig_filename = '{}.pdf'.format( filename) # TODO: repeat pattern in other files mpl.rcParams['backend'] = 'agg' mpl.rcParams['lines.linewidth'] = 3 mpl.rcParams['font.size'] = LABEL_FONTSIZE mpl.rcParams['axes.labelsize'] = 20 mpl.rcParams['axes.titlesize'] = 16 mpl.rcParams['xtick.labelsize'] = 16 mpl.rcParams['ytick.labelsize'] = 16 mpl.rcParams['legend.fontsize'] = 12 mpl.rcParams['axes.edgecolor'] = '111111' # axes edge color mpl.rcParams['grid.color'] = '777777' # grid color mpl.rcParams['figure.figsize'] = [4, 4] mpl.rcParams[ 'xtick.major.pad'] = 4 # padding of tick labels: default = 4 mpl.rcParams[ 'ytick.major.pad'] = 4 # padding of tick labels: default = 4 fig = plt.figure() ax = fig.add_axes([0.13, 0.17, 0.8, 0.8]) # -- Draw the plots if SHOW_LINEAR: ax.plot([1, 1e8], [1e-5, 1e3], linewidth=1, color='gray', linestyle='dashed', label='1sec/100k edges', clip_on=True, zorder=3) for i, (method, color, marker, linewidth, linestyle) in enumerate( zip(method_vec_fig, color_vec, marker_vec, linewidth_vec, linestyle_vec)): ax.plot(X, Y[method], linewidth=linewidth, color=color, linestyle=linestyle, label=label_vec[i], clip_on=True, marker=marker, markersize=6, markeredgewidth=1, markeredgecolor='black', zorder=4) # for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \ # enumerate(zip(option_vec, labels, 
facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)): # P = ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker, # markersize=markersize, markeredgewidth=1, markeredgecolor='black', clip_on=clip_on) if SHOWMAXNUMBER and method in show_num_vec: if method == 'DHEr' and SHOW_DCER_WITH_BOX: j = np.argmax(np.ma.masked_invalid( Y[method])) # mask nan, then get index of max element ax.annotate(int(np.round(Y[method][j])), xy=(X[j] * 1.5, Y[method][j]), color=color, va='center', bbox=dict(boxstyle="round,pad=0.3", fc="w"), annotation_clip=False, zorder=5) else: j = np.argmax(np.ma.masked_invalid( Y[method])) # mask nan, then get index of max element ax.annotate(int(np.round(Y[method][j])), xy=(X[j] * 1.5, Y[method][j]), color=color, va='center', annotation_clip=False, zorder=5) if SHOW_ARROWS: dce_opt = 'DHEr' holdout_opt = 'Holdout' prop_opt = 'prop' j_holdout = np.argmax(np.ma.masked_invalid(Y[holdout_opt])) if dce_opt in Y: j_dce = np.argmax(np.ma.masked_invalid(Y[dce_opt])) ax.annotate(s='', xy=(X[j_dce], Y[prop_opt][j_dce]), xytext=(X[j_dce], Y[dce_opt][j_dce]), arrowprops=dict(arrowstyle='<->')) ax.annotate( str(int(np.round(Y[prop_opt][j_dce] / Y[dce_opt][j_dce]))) + 'x', xy=(X[j_dce], int(Y[prop_opt][j_dce] + Y[dce_opt][j_dce]) / 6), color='black', va='center', fontsize=14, # bbox = dict(boxstyle="round,pad=0.3", fc="w"), annotation_clip=False, zorder=5) ax.annotate(s='', xy=(X[j_holdout], Y[holdout_opt][j_holdout]), xytext=(X[j_holdout], Y[dce_opt][j_holdout]), arrowprops=dict(arrowstyle='<->')) ax.annotate( str( int( np.round(Y[holdout_opt][j_holdout] / Y[dce_opt][j_holdout]))) + 'x', xy=(X[j_holdout], int(Y[holdout_opt][j_holdout] + Y[dce_opt][j_holdout]) / 8), color='black', va='center', fontsize=14, # bbox = dict(boxstyle="round,pad=0.3", fc="w"), annotation_clip=False, zorder=5) else: # in case dce_opt not shown, then show arrow as compared to prop method 
ax.annotate(s='', xy=(X[j_holdout], Y[holdout_opt][j_holdout]), xytext=(X[j_holdout], Y[prop_opt][j_holdout]), arrowprops=dict(arrowstyle='<->')) ax.annotate( str( int( np.round(Y[holdout_opt][j_holdout] / Y[prop_opt][j_holdout]))) + 'x', xy=(X[j_holdout], int(Y[holdout_opt][j_holdout] + Y[prop_opt][j_holdout]) / 8), color='black', va='center', fontsize=14, # bbox = dict(boxstyle="round,pad=0.3", fc="w"), annotation_clip=False, zorder=5) if SHOW_TITLE: plt.title(r'$\!\!\!d\!=\!{}, h\!=\!{}$'.format(d, h)) handles, labels = ax.get_legend_handles_labels() if not SHOW_SCALING_LABELS and SHOW_LINEAR: handles = handles[1:] labels = labels[1:] legend = plt.legend( handles, labels, loc='upper left', # 'upper right' handlelength=2, labelspacing=0, # distance between label entries handletextpad= 0.3, # distance between label and the line representation borderaxespad=0.2, # distance between legend and the outer axes borderpad=0.3, # padding inside legend box numpoints=1, # put the marker only once ) legend.set_zorder(3) frame = legend.get_frame() frame.set_linewidth(0.0) frame.set_alpha(0.2) # 0.8 # -- Figure settings and save plt.minorticks_on() plt.xscale('log') plt.yscale('log') minorLocator = LogLocator( base=10, subs=[0.1 * n for n in range(1, 10)], numticks=40 ) # TODO: discuss with Paul trick that helped with grid lines last time; necessary in order to create the log locators (otherwise does now show the wanted ticks # ax.xaxis.set_minor_locator(minorLocator) plt.xticks([1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9]) plt.grid(True, which='both', axis='both', alpha=0.2, linestyle='-', linewidth=1, zorder=1) # linestyle='dashed', which='minor', axis='y', # grid(b=True, which='minor', axis='x', alpha=0.2, linestyle='solid', linewidth=0.5) # linestyle='dashed', which='minor', axis='y', plt.xlabel(r'Number of edges ($m$)', labelpad=0) # labelpad=0 plt.ylabel(r'Time [sec]', labelpad=0) plt.xlim(xmin, xmax) plt.ylim(ymin, ymax) # print(ax.get_xaxis().get_minor_locator()) if 
CREATE_PDF: plt.savefig( join(figure_directory, fig_filename), format='pdf', dpi=None, edgecolor='w', orientation='portrait', transparent=False, bbox_inches='tight', pad_inches=0.05, # frameon=None ) if SHOW_PDF: showfig(join(figure_directory, fig_filename)) # shows actually created PDF if SHOW_PLOT: plt.show()
def test_linBP_symmetric_Torus():
    """Run LinBP on the Torus graph and plot the beliefs of one node over time.

    Shows that with s>1 LinBP will diverge and v.v., for the Torus graph.
    Interesting is that with H (instead of Hc) and echo=True, just above s=1,
    the oscillations can start late.

    Side effects: prints intermediate matrices and convergence thresholds,
    writes 'Fig_temp_SSLH_inference.pdf' into ``fig_directory``, sets its
    permissions to 744 and opens it with the ``open`` command.
    """
    import shlex  # local import: only needed to quote the path for the shell

    print("\n-- 'linBP_symmetric', 'eps_convergence_linbp', with Torus --")

    # -- Load W, create X and P
    # Only the adjacency matrix is used; the second return value is ignored.
    W, _ = load_W(join(data_directory, 'Torus_W.csv'), zeroindexing=False)
    # Explicit beliefs: the first three nodes carry one label each, the
    # remaining five rows are all-zero.
    X = np.array([[1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1],
                  [0, 0, 0],
                  [0, 0, 0],
                  [0, 0, 0],
                  [0, 0, 0],
                  [0, 0, 0]])
    H = np.array([[0.1, 0.8, 0.1],
                  [0.8, 0.1, 0.1],
                  [0.1, 0.1, 0.8]])
    print("W:\n", W.todense())
    print("X:\n", X)
    Xc = to_centering_beliefs(X, ignoreZeroRows=True)
    print("Xc:\n", Xc)
    Xl = to_explicit_list(X)
    print("Xl:\n", Xl)
    print("H:\n", H)
    Hc = to_centering_beliefs(H)
    print("Hc:\n", Hc)

    # -- Other eps_max for 3 x 2 methods
    print("\neps_max without echo and Hc:")
    print(" eps_max (W): ", eps_convergence_linbp(Hc, W))
    print("eps_max with echo and Hc:")
    print(" eps_max (W): ", eps_convergence_linbp(Hc, W, echo=True))
    print("eps_max with echo and compensation and Hc:")
    print(" eps_max (W): ",
          eps_convergence_linbp(Hc, W, echo=True, compensation=True))
    print("\neps_max without echo and H:")
    print(" eps_max (W): ", eps_convergence_linbp(H, W))
    print("eps_max with echo and H:")
    print(" eps_max (W): ", eps_convergence_linbp(H, W, echo=True))
    print("eps_max with echo and compensation and H:")
    print(" eps_max (W): ",
          eps_convergence_linbp(H, W, echo=True, compensation=True))

    # -- Define parameters and run LinBP
    print("\nActual run with various parameters")
    s = 1.15  # scaling relative to eps_max (alternative tried: 0.4)
    numMaxIt = 200
    echo = True
    convergencePercentage = None  # alternative tried: 0.5
    convergenceThreshold = 0.99
    eps_max = eps_convergence_linbp(H, W, echo=echo)
    # NOTE: the value printed under the label "eps:" is the scaling factor s;
    # the coupling matrix actually used below is H * eps_max * s.
    print("eps:", s)
    print("echo:", echo)

    listF, actualNumIt, listConverged = linBP_symmetric(
        Xc, W, H * eps_max * s,
        echo=echo,
        numMaxIt=numMaxIt,
        convergencePercentage=convergencePercentage,
        convergenceThreshold=convergenceThreshold,
        debug=3)

    # -- Display BP results
    print("\nlinBP results:")
    print(
        "Notice that we get identical results with X or Xc, and for Hc or H (except for convergence)"
    )
    print("\nlast two F:")
    print(listF[-2])
    print(listF[-1])
    print("actualNumIt:", actualNumIt)
    print("listConverged:\n", listConverged)

    # -- Visualize BP results
    filename = join(fig_directory, 'Fig_temp_SSLH_inference.pdf')
    print("\nVisualize values for node 3 (zero indexing):")
    node = 3
    # listF is indexed as [iteration, node, class] here, so this draws one
    # line per class across the iterations.
    plt.plot(listF[:, node, :], lw=2)
    plt.xlabel('# iterations')
    plt.ylabel('belief')
    plt.xlim(0, numMaxIt)
    print(filename)
    # 'papertype' is intentionally not passed: it only ever affected the
    # PostScript backend and newer matplotlib releases reject it.
    plt.savefig(filename,
                dpi=None,
                facecolor='w',
                edgecolor='w',
                orientation='portrait',
                format='pdf',
                transparent=True,
                bbox_inches='tight',
                pad_inches=0.1)

    # First change permissions in order to open the PDF.
    os.chmod(filename, 0o744)
    # Quote the path so directories containing spaces do not break the command.
    os.system("open " + shlex.quote(filename))  # open PDF