def test_graph_statistics_forced_block_model():
    """Smoke test: generate a directed CBM graph and print its statistics.

    Builds a fixed 3-class potential ``H0`` and class prior ``alpha0``,
    generates a small graph via ``graphGenerator``, then prints class
    counts, the total potential ``P_tot``, its row-normalization ``H``,
    and degree distributions.  Output is printed only (no assertions).
    """
    print("\n--- test_graph_statistics_forced_block_model() ---")
    H0 = np.array([[0.1, 0.8, 0.1], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8]])
    alpha0 = np.array([0.4, 0.3, 0.3])
    print("alpha0: ", alpha0)
    print("H0:\n", H0)
    print("\n")

    n = 40  # number of nodes
    b = 2   # second graphGenerator size/blocks parameter -- see graphGenerator
    start = time.time()
    Ws, X = graphGenerator(n, b, H=H0, alpha=alpha0, model='CBM', seed=None,
                           directed=True)
    time_est = time.time() - start
    print("Time for graph generation: ", time_est)
    print("\n")

    Xd = to_dictionary_beliefs(X)
    n_vec = calculate_nVec_from_Xd(Xd)
    P_tot = calculate_Ptot_from_graph(Ws, Xd)
    H = row_normalize_matrix(P_tot)
    print("n_vec: ", n_vec)
    print("alpha: ", 1. * n_vec / sum(n_vec))  # empirical class prior
    print("P_tot:\n", P_tot)
    print("P:\n", 1. * P_tot / sum(P_tot.flatten()))  # Potential: normalized sum = 1
    print("H:\n", H)

    # NOTE(review): labels below say "Indegree" but the helper is named
    # calculate_outdegree_distribution_from_graph -- confirm which one it
    # actually computes before relying on the printed text.
    d_vec = calculate_outdegree_distribution_from_graph(Ws, Xd=None)
    print("Indegree distribution:\n", d_vec)
    d_vec_list = calculate_outdegree_distribution_from_graph(Ws, Xd)
    print("List of indegree distributions:")
    # Fixed: loop variable used to shadow the builtin `dict`.
    for degree_dict in d_vec_list:
        print(" ", degree_dict)
def run(choice, create_data=False, add_data=False, show_plot=False,
        create_pdf=False, show_pdf=False):
    """End-to-end accuracy experiment driver for real-world datasets.

    Selects an experiment configuration by integer ``choice`` (via the
    nested ``choose``), optionally (re)creates the per-experiment CSV of
    accuracy results using a multiprocessing pool, then reads the CSV and
    renders the accuracy figure with ``sslhv.plot``.

    Parameters
    ----------
    choice : int
        Experiment id (e.g. 3xx Prop37, 4xx MovieLens, 5xx Yelp,
        6xx Flickr, 7xx DBLP, 8xx Enron, 9xx Cora, ...).
    create_data : bool
        Recreate the CSV from scratch (overwrites existing records).
    add_data : bool
        Append new records to an existing CSV.
    show_plot, create_pdf, show_pdf : bool
        Plot display / PDF output switches forwarded to the plot helper.

    Side effects: rebinds a large set of module-level configuration
    globals, may write CSV records and PDF figures.
    """
    global n
    global d
    global rep_SameGraph
    global FILENAMEZ
    global csv_filename
    global initial_h0
    global exponent
    global length
    global variant

    global alpha_vec
    global beta_vec
    global gamma_vec
    global s_vec
    global clip_on_vec
    global numMaxIt_vec

    # Plotting Parameters
    global xtick_lab
    global xtick_labels
    global ytick_lab
    global xmax
    global xmin
    global ymin
    global ymax
    global labels
    global facecolor_vec
    global draw_std_vec
    global linestyle_vec
    global linewidth_vec
    global marker_vec
    global markersize_vec
    global legend_location
    global option_vec
    global learning_method_vec

    global Macro_Accuracy
    global EC
    global constraints
    global weight_vec
    global randomize_vec
    global k
    global err
    global avoidNeighbors
    global convergencePercentage_W
    global stratified
    global gradient
    global doubly_stochastic
    global num_restarts
    global numberOfSplits
    global H_heuristic

    global select_lambda_vec
    global lambda_vec
    global f_vec
    global H0c

    # -- Setup
    CHOICE = choice  #300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron
    experiments = [CHOICE]
    CREATE_DATA = create_data
    ADD_DATA = add_data
    SHOW_PDF = show_pdf
    SHOW_PLOT = show_plot
    CREATE_PDF = create_pdf
    SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF
    STD_FILL = True
    TIMING = False
    CALCULATE_DATA_STATISTICS = False

    # -- Default Graph parameters
    rep_SameGraph = 10  # iterations on same graph
    initial_h0 = None  # initial vector to start finding optimal H
    exponent = -0.3
    length = 5
    variant = 1
    alpha_vec = [0] * 10
    beta_vec = [0] * 10
    gamma_vec = [0] * 10
    s_vec = [0.5] * 10
    clip_on_vec = [True] * 10
    numMaxIt_vec = [10] * 10

    # Plotting Parameters (defaults; per-experiment overrides in choose())
    xtick_lab = [0.001, 0.01, 0.1, 1]
    xtick_labels = ['0.1\%', '1\%', '10\%', '100\%']
    ytick_lab = np.arange(0, 1.1, 0.1)
    xmax = 1
    xmin = 0.0001
    ymin = 0.3
    ymax = 0.7
    labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr']
    facecolor_vec = [
        'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974",
        "#64B5CD"
    ]
    draw_std_vec = [False] * 4 + [True]
    linestyle_vec = ['dashed'] + ['solid'] * 10
    linewidth_vec = [4, 4, 2, 1, 2, 2]
    marker_vec = [None, 'o', 'x', '^', 'v', '+']
    markersize_vec = [0, 8, 8, 8, 8, 8, 8]

    option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
    learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
    Macro_Accuracy = False
    EC = True  # Non-backtracking for learning
    constraints = True  # True
    weight_vec = [None] * 3 + [10, 10] * 2
    randomize_vec = [False] * 4 + [True] * 2
    k = 3
    err = 0
    avoidNeighbors = False
    convergencePercentage_W = None
    stratified = True
    gradient = True
    doubly_stochastic = True
    num_restarts = None
    # NOTE(review): `raw_std_vec` looks like a typo for `draw_std_vec`
    # (the sibling run() below assigns draw_std_vec = range(10) here);
    # as written this name is never read -- confirm intent.
    raw_std_vec = range(10)
    numberOfSplits = 1

    select_lambda_vec = [False] * 20
    lambda_vec = None

    f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
    FILENAMEZ = ""
    legend_location = ""
    fig_label = ""
    H_heuristic = ""

    def choose(choice):
        """Set the global configuration for experiment id ``choice``.

        May recurse (e.g. choose(402) first applies choose(401)) so later
        assignments override the base experiment's values.
        """
        global n
        global d
        global rep_SameGraph
        global FILENAMEZ
        global initial_h0
        global exponent
        global length
        global variant

        global alpha_vec
        global beta_vec
        global gamma_vec
        global s_vec
        global clip_on_vec
        global numMaxIt_vec

        # Plotting Parameters
        global xtick_lab
        global xtick_labels
        global ytick_lab
        global xmax
        global xmin
        global ymin
        global ymax
        global labels
        global facecolor_vec
        global draw_std_vec
        global linestyle_vec
        global linewidth_vec
        global marker_vec
        global markersize_vec
        global legend_location
        global option_vec
        global learning_method_vec

        global Macro_Accuracy
        global EC
        global constraints
        global weight_vec
        global randomize_vec
        global k
        global err
        global avoidNeighbors
        global convergencePercentage_W
        global stratified
        global gradient
        global doubly_stochastic
        global num_restarts
        global numberOfSplits
        global H_heuristic

        global select_lambda_vec
        global lambda_vec
        global f_vec

        # NOTE(review): `fig_label` is assigned in several branches below
        # but is NOT declared global here, so those assignments only bind
        # a local that is never read -- likely a bug (the second run()'s
        # choose() declares `global fig_label`).

        # -- Default Graph parameters
        if choice == 0:
            None  # no-op branch: keep defaults
        elif choice == 304:  ## with varying weights
            FILENAMEZ = 'prop37'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'Prop37'
            legend_location = 'lower right'
            n = 62000
            d = 34.8
            select_lambda_vec = [False] * 5
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
        elif choice == 305:  # DCEr Only experiment
            choose(605)
            choose(304)
            select_lambda_vec = [False] * 6
        elif choice == 306:
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Holdout')
            labels.append('Holdout')
        elif choice == 307:  # heuristic comparison
            choose(304)
            select_lambda_vec = [False] * 3 + [True] * 3
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            learning_method_vec.append('Heuristic')
            labels.append('Heuristic')
            H_heuristic = np.array([[.476, .0476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- MovieLens dataset
        elif choice == 401:
            FILENAMEZ = 'movielens'
            Macro_Accuracy = True
            gradient = True
            fig_label = 'MovieLens'
            legend_location = 'upper left'
            n = 26850
            d = 25.0832029795
        elif choice == 402:
            choose(401)
            select_lambda_vec = [False] * 3 + [
                True
            ] * 3  # allow to choose lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
        elif choice == 403:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            learning_method_vec.append('Holdout')
            labels.append('Holdout')
        elif choice == 404:
            choose(401)
            select_lambda_vec = [
                True
            ] * 3  # allow to choose lambda for different f in f_vec
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            labels = ['GS', 'DCEr', 'Homophily']
            facecolor_vec = ['black', "#C44E52", "#64B5CD"]
            draw_std_vec = [False, True, False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 2, 2, 2, 2]
            marker_vec = [None, '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            weight_vec = [None, 10, None]
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6']
            randomize_vec = [False, True, False]
            learning_method_vec = ['GT', 'DHE']  #TODO
        elif choice == 405:  # DCEr ONLY experiment
            choose(605)
            choose(401)
            learning_method_vec += ['Holdout']
            labels += ['Holdout']
        elif choice == 406:  # comparison with a static heuristic matrix
            choose(402)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])
        elif choice == 407:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [1] * 21  # same length as f_vec
        elif choice == 408:
            choose(402)
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # DO NOT RUN WITH CREATE_DATA=True, if you do please restore the data from
        # data/sigmod-movielens-fig.csv
        elif choice == 409:
            choose(402)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2",
                "#C44E52", "#C44E52", "#CCB974", "#64B5CD"
            ]
            labels = [
                'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10',
                'Holdout'
            ]
            draw_std_vec = [False] * 5 + [True] * 2 + [False]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2]
            marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8]
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            legend_location = 'upper left'
            ymin = 0.3
            ymax = 1.0
            lambda_vec = [10] * 21  # same length as f_vec

        # -- Yelp dataset
        elif choice == 501:
            FILENAMEZ = 'yelp'
            Macro_Accuracy = True
            weight_vec = [None] * 3 + [10, 10]
            gradient = True
            ymin = 0.1
            ymax = 0.75
            fig_label = 'Yelp'
            legend_location = 'upper left'
            n = 4301900  # for figure
            d = 6.56  # for figure

        # -- Flickr dataset (older commented-out variants kept for reference)
        #elif choice == 601:
        #    FILENAMEZ = 'flickr'
        #    Macro_Accuracy = True
        #    fig_label = 'Flickr'
        #    legend_location = 'lower right'
        #    ymin = 0.3
        #    ymax = 0.7
        #    n = 2007369
        #    d = 18.1
        #elif choice == 602: ## with varying weights
        #    choose(601)
        #    select_lambda_vec = [False] * 4 + [True]*2  # allow to choose lambda for different f in f_vec
        #    f_vec = [0.9 * pow(0.1, 1 / 5) ** x for x in range(21)]
        #    lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
        #elif choice == 603: ## with varying weights
        #    choose(602)
        #    select_lambda_vec = [False] * 3 + [True] * 2  # allow to choose lambda for different f in f_vec
        #    # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6  # same length as f_vec
        #elif choice == 604: ## with weight = 1
        #    choose(603)
        #    lambda_vec = [0.5] * 21  # same length as f_vec

        # -- Flickr dataset
        elif choice == 601:
            FILENAMEZ = 'flickr'
            Macro_Accuracy = True
            fig_label = 'Flickr'
            legend_location = 'lower right'
            n = 2007369
            d = 18.1
        elif choice == 602:  ## with varying weights
            choose(601)
            select_lambda_vec = [False] * 4 + [
                True
            ]  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
        elif choice == 603:  ## with varying weights
            choose(602)
            select_lambda_vec = [False] * 3 + [
                True
            ] * 2  # allow to choose lambda for different f in f_vec
            # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6 # same length as f_vec
        elif choice == 604:  ## with weight = 1
            draw_std_vec = [4]
            choose(603)
            lambda_vec = [0.5] * 21  # same length as f_vec
        elif choice == 605:
            choose(601)
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD", 'orange'
            ]
            draw_std_vec = [False] + [True] * 10
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [3] * 10
            marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x']
            markersize_vec = [0] + [8] * 10
            randomize_vec = [True] * 8
            option_vec = [
                'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8'
            ]
            learning_method_vec = [
                'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE'
            ]
            select_lambda_vec = [False] * 8
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
            weight_vec = [0, 0, 1, 2, 5, 10, 15]
            # one 'DCEr <weight>' label per DCEr weight variant
            labels = ['GT'] + [
                i + ' {}'.format(weight_vec[ix])
                for ix, i in enumerate(['DCEr'] * 6)
            ]
        elif choice == 606:  # heuristic experiment
            choose(602)
            labels.append('Heuristic')
            learning_method_vec.append('Heuristic')
            H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476],
                                    [.476, .476, .0476]])

        # -- DBLP dataset
        elif choice == 701:
            FILENAMEZ = 'dblp'
            Macro_Accuracy = True
            ymin = 0.2
            ymax = 0.5
            fig_label = 'DBLP'
            legend_location = 'lower right'
            n = 2241258  # for figure
            d = 26.11  # for figure

        # -- ENRON dataset
        elif choice == 801:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            ymin = 0.3
            ymax = 0.75
            fig_label = 'Enron'
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            legend_location = 'upper left'
            n = 46463  # for figures
            d = 23.4  # for figures
        elif choice == 802:  ### WITH ADAPTIVE WEIGHTS
            choose(801)
            select_lambda_vec = [False] * 4 + [
                True
            ] * 2  # allow to choose lambda for different f in f_vec
            f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)]
            lambda_vec = [1] * 11 + [10] * 10  # same length as f_vec
        elif choice == 803:  ### WITH ADAPTIVE WEIGHTS
            choose(802)
            lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [
                1
            ] * 6  # same length as f_vec
        elif choice == 804:
            choose(803)
        elif choice == 805:
            choose(605)
            choose(801)
            #learning_method_vec += ['Holdout']
            #labels += ['Holdout']
        elif choice == 806:  # Heuristic experiment
            choose(802)
            learning_method_vec += ['Heuristic']
            labels += ['Heuristic']
            H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08],
                                    [0.08, 0.08, 0.76, 0.08],
                                    [0.08, 0.76, 0.08, 0.76],
                                    [0.08, 0.08, 0.76, 0.08]])
        elif choice == 821:
            FILENAMEZ = 'enron'
            Macro_Accuracy = True
            constraints = True  # True
            gradient = True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [0.2, 0.2]
            randomize_vec = [False] * 4 + [True]
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.7
            labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Enron'
            legend_location = 'lower right'
            n = 46463  # for figures
            d = 23.4  # for figures
            # NOTE(review): the five scalars below are locals (not in the
            # global list) and are never read in this branch -- dead code?
            alpha = 0.0
            beta = 0.0
            gamma = 0.0
            s = 0.5
            numMaxIt = 10
            select_lambda_vec = [False] * 3 + [True] * 2
            lambda_vec = [0.2] * 13 + [10] * 8  # same length as f_vec

        # -- Cora dataset
        elif choice == 901:
            FILENAMEZ = 'cora'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.9
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Cora'
            legend_location = 'lower right'
            n = 2708
            d = 7.8

        # -- Citeseer dataset
        # NOTE(review): the next three branches test the *outer* CHOICE
        # (run's argument, captured by closure) instead of this function's
        # `choice` parameter.  For a top-level call they coincide, but any
        # recursive choose(1001/1101/1204) would mis-dispatch -- likely a
        # choice/CHOICE typo.
        elif CHOICE == 1001:
            FILENAMEZ = 'citeseer'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.001
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Citeseer'
            legend_location = 'lower right'
            n = 3312
            d = 5.6

        elif CHOICE == 1101:
            FILENAMEZ = 'hep-th'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.0001
            ymin = 0.0
            ymax = 0.1
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Hep-th'
            legend_location = 'lower right'
            n = 27770
            d = 5.6
        elif choice == 1102:
            choose(1101)
            Macro_Accuracy = True

        elif CHOICE == 1204:
            FILENAMEZ = 'pokec-gender'
            Macro_Accuracy = True
            constraints = True  # True
            option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5']
            learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE']
            weight_vec = [None] * 3 + [10, 10]
            numMaxIt_vec = [10] * 10
            randomize_vec = [False] * 4 + [True]
            gradient = True
            xmin = 0.000015
            ymin = 0.0
            ymax = 0.75
            labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r']
            facecolor_vec = [
                'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52",
                "#CCB974", "#64B5CD"
            ]
            draw_std_vec = [0, 3, 4, 4, 4, 4]
            linestyle_vec = ['dashed'] + ['solid'] * 10
            linewidth_vec = [4, 4, 2, 1, 2]
            marker_vec = [None, 'o', 'x', '^', 'v', '+']
            markersize_vec = [0, 8, 8, 8, 8, 8, 8]
            fig_label = 'Pokec-Gender'
            legend_location = 'lower right'
            n = 1632803
            d = 54.6

        else:
            # NOTE(review): raising a Warning (not warnings.warn) -- it is a
            # real exception here, so an unknown choice aborts the run.
            raise Warning("Incorrect choice!")

    # -- Main experiment loop: one iteration per experiment id
    for choice in experiments:
        choose(choice)
        filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format(
            choice, FILENAMEZ)
        csv_filename = '{}.csv'.format(filename)

        header = [
            'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall',
            'learntime', 'proptime'
        ]
        if CREATE_DATA:
            # append=False truncates any previous results for this experiment
            save_csv_record(join(data_directory, csv_filename),
                            header,
                            append=False)

        # print("choice: {}".format(choice))

        # --- print data statistics
        if CALCULATE_DATA_STATISTICS:
            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())
            d = (len(W.nonzero()[0]) * 2) / n  # average degree (undirected count)
            k = len(X0[0])
            print("FILENAMEZ:", FILENAMEZ)
            print("k:", k)
            print("n:", n)
            print("d:", d)

            # -- Graph statistics
            n_vec = calculate_nVec_from_Xd(Xd)
            print("n_vec:\n", n_vec)
            d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd)
            print("d_vec:\n", d_vec)
            P = calculate_Ptot_from_graph(W, Xd)
            print("P:\n", P)
            for i in range(k):
                Phi = calculate_degree_correlation(W, X0, i, NB=True)
                print("Degree Correlation, Class {}:\n{}".format(i, Phi))

            # -- Various compatibilities
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            print("H0 w/ constraints:\n", np.round(H0, 2))
            #raw_input() # Why?

            # NOTE(review): H2 repeats the exact H0 call above -- the
            # commented prints suggest one of these was meant to be the
            # unconstrained variant.
            H2 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H4 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H5 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=2,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H6 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            H7 = estimateH(X0,
                           W,
                           method='DHE',
                           variant=1,
                           distance=2,
                           EC=EC,
                           weights=10,
                           randomize=False,
                           constraints=True,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)

            print()
            # print("H MCE w/o constraints:\n", np.round(H0, 3))
            print("H MCE w/ constraints:\n", np.round(H2, 3))
            # print("H DCE 2 w/o constraints:\n", np.round(H4, 3))
            print("H DCE 2 w/ constraints:\n", np.round(H5, 3))
            # print("H DCE 10 w/o constraints:\n", np.round(H6, 3))
            print("H DCE 20 w/ constraints:\n", np.round(H7, 3))
            print()

            H_row_vec = H_observed(W, X0, 3, NB=True, variant=1)
            print("H_est_1:\n", np.round(H_row_vec[0], 3))
            print("H_est_2:\n", np.round(H_row_vec[1], 3))
            print("H_est_3:\n", np.round(H_row_vec[2], 3))

        # --- Create data
        if CREATE_DATA or ADD_DATA:
            Xd, W = load_Xd_W_from_csv(
                join(realDataDir, FILENAMEZ) + '-classes.csv',
                join(realDataDir, FILENAMEZ) + '-neighbors.csv')

            X0 = from_dictionary_beliefs(Xd)
            n = len(Xd.keys())  ## number of nodes in graph
            k = len(X0[0])
            d = (len(W.nonzero()[0]) * 2) / n
            #print(n)
            #print(d)
            #print("contraint = {}".format(constraints))

            #print('select lambda: {}'.format(len(select_lambda_vec)))
            #print('learning method: {}'.format(len(learning_method_vec)))
            #print('alpha: {}'.format(len(alpha_vec)))
            #print('beta: {}'.format(len(beta_vec)))
            #print('gamma: {}'.format(len(gamma_vec)))
            #print('s: {}'.format(len(s_vec)))
            #print('maxit: {}'.format(len(numMaxIt_vec)))
            #print('weight: {}'.format(len(weight_vec)))
            #print('randomize: {}'.format(len(randomize_vec)))

            # --- Calculating True Compatibility matrix
            H0 = estimateH(X0,
                           W,
                           method='MHE',
                           variant=1,
                           distance=1,
                           EC=EC,
                           weights=1,
                           randomize=False,
                           constraints=constraints,
                           gradient=gradient,
                           doubly_stochastic=doubly_stochastic)
            # print(H0)
            H0c = to_centering_beliefs(H0)

            num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph

            # Starts a thread pool with at least 2 threads, and a lot more if you happen to be on a supercomputer
            pool = multiprocessing.Pool(max(2,
                                            multiprocessing.cpu_count() - 4))

            # NOTE(review): f_processes and workers are built but never used.
            f_processes = f_vec * rep_SameGraph
            workers = []
            # one task per (f, index) pair, repeated rep_SameGraph times
            results = [(X0, W, f, ix)
                       for ix, f in enumerate(f_vec)] * rep_SameGraph
            # print('Expected results: {}'.format(num_results))
            try:  # hacky fix due to a bug in 2.7 multiprocessing
                # Distribute work for evaluating accuracy over the thread pool using
                # a hacky method due to python 2.7 multiprocessing not being fully
                # featured
                pool.map_async(multi_run_wrapper,
                               results).get(num_results * 2)
            except multiprocessing.TimeoutError as e:
                # give up on this experiment and move to the next one
                continue
            finally:
                pool.close()
                pool.join()

        # -- Read data for all options and plot
        df1 = pd.read_csv(join(data_directory, csv_filename))
        acc_filename = '{}_accuracy_plot.pdf'.format(filename)
        pr_filename = '{}_PR_plot.pdf'.format(filename)
        if TIMING:
            print('=== {} Timing Results ==='.format(FILENAMEZ))
            print('Prop Time:\navg: {}\nstddev: {}'.format(
                np.average(df1['proptime'].values),
                np.std(df1['proptime'].values)))
            for learning_method in labels:
                rs = df1.loc[df1["method"] == learning_method]
                avg = np.average(rs['learntime'])
                std = np.std(rs['learntime'])
                print('{} Learn Time:\navg: {}\nstd: {}'.format(
                    learning_method, avg, std))

        sslhv.plot(df1,
                   join(figure_directory, acc_filename),
                   n=n,
                   d=d,
                   k=k,
                   labels=labels,
                   dataset=FILENAMEZ,
                   line_styles=linestyle_vec,
                   xmin=xmin,
                   ymin=ymin,
                   xmax=xmax,
                   ymax=ymax,
                   marker_sizes=markersize_vec,
                   draw_stds=draw_std_vec,
                   markers=marker_vec,
                   line_colors=facecolor_vec,
                   line_widths=linewidth_vec,
                   legend_location=legend_location,
                   show=SHOW_PDF,
                   save=CREATE_PDF,
                   show_plot=SHOW_PLOT)
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False): global n global d global rep_SameGraph global FILENAMEZ global initial_h0 global H0c global exponent global length global variant global alpha_vec global beta_vec global gamma_vec global s_vec global clip_on_vec global numMaxIt_vec # Plotting Parameters global xtick_lab global xtick_labels global ytick_lab global xmax global xmin global ymin global ymax global labels global facecolor_vec global draw_std_vec global linestyle_vec global linewidth_vec global marker_vec global markersize_vec global legend_location global option_vec global learning_method_vec global Macro_Accuracy global EC global constraints global weight_vec global randomize_vec global k global fig_label global err global avoidNeighbors global convergencePercentage_W global stratified global gradient global doubly_stochastic global numberOfSplits global select_lambda_vec global lambda_vec global f_vec # -- Setup CHOICE = choice #500 Yelp, 600 Flickr, 700 DBLP, 800 Enron CREATE_DATA = create_data ADD_DATA = add_data SHOW_PDF = show_pdf SHOW_PLOT = show_plot CREATE_PDF = create_pdf STD_FILL = True CALCULATE_DATA_STATISTICS = False # -- Default Graph parameters rep_SameGraph = 3 # iterations on same graph initial_h0 = None # initial vector to start finding optimal H exponent = -0.3 length = 5 variant = 1 alpha_vec = [0] * 10 beta_vec = [0] * 10 gamma_vec = [0] * 10 s_vec = [0.5] * 10 clip_on_vec = [True] * 10 numMaxIt_vec = [10] * 10 # Plotting Parameters xtick_lab = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1] xtick_labels = ['0.001\%', '0.01\%', '0.1\%', '1\%', '10\%', '100\%'] ytick_lab = np.arange(0, 1.1, 0.1) xmax = 1 xmin = 0.0001 ymin = 0.3 ymax = 0.7 labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [0, 3, 4, 4, 4, 4] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2] 
marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5'] learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE'] Macro_Accuracy = False EC = True # Non-backtracking for learning constraints = True # True weight_vec = [None] * 3 + [10, 10] randomize_vec = [False] * 4 + [True] k = 3 err = 0 avoidNeighbors = False convergencePercentage_W = None stratified = True gradient = True doubly_stochastic = True draw_std_vec = range(10) numberOfSplits = 1 select_lambda_vec = [False] * 20 lambda_vec = None f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] FILENAMEZ = "" legend_location = "" fig_label = "" global exp_backoff exp_backoff = [2**n for n in range(6, 12)] def choose(choice): # -- Default Graph parameters global n global d global rep_SameGraph global FILENAMEZ global initial_h0 global exponent global length global variant global alpha_vec global beta_vec global gamma_vec global s_vec global clip_on_vec global numMaxIt_vec # Plotting Parameters global xtick_lab global xtick_labels global ytick_lab global xmax global xmin global ymin global ymax global labels global facecolor_vec global draw_std_vec global linestyle_vec global linewidth_vec global marker_vec global markersize_vec global legend_location global option_vec global learning_method_vec global Macro_Accuracy global EC global constraints global weight_vec global randomize_vec global k global fig_label global err global avoidNeighbors global convergencePercentage_W global stratified global gradient global doubly_stochastic global numberOfSplits global select_lambda_vec global lambda_vec global f_vec if choice == 0: None elif choice == 304: ## with varying weights FILENAMEZ = 'prop37' Macro_Accuracy = True fig_label = 'Prop37' legend_location = 'lower right' n = 62000 d = 34.8 select_lambda_vec = [False] * 5 # select_lambda_vec = [False] * 3 + [True] * 2 # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 
/ 5)**x for x in range(21)] # lambda_vec = [0.5] * 21 # same length as f_vec elif choice == 305: # Test row stochastic cases choose(304) doubly_stochastic = False # -- Yelp dataset elif choice == 501: FILENAMEZ = 'yelp' Macro_Accuracy = True weight_vec = [None] * 3 + [10, 10] gradient = True ymin = 0.1 ymax = 0.75 fig_label = 'Yelp' legend_location = 'upper left' n = 4301900 # for figure d = 6.56 # for figure # -- Flickr dataset elif choice == 601: FILENAMEZ = 'flickr' Macro_Accuracy = True fig_label = 'Flickr' legend_location = 'lower right' n = 2007369 d = 18.1 elif choice == 602: ## with varying weights choose(601) select_lambda_vec = [False] * 4 + [ True ] # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 603: ## with varying weights choose(602) select_lambda_vec = [False] * 3 + [ True ] * 2 # allow to choose lambda for different f in f_vec # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6 # same length as f_vec elif choice == 604: ## with weight = 1 draw_std_vec = [4] choose(603) lambda_vec = [0.5] * 21 # same length as f_vec # -- DBLP dataset elif choice == 701: FILENAMEZ = 'dblp.txt' Macro_Accuracy = True ymin = 0.2 ymax = 0.5 fig_label = 'DBLP' legend_location = 'lower right' n = 2241258 # for figure d = 26.11 # for figure # -- ENRON dataset elif choice == 801: FILENAMEZ = 'enron' Macro_Accuracy = True ymin = 0.3 ymax = 0.75 fig_label = 'Enron' f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] legend_location = 'upper left' n = 46463 # for figures d = 23.4 # for figures elif choice == 802: ### WITH ADAPTIVE WEIGHTS choose(801) select_lambda_vec = [False] * 4 + [ True ] # allow to choose lambda for different f in f_vec lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 803: ### WITH ADAPTIVE WEIGHTS choose(802) lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [ 1 ] * 6 # same length as f_vec elif choice == 804: 
choose(803) elif choice == 805: choose(801) doubly_stochastic = False elif choice == 821: FILENAMEZ = 'enron' Macro_Accuracy = True constraints = True # True gradient = True option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5'] learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE'] weight_vec = [None] * 3 + [0.2, 0.2] randomize_vec = [False] * 4 + [True] xmin = 0.0001 ymin = 0.0 ymax = 0.7 labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCE r'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [4] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2] marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] fig_label = 'Enron' legend_location = 'lower right' n = 46463 # for figures d = 23.4 # for figures alpha = 0.0 beta = 0.0 gamma = 0.0 s = 0.5 numMaxIt = 10 select_lambda_vec = [False] * 3 + [True] * 2 lambda_vec = [0.2] * 13 + [10] * 8 # same length as f_vec captionText = "DCE weight=[0.2*13] [10*8], s={}, numMaxIt={}".format( s, numMaxIt) # -- Cora dataset elif choice == 901: FILENAMEZ = 'cora' Macro_Accuracy = True constraints = True # True option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5'] learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE'] weight_vec = [None] * 3 + [10, 10] numMaxIt_vec = [10] * 10 randomize_vec = [False] * 4 + [True] gradient = True xmin = 0.001 ymin = 0.0 ymax = 0.9 labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [4] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2] marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] fig_label = 'Cora' legend_location = 'lower right' n = 2708 d = 7.8 # -- Citeseer dataset elif CHOICE == 1001: FILENAMEZ = 'citeseer' Macro_Accuracy = True constraints = True # True option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5'] learning_method_vec = 
['GT', 'LHE', 'MHE', 'DHE', 'DHE'] weight_vec = [None] * 3 + [10, 10] numMaxIt_vec = [10] * 10 randomize_vec = [False] * 4 + [True] gradient = True xmin = 0.001 ymin = 0.0 ymax = 0.75 labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [4] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2] marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] fig_label = 'Citeseer' legend_location = 'lower right' n = 3312 d = 5.6 elif CHOICE == 1101: FILENAMEZ = 'hep-th' Macro_Accuracy = True constraints = True # True option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5'] learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE'] weight_vec = [None] * 3 + [10, 10] numMaxIt_vec = [10] * 10 randomize_vec = [False] * 4 + [True] gradient = True xmin = 0.0001 ymin = 0.0 ymax = 0.1 labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [4] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2] marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] fig_label = 'Hep-th' legend_location = 'lower right' n = 27770 d = 5.6 elif CHOICE == 1204: FILENAMEZ = 'pokec-gender' Macro_Accuracy = True constraints = True # True option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5'] learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE'] weight_vec = [None] * 3 + [10, 10] numMaxIt_vec = [10] * 10 randomize_vec = [False] * 4 + [True] gradient = True xmin = 0.000015 ymin = 0.0 ymax = 0.75 labels = ['GT', 'LCE', 'MCE', 'DCE', 'DCE r'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [0, 3, 4, 4, 4, 4] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2] marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 
8, 8, 8] fig_label = 'Pokec-Gender' legend_location = 'lower right' n = 1632803 d = 54.6 else: raise Warning("Incorrect choice!") choose(CHOICE) csv_filename = 'Fig_End-to-End_accuracy_{}_{}.csv'.format( CHOICE, FILENAMEZ) header = ['currenttime', 'method', 'f', 'precision', 'recall', 'accuracy'] if CREATE_DATA: save_csv_record(join(data_directory, csv_filename), header, append=False) # print("choice: {}".format(CHOICE)) # --- print data statistics if CALCULATE_DATA_STATISTICS: Xd, W = load_Xd_W_from_csv( join(realDataDir, FILENAMEZ) + '-classes.csv', join(realDataDir, FILENAMEZ) + '-neighbors.csv') X0 = from_dictionary_beliefs(Xd) n = len(Xd.keys()) d = (len(W.nonzero()[0]) * 2) / n print("FILENAMEZ:", FILENAMEZ) print("n:", n) print("d:", d) # -- Graph statistics n_vec = calculate_nVec_from_Xd(Xd) print("n_vec:\n", n_vec) d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd) print("d_vec:\n", d_vec) P = calculate_Ptot_from_graph(W, Xd) print("P:\n", P) # -- Various compatibilities H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC, weights=1, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) print("H0 w/ constraints:\n", np.round(H0, 2)) raw_input() H2 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC, weights=1, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) H4 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC, weights=2, randomize=False, gradient=gradient, doubly_stochastic=doubly_stochastic) H5 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC, weights=2, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) H6 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC, weights=10, randomize=False, gradient=gradient, doubly_stochastic=doubly_stochastic) H7 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC, weights=10, randomize=False, constraints=True, gradient=gradient, 
doubly_stochastic=doubly_stochastic) # print("H MCE w/o constraints:\n", np.round(H0, 3)) print("H MCE w/ constraints:\n", np.round(H2, 3)) # print("H DCE 2 w/o constraints:\n", np.round(H4, 3)) print("H DCE 2 w/ constraints:\n", np.round(H5, 3)) # print("H DCE 10 w/o constraints:\n", np.round(H6, 3)) print("H DCE 20 w/ constraints:\n", np.round(H7, 3)) H_row_vec = H_observed(W, X0, 3, NB=True, variant=1) print("H_est_1:\n", np.round(H_row_vec[0], 3)) print("H_est_2:\n", np.round(H_row_vec[1], 3)) print("H_est_3:\n", np.round(H_row_vec[2], 3)) # --- Create data if CREATE_DATA or ADD_DATA: Xd, W = load_Xd_W_from_csv( join(realDataDir, FILENAMEZ) + '-classes.csv', join(realDataDir, FILENAMEZ) + '-neighbors.csv') X0 = from_dictionary_beliefs(Xd) n = len(Xd.keys()) ## number of nodes in graph d = (len(W.nonzero()[0]) * 2) / n # print(n) # print(d) # print("contraint = {}".format(constraints)) # --- Calculating True Compatibility matrix H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC, weights=1, randomize=False, constraints=constraints, gradient=gradient, doubly_stochastic=doubly_stochastic) # print(H0) H0c = to_centering_beliefs(H0) graph_workers = [] gq = multiprocessing.Queue() for j in range(rep_SameGraph): # repeat several times for same graph # print("Graph: {}".format(j)) graph_workers.append( multiprocessing.Process(target=graph_worker, args=(X0, W, gq))) for gw in graph_workers: gw.start() for gw in graph_workers: for t in exp_backoff: gw.join(t) if gw.exitcode is None: print( "failed to join graph worker {} after {} seconds, retrying" .format(gw, t)) else: continue print("Failed to join graph worker {}.".format(gw)) gq.put('STOP') for i in iter(gq.get, 'STOP'): save_csv_record(join(data_directory, csv_filename), i) # -- Read, aggregate, and pivot data for all options df1 = pd.read_csv(join(data_directory, csv_filename)) acc_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format( CHOICE, FILENAMEZ) pr_filename = 
'Fig_End-to-End_PR_realData{}_{}.pdf'.format( CHOICE, FILENAMEZ) # generate_figure(data_directory, acc_filename, df1) # generate_figure(data_directory, pr_filename, df1, metric='pr') # print("\n-- df1: (length {}):\n{}".format(len(df1.index), df1.head(5))) # Aggregate repetitions if "option" in df1.columns.values: pivot_col = "option" pivot_vec = option_vec else: pivot_col = "method" pivot_vec = learning_method_vec df2 = df1.groupby([pivot_col, 'f']).agg \ ({'accuracy': [np.mean, np.std, np.size], # Multiple Aggregates }) df2.columns = ['_'.join(col).strip() for col in df2.columns.values ] # flatten the column hierarchy df2.reset_index(inplace=True) # remove the index hierarchy df2.rename(columns={'accuracy_size': 'count'}, inplace=True) # print("\n-- df2 (length {}):\n{}".format(len(df2.index), df2.head(500))) # Pivot table df3 = pd.pivot_table(df2, index='f', columns=pivot_col, values=['accuracy_mean', 'accuracy_std']) # Pivot # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(30))) df3.columns = ['_'.join(col).strip() for col in df3.columns.values ] # flatten the column hierarchy df3.reset_index(inplace=True) # remove the index hierarchy # df2.rename(columns={'time_size': 'count'}, inplace=True) # print("\n-- df3 (length {}):\n{}".format(len(df3.index), df3.head(5))) # Extract values X_f = df3['f'].values # plot x values Y = [] Y_std = [] for val in pivot_vec: Y.append(df3['accuracy_mean_{}'.format(val)].values) if STD_FILL: Y_std.append(df3['accuracy_std_{}'.format(val)].values) if CREATE_PDF or SHOW_PDF or SHOW_PLOT: print("Setting up figure...") # -- Setup figure # remove 4 last characters ".txt" fig_filename = 'Fig_End-to-End_accuracy_realData{}_{}.pdf'.format( CHOICE, FILENAMEZ) mpl.rc( 'font', **{ 'family': 'sans-serif', 'sans-serif': [u'Arial', u'Liberation Sans'] }) mpl.rcParams['axes.labelsize'] = 20 mpl.rcParams['xtick.labelsize'] = 16 mpl.rcParams['ytick.labelsize'] = 16 mpl.rcParams['legend.fontsize'] = 14 # 6 
mpl.rcParams['grid.color'] = '777777' # grid color mpl.rcParams[ 'xtick.major.pad'] = 2 # padding of tick labels: default = 4 mpl.rcParams[ 'ytick.major.pad'] = 1 # padding of tick labels: default = 4 mpl.rcParams['xtick.direction'] = 'out' # default: 'in' mpl.rcParams['ytick.direction'] = 'out' # default: 'in' mpl.rcParams['axes.titlesize'] = 16 mpl.rcParams['figure.figsize'] = [4, 4] fig = figure() ax = fig.add_axes([0.13, 0.17, 0.8, 0.8]) # -- Drawing if STD_FILL: for choice, (option, facecolor) in enumerate(zip(option_vec, facecolor_vec)): if choice in draw_std_vec: ax.fill_between(X_f, Y[choice] + Y_std[choice], Y[choice] - Y_std[choice], facecolor=facecolor, alpha=0.2, edgecolor=None, linewidth=0) ax.plot(X_f, Y[choice] + Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid') ax.plot(X_f, Y[choice] - Y_std[choice], linewidth=0.5, color='0.8', linestyle='solid') for choice, (option, label, color, linewidth, clip_on, linestyle, marker, markersize) in \ enumerate(zip(option_vec, labels, facecolor_vec, linewidth_vec, clip_on_vec, linestyle_vec, marker_vec, markersize_vec)): ax.plot(X_f, Y[choice], linewidth=linewidth, color=color, linestyle=linestyle, label=label, zorder=4, marker=marker, markersize=markersize, markeredgewidth=1, clip_on=clip_on) # -- Title and legend if n < 1000: n_label = '{}'.format(n) else: n_label = '{}k'.format(int(n / 1000)) title(r'$\!\!\!\!\!\!\!${}: $n={}, d={}$'.format( fig_label, n_label, np.round(d, 1))) handles, labels = ax.get_legend_handles_labels() legend = plt.legend( handles, labels, loc=legend_location, # 'upper right' handlelength=2, labelspacing=0, # distance between label entries handletextpad= 0.3, # distance between label and the line representation # title='Variants', borderaxespad=0.2, # distance between legend and the outer axes borderpad=0.3, # padding inside legend box numpoints=1, # put the marker only once ) # # legend.set_zorder(1) frame = legend.get_frame() frame.set_linewidth(0.0) frame.set_alpha(0.9) # 
0.8 plt.xscale('log') # -- Figure settings and save plt.xticks(xtick_lab, xtick_labels) plt.yticks(ytick_lab, ytick_lab) # Only show ticks on the left and bottom spines ax.yaxis.set_ticks_position('left') ax.xaxis.set_ticks_position('bottom') ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.1f')) grid(b=True, which='major', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5) # linestyle='dashed', which='minor', axis='y', grid(b=True, which='minor', axis='both', alpha=0.2, linestyle='solid', linewidth=0.5) # linestyle='dashed', which='minor', axis='y', xlabel(r'Label Sparsity $(f)$', labelpad=0) # labelpad=0 ylabel(r'Accuracy', labelpad=0) xlim(xmin, xmax) ylim(ymin, ymax) if CREATE_PDF: print("saving PDF of figure...") savefig(join(figure_directory, fig_filename), format='pdf', dpi=None, edgecolor='w', orientation='portrait', transparent=False, bbox_inches='tight', pad_inches=0.05, frameon=None) if SHOW_PLOT: print("Showing plot...") plt.show() if SHOW_PDF: print("Showing pdf...") showfig(join(figure_directory, fig_filename)) # shows actually created PDF
def test_smallMotivatingGraph_statistics():
    """Print graph statistics for the 15-node motivating example from the VLDB introduction.

    Exercises: 'create_blocked_matrix_from_graph()', 'calculate_Ptot_from_graph()',
    'calculate_nVec_from_Xd()', 'calculate_outdegree_distribution_from_graph()',
    'calculate_average_outdegree_from_graph()'.
    Edges are weighted so that the class affinities are easier to see in the
    blocked matrix. Node ids are randomly permuted first, then re-blocked, to
    show that the blocking recovers the class structure. Finally saves and opens
    a PNG of the blocked adjacency matrix (the 'open' command is macOS-specific).
    """
    # Choose whether the example edge list is interpreted as directed or undirected.
    # VERSION = 'directed'      # 1: directed
    VERSION = 'undirected'      # 2: undirected

    print("\n--- Example graph from VLDB slides ---")
    # Node id -> class id. The dictionary length must equal the number of nodes
    # referenced by the edge lists below.
    Xd = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0,
          5: 1, 6: 1, 7: 1, 8: 1, 9: 1,
          10: 2, 11: 2, 12: 2, 13: 2, 14: 2,
          # 15: 2  # length of dictionary need to be = number of nodes in edges
          }

    # # Original VLDB drawing
    # row =    [0, 2, 0, 1, 1, 2, 2, 2, 3, 4, 3, 4, 6, 7, 8, 9, 10, 10, 10, 11, 11, 11, 12, 13,]
    # col =    [1, 3, 5, 8, 9, 6, 7, 8, 9, 5, 11, 12, 8, 9, 10, 14, 11, 12, 14, 12, 13, 14, 14, 14,]
    # weight = [1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, ]

    # # Corrected undirected graph with correct P_tot [2, 8, 2]
    # row =    [0, 0, 1, 1, 2, 2, 2, 3, 4, 3, 4, 6, 8, 9, 10, 11, 12, 13,]
    # col =    [1, 5, 8, 9, 6, 7, 8, 9, 5, 11, 12, 8, 10, 14, 11, 13, 14, 14,]
    # weight = [1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 6, 6, ]

    # # Corrected undirected graph with correct P_tot [2, 6, 2]
    row = [0, 0, 1, 1, 2, 2, 3, 3, 4, 6, 8, 9, 10, 12, 13,]
    col = [1, 5, 8, 9, 6, 7, 9, 11, 12, 8,10, 14, 11, 14, 14,]
    weight = [1, 2, 2, 2, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 6, ]
    print("Xd:", Xd)

    if VERSION == 'undirected':
        # Symmetrize: duplicate every edge in the reverse direction (same weight).
        weight = weight + weight
        row, col = row + col, col + row     # !!! assignment at same time: same line
    print("row:", row)
    print("col:", col)
    print("weight:", weight, "\n")

    print("- Random permutation of node ids:")
    ranks = np.random.permutation(len(Xd))  # ranks is the new mapping vector: old id -> new id
    print("ranks:", ranks)
    row2 = ranks[row]                       # !!! mapping applied via fancy indexing
    col2 = ranks[col]
    print("row2:", row2.tolist())           # list plots nicer than np.array
    print("col2:", col2.tolist())
    print("weight:", weight)
    W_rand = csr_matrix((weight, (row2, col2)), shape=(15, 15))
    nodes = np.array(list(Xd.keys()))
    nodes2 = ranks[nodes]
    print("nodes: ", nodes)
    print("nodes2: ", nodes2)
    classes = np.array(list(Xd.values()))   # Python 3 requires list(dict.keys()), and also for values
    print("classes: ", classes)
    # Class dictionary under the permuted node ids.
    Xd_rand = dict(zip(ranks[nodes], classes))
    print("Xd_rand: {}".format(Xd_rand))
    print("W_rand:\n{}".format(W_rand.todense()))

    print("\n- 'create_blocked_matrix_from_graph():' ")
    # Re-block the permuted matrix so nodes of the same class are contiguous again.
    W_block, Xd_new = create_blocked_matrix_from_graph(W_rand, Xd_rand)
    W = W_block
    Xd = Xd_new
    print("W:\n{}".format(W.todense()))

    print("\n- 'test_calculate_Ptot_from_graph():' ")
    W2 = csr_matrix(W, copy=True)
    W2.data[:] = np.sign(W2.data)           # W contains weighted edges -> unweighted before counting edges with Ptot
    Ptot = calculate_Ptot_from_graph(W2, Xd)
    print("Ptot:\n{}".format(Ptot))

    print("\n- 'test_calculate_nVec_from_Xd():' ")
    n_vec = calculate_nVec_from_Xd(Xd)
    print("n_vec: {}".format(n_vec))

    print("\n- 'calculate_outdegree_distribution_from_graph():' ")
    # Indegree distributions are obtained by running the outdegree routine on W^T.
    print("Outdegree distribution: {}".format( calculate_outdegree_distribution_from_graph(W, Xd=None) ))
    # print ("Outdegree distribution: {}".format( sorted(calculate_outdegree_distribution_from_graph(W, Xd=None).items()) ))
    print("Outdegree distribution per class: {}".format( calculate_outdegree_distribution_from_graph(W, Xd) ))
    print("Indegree distribution: {}".format( calculate_outdegree_distribution_from_graph(W.transpose(), Xd=None) ))
    print("Indegree distribution per class: {}".format(calculate_outdegree_distribution_from_graph(W.transpose(), Xd)))

    print("\n- 'calculate_average_outdegree_from_graph():' ")
    print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W, Xd=None)))
    print("Average outdegree per class: {}".format(calculate_average_outdegree_from_graph(W, Xd)))
    print("Average indegree: {}".format(calculate_average_outdegree_from_graph(W.transpose(), Xd=None)))
    print("Average indegree per class: {}".format(calculate_average_outdegree_from_graph(W.transpose(), Xd)))

    print("\n- Visualize adjacency matrix")
    plt.matshow(W.todense(), fignum=100, cmap=plt.cm.Greys)  # cmap=plt.cm.gray / Blues
    # Tick positions at 4.5 / 9.5 draw the grid on the class boundaries
    # (5 nodes per class in the 15-node example).
    plt.xticks([4.5, 9.5])
    plt.yticks([4.5, 9.5])
    plt.grid(which='major')
    frame = plt.gca()
    frame.axes.xaxis.set_ticklabels([])
    frame.axes.yaxis.set_ticklabels([])
    plt.savefig('figs/Fig_test_calculate_Ptot_from_graph.png')
    os.system('open "figs/Fig_test_calculate_Ptot_from_graph.png"')  # NOTE: 'open' is macOS-only
def test_calculate_nVec_from_Xd():
    """Sanity-check 'calculate_nVec_from_Xd' by printing the class-size vector of a small label dictionary."""
    print("\n--- 'calculate_nVec_from_Xd(Xd):' ---")
    # All keys must be mutually comparable: Python 3 no longer allows comparing
    # str and int (Python 2 did), so the former mixed-key example
    # {'n1': 1, 'n2': 2, 3: 3, ...} would break here.
    node_to_class = {1: 1, 2: 2, 3: 3, 4: 1, 5: 0, 6: 0, 7: 0}
    print("Xd: {}".format(node_to_class))
    print("Result: {}".format(calculate_nVec_from_Xd(node_to_class)))
def test_planted_distribution_model():
    """Test the main graph generators with statistics, degree-distribution plots, and a block-matrix figure.

    Exercises 'planted_distribution_model' / 'planted_distribution_model_H',
    'calculate_Ptot_from_graph', 'calculate_nVec_from_Xd',
    'calculate_average_outdegree_from_graph',
    'calculate_outdegree_distribution_from_graph', 'connected_components',
    'eps_convergence_linbp', and 'create_blocked_matrix_from_graph'.
    Saves two PDFs under figs/ and opens them ('open' is macOS-only).

    Fixes vs. earlier revision:
      - 'directed' now has a top-level default (it was only set inside some
        CHOICE branches, giving a NameError for the others).
      - dict .keys()/.values() views are wrapped in list() before np.array()
        so the degree/count tables print correctly under Python 3.
      - removed savefig(papertype=...) (deprecated in Matplotlib 3.3,
        removed in 3.6; cf. the frameon note below).
    """
    print("\n--- 'planted_distribution_model_H', 'planted_distribution_model_P', 'number_of_connectedComponents', 'create_blocked_matrix_from_graph' --")
    CHOICE = 21
    print("CHOICE:", CHOICE)
    debug = 0

    # -- Defaults; individual CHOICE branches below override as needed.
    directed = True             # !!! TODO: not yet clear what undirected means here, only P accepts directed
    backEdgesAllowed = True     # ??? should be enforced in code
    sameInAsOutDegreeRanking = False
    distribution = 'powerlaw'
    exponent = -0.3
    VERSION_P = True            # True: specify potential P and edge count m; False: specify H and out-degrees

    # --- AAAI figures ---
    if CHOICE in [1, 2, 3, 4, 5, 6]:
        n = 120
        alpha0 = [1/6, 1/3, 1/2]
        h = 8
        P = np.array([[1, h, 1],
                      [1, 1, h],
                      [h, 1, 1]])
        if CHOICE == 1:     # P (equivalent to 2), AAAI 2
            m = 1080
        elif CHOICE == 2:   # H (equivalent to 1)
            H0 = row_normalize_matrix(P)
            d_vec = [18, 9, 6]
            VERSION_P = False
        elif CHOICE == 3:   # H (equivalent to 4), AAAI 3
            H0 = row_normalize_matrix(P)
            d_vec = 9
            VERSION_P = False
        elif CHOICE == 4:   # P (equivalent to 3)
            P = np.array([[1, h, 1],
                          [2, 2, 2*h],
                          [3*h, 3, 3]])
            m = 1080
        elif CHOICE == 5:   # H (equivalent to 2), but backedges=False
            H0 = row_normalize_matrix(P)
            d_vec = [18, 9, 6]
            VERSION_P = False
            backEdgesAllowed = False
        elif CHOICE == 6:   # P undirected, AAAI 4
            P = np.array([[1, h, 1],
                          [h, 1, 1],
                          [1, 1, h]])
            directed = False
            backEdgesAllowed = False
            m = 540

    # --- AGAIN DIRECTED ---
    if CHOICE == 12:
        n = 1001
        alpha0 = [0.6, 0.2, 0.2]
        P = np.array([[0.1, 0.8, 0.1],
                      [0.8, 0.1, 0.1],
                      [0.1, 0.1, 0.8]])
        m = 3000
        distribution = 'uniform'    # uniform powerlaw
        exponent = None
        backEdgesAllowed = False    # ??? should be enforced in code
    if CHOICE == 13:    # Nice for block matrix visualization
        n = 1000
        alpha0 = [0.334, 0.333, 0.333]
        h = 2
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        m = 2000
        distribution = 'uniform'    # uniform powerlaw
        exponent = None
        backEdgesAllowed = False    # ??? should be enforced in code
    if CHOICE == 14:
        n = 1000
        alpha0 = [0.3334, 0.3333, 0.3333]
        h = 10
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        m = 10000
        exponent = -0.55

    # --- UNDIRECTED ---
    if CHOICE == 20:
        n = 100
        alpha0 = [0.6, 0.2, 0.2]
        h = 1.4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        H0 = row_normalize_matrix(P)
        d_vec = 5
        directed = False
        exponent = -0.3
        VERSION_P = False
    elif CHOICE == 21:
        n = 1001
        alpha0 = [0.6, 0.2, 0.2]
        h = 4
        P = np.array([[1, h, 1],
                      [h, 1, 1],
                      [1, 1, h]])
        H0 = row_normalize_matrix(P)
        d_vec = 3.4                     # don't specify vector for undirected
        distribution = 'uniform'        # uniform powerlaw
        exponent = -0.5
        directed = False
        backEdgesAllowed = True         # ignored in code for undirected
        VERSION_P = False
        sameInAsOutDegreeRanking = True # ignored in code for undirected
    elif CHOICE == 22:
        n = 1000
        m = 3000
        alpha0 = [0.6, 0.2, 0.2]
        h = 4
        P = np.array([[1, 3*h, 1],
                      [2*h, 1, 1],
                      [1, 1, h]])
        distribution = 'uniform'        # uniform powerlaw
        exponent = -0.5
        directed = False
        backEdgesAllowed = False        # ignored in code for undirected
        sameInAsOutDegreeRanking = True # ignored in code for undirected
        debug = 0
        VERSION_P = True
        H0 = row_normalize_matrix(P)

    # --- Create the graph
    start = time.time()
    if VERSION_P:
        W, Xd = planted_distribution_model(n, alpha=alpha0, P=P, m=m,
                                           distribution=distribution, exponent=exponent,
                                           directed=directed,
                                           backEdgesAllowed=backEdgesAllowed,
                                           sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
                                           debug=debug)
    else:
        W, Xd = planted_distribution_model_H(n, alpha=alpha0, H=H0, d_out=d_vec,
                                             distribution=distribution, exponent=exponent,
                                             directed=directed,
                                             backEdgesAllowed=backEdgesAllowed,
                                             sameInAsOutDegreeRanking=sameInAsOutDegreeRanking,
                                             debug=debug)
    time_est = time.time()-start
    print("Time for graph generation: {}".format(time_est))

    # - Undirected degrees: In + Out
    # If backEdgesAllowed then there can be edges in both directions; the
    # element-wise product with the transpose keeps exactly those edges.
    W_und = W.multiply(W.transpose())
    # W_und.data[:] = np.sign(W_und.data)  # W contains weighted edges -> unweighted before counting edges with Ptot
    print("Fraction of edges that go in both directions: {}".format(np.sum(W_und.data) / np.sum(W.data)))

    # --- Statistics on created graph
    print("\n- 'calculate_Ptot_from_graph':")
    P_tot = calculate_Ptot_from_graph(W, Xd)
    print("P_tot:\n{}".format(P_tot))
    print("sum(P_tot): {}".format(np.sum(P_tot)))
    print("P (normalized to sum=1):\n{}".format(1. * P_tot / np.sum(P_tot)))  # Potential: normalized sum = 1
    H = row_normalize_matrix(P_tot)
    print("H (row-normalized):\n{}".format(H))

    print("\n- 'calculate_nVec_from_Xd':")
    n_vec = calculate_nVec_from_Xd(Xd)
    print("n_vec: {}".format(n_vec))
    print("alpha: {}".format(1.*n_vec / sum(n_vec)))

    print("\n- Average Out/Indegree 'calculate_average_outdegree_from_graph' (assumes directed for total; for undirected the totals are incorrect):")
    print("Average outdegree: {}".format(calculate_average_outdegree_from_graph(W)))
    print("Average indegree: {}".format(calculate_average_outdegree_from_graph(W.transpose())))
    print("Average total degree: {}".format(calculate_average_outdegree_from_graph(W + W.transpose())))
    print("Average outdegree per class: {}".format(calculate_average_outdegree_from_graph(W, Xd)))
    print("Average indegree per class: {}".format(calculate_average_outdegree_from_graph(W.transpose(), Xd)))
    print("Average total degree per class: {}".format(calculate_average_outdegree_from_graph(W + W.transpose(), Xd)))

    # - Overall degree distribution: In / out
    print("\n- Overall Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':")
    print("Overall Out and Indegree distribution:")
    d_out_vec_tot = calculate_outdegree_distribution_from_graph(W, Xd=None)
    d_in_vec_tot = calculate_outdegree_distribution_from_graph(W.transpose(), Xd=None)
    # list(...) around dict views: Python 3 fix so np.array builds the 2 x k
    # degree/count table instead of wrapping the view objects.
    print("Outdegree distribution (degree / number):\n{}".format(np.array([list(d_out_vec_tot.keys()), list(d_out_vec_tot.values())])))
    print("Indegree distribution (degree / number):\n{}".format(np.array([list(d_in_vec_tot.keys()), list(d_in_vec_tot.values())])))

    # - Overall degree distribution: In + Out
    d_tot_vec_tot = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd=None)
    print("Total degree distribution (degree / number):\n{}".format(np.array([list(d_tot_vec_tot.keys()), list(d_tot_vec_tot.values())])))

    # - Per-class degree distribution: In / out
    print("\n- Per-class Out/In/Total degree distribution 'calculate_outdegree_distribution_from_graph':")
    print("\nOutdegree distribution per class:")
    d_out_vec = calculate_outdegree_distribution_from_graph(W, Xd)
    for i in range(len(d_out_vec)):
        print("Class {}:".format(i))
        print(np.array([list(d_out_vec[i].keys()), list(d_out_vec[i].values())]))
    print("Indegree distribution per class:")
    d_in_vec = calculate_outdegree_distribution_from_graph(W.transpose(), Xd)
    for i in range(len(d_in_vec)):
        print("Class {}:".format(i))
        print(np.array([list(d_in_vec[i].keys()), list(d_in_vec[i].values())]))

    # - per-class degree distribution: In + out
    print("\nTotal degree distribution per class:")
    d_vec_und = calculate_outdegree_distribution_from_graph(W + W.transpose(), Xd)
    for i in range(len(d_vec_und)):
        print("Class {}:".format(i))
        print(np.array([list(d_vec_und[i].keys()), list(d_vec_und[i].values())]))

    print("\n- number of weakly connected components':")
    print("Number of weakly connected components: {}".format(connected_components(W, directed=True, connection='weak', return_labels=False)))

    # --- convergence boundary
    # print("\n- '_out_eps_convergence_directed_linbp', 'eps_convergence_linbp'")
    # if directed:
    #     eps_noEcho = _out_eps_convergence_directed_linbp(P, W, echo=False)
    #     eps_Echo = _out_eps_convergence_directed_linbp(P, W, echo=True)
    # else:
    Hc = to_centering_beliefs(H)
    eps_noEcho = eps_convergence_linbp(Hc, W, echo=False)
    eps_Echo = eps_convergence_linbp(Hc, W, echo=True)
    print("Eps (w/ echo): {}".format(eps_Echo))
    print("Eps (no echo): {}".format(eps_noEcho))

    # --- Fig1: Draw edge distributions
    print("\n- Fig1: Draw degree distributions")
    params = {'backend': 'pdf',
              'lines.linewidth': 4,
              'font.size': 10,
              'axes.labelsize': 24,     # fontsize for x and y labels (was 10)
              'axes.titlesize': 22,
              'xtick.labelsize': 20,
              'ytick.labelsize': 20,
              'legend.fontsize': 8,
              'figure.figsize': [5, 4],
              'font.family': 'sans-serif'}
    mpl.rcdefaults()
    mpl.rcParams.update(params)
    fig = plt.figure(1)
    ax = fig.add_axes([0.15, 0.15, 0.8, 0.8])  # main axes
    ax.xaxis.labelpad = -12
    ax.yaxis.labelpad = -12

    # A: Draw directed degree distribution (rank plot: sorted degree vs. rank)
    y_vec = []
    for i in range(len(d_out_vec)):
        y = np.repeat(list(d_out_vec[i].keys()), list(d_out_vec[i].values()))  # !!! np.repeat expands counts to one entry per node
        y = -np.sort(-y)    # descending
        y_vec.append(y)
        # print ("Class {}:\n{}".format(i,y))
    y_tot = np.repeat(list(d_out_vec_tot.keys()), list(d_out_vec_tot.values()))  # total outdegree
    y_tot = -np.sort(-y_tot)
    plt.loglog(range(1, len(y_vec[0])+1), y_vec[0], lw=4, color='orange', label=r"A out", linestyle='-')  # !!! plot default index starts from 0 otherwise
    plt.loglog(range(1, len(y_vec[1])+1), y_vec[1], lw=4, color='blue', label=r"B out", linestyle='--')
    plt.loglog(range(1, len(y_vec[2])+1), y_vec[2], lw=4, color='green', label=r"C out", linestyle=':')
    plt.loglog(range(1, len(y_tot)+1), y_tot, lw=1, color='black', label=r"tot out", linestyle='-')

    # B: Draw second edge distribution of undirected degree distribution
    y_vec = []
    for i in range(len(d_vec_und)):
        y = np.repeat(list(d_vec_und[i].keys()), list(d_vec_und[i].values()))  # !!! np.repeat
        y = -np.sort(-y)
        y_vec.append(y)
        # print ("Class {}:\n{}".format(i,y))
    y_tot = np.repeat(list(d_tot_vec_tot.keys()), list(d_tot_vec_tot.values()))  # total degree
    y_tot = -np.sort(-y_tot)
    plt.loglog(range(1, len(y_vec[0])+1), y_vec[0], lw=4, color='orange', label=r"A", linestyle='-')
    plt.loglog(range(1, len(y_vec[1])+1), y_vec[1], lw=4, color='blue', label=r"B", linestyle='--')
    plt.loglog(range(1, len(y_vec[2])+1), y_vec[2], lw=4, color='green', label=r"C", linestyle=':')
    plt.loglog(range(1, len(y_tot)+1), y_tot, lw=1, color='black', label=r"tot", linestyle='-')

    plt.legend(loc='upper right', labelspacing=0)
    filename = 'figs/Fig_test_planted_distribution_model1_{}.pdf'.format(CHOICE)
    plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=True, bbox_inches='tight', pad_inches=0.1,
                # papertype removed: deprecated in Matplotlib 3.3, removed in 3.6
                # frameon=None,  # TODO: frameon deprecated
                )
    os.system("open " + filename)  # NOTE: 'open' is macOS-only

    # --- Fig2: Draw block matrix
    print("\n- Fig2: 'create_blocked_matrix_from_graph'")
    W_new, Xd_new = create_blocked_matrix_from_graph(W, Xd)

    fig = plt.figure(2)
    row, col = W_new.nonzero()  # transform the sparse W back to row col format
    plt.plot(col, row, 'o', color='r', markersize=2, markeredgewidth=2, lw=0, zorder=3)  # Notice (col, row) because first axis is vertical in matrices
    # plt.matshow(W_new.todense(), cmap=plt.cm.Greys)  # cmap=plt.cm.gray / Blues   # alternative that does not work as well
    plt.gca().invert_yaxis()    # invert the y-axis to start on top and go down

    # Show quadrants: grid lines at the cumulative class sizes
    d1 = alpha0[0] * n
    d2 = (alpha0[0] + alpha0[1]) * n
    plt.grid(which='major', color='0.7', linestyle='-', linewidth=1)
    plt.xticks([0, d1, d2, n])
    plt.yticks([0, d1, d2, n])
    plt.xlabel('to', labelpad=-1)
    plt.ylabel('from', rotation=90, labelpad=0)

    frame = plt.gca()
    # frame.axes.xaxis.set_ticklabels([])  # would hide the labels
    # frame.axes.yaxis.set_ticklabels([])
    frame.tick_params(direction='inout', width=1, length=10)

    filename = 'figs/Fig_test_planted_distribution_model2_{}.pdf'.format(CHOICE)
    plt.savefig(filename, dpi=None, facecolor='w', edgecolor='w',
                orientation='portrait', format='pdf',
                transparent=True, bbox_inches='tight', pad_inches=0.1)
    os.system("open " + filename)  # NOTE: 'open' is macOS-only
def run(choice, create_data=False, add_data=False, show_plot=False, create_pdf=False, show_pdf=False): # -- Setup CHOICE = choice #300 Prop37, 400 MovieLens, 500 Yelp, 600 Flickr, 700 DBLP, 800 Enron experiments = [CHOICE] CREATE_DATA = create_data ADD_DATA = add_data SHOW_PDF = show_pdf SHOW_PLOT = show_plot CREATE_PDF = create_pdf SHOW_FIG = SHOW_PLOT or SHOW_PDF or CREATE_PDF STD_FILL = True TIMING = False CALCULATE_DATA_STATISTICS = False # -- Default Graph parameters rep_SameGraph = 10 # iterations on same graph initial_h0 = None # initial vector to start finding optimal H exponent = -0.3 length = 5 variant = 1 alpha_vec = [0] * 10 beta_vec = [0] * 10 gamma_vec = [0] * 10 s_vec = [0.5] * 10 clip_on_vec = [True] * 10 numMaxIt_vec = [10] * 10 # Plotting Parameters xtick_lab = [0.001, 0.01, 0.1, 1] xtick_labels = ['0.1\%', '1\%', '10\%', '100\%'] ytick_lab = np.arange(0, 1.1, 0.1) xmax = 1 xmin = 0.0001 ymin = 0.3 ymax = 0.7 labels = ['GS', 'LCE', 'MCE', 'DCE', 'DCEr'] facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD" ] draw_std_vec = [False] * 4 + [True] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 4, 2, 1, 2, 2] marker_vec = [None, 'o', 'x', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6'] learning_method_vec = ['GT', 'LHE', 'MHE', 'DHE', 'DHE'] Macro_Accuracy = False EC = True # Non-backtracking for learning constraints = True # True weight_vec = [None] * 3 + [10, 10] * 2 randomize_vec = [False] * 4 + [True] * 2 k = 3 err = 0 avoidNeighbors = False convergencePercentage_W = None stratified = True gradient = True doubly_stochastic = True num_restarts = None raw_std_vec = range(10) numberOfSplits = 1 select_lambda_vec = [False] * 20 lambda_vec = None f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] FILENAMEZ = "" legend_location = "" fig_label = "" H_heuristic = "" def choose(choice): # -- Default Graph parameters nonlocal n 
nonlocal d nonlocal rep_SameGraph nonlocal FILENAMEZ nonlocal initial_h0 nonlocal exponent nonlocal length nonlocal variant nonlocal alpha_vec nonlocal beta_vec nonlocal gamma_vec nonlocal s_vec nonlocal clip_on_vec nonlocal numMaxIt_vec # Plotting Parameters nonlocal xtick_lab nonlocal xtick_labels nonlocal ytick_lab nonlocal xmax nonlocal xmin nonlocal ymin nonlocal ymax nonlocal labels nonlocal facecolor_vec nonlocal draw_std_vec nonlocal linestyle_vec nonlocal linewidth_vec nonlocal marker_vec nonlocal markersize_vec nonlocal legend_location nonlocal option_vec nonlocal learning_method_vec nonlocal Macro_Accuracy nonlocal EC nonlocal constraints nonlocal weight_vec nonlocal randomize_vec nonlocal k nonlocal err nonlocal avoidNeighbors nonlocal convergencePercentage_W nonlocal stratified nonlocal gradient nonlocal doubly_stochastic nonlocal num_restarts nonlocal numberOfSplits nonlocal H_heuristic nonlocal select_lambda_vec nonlocal lambda_vec nonlocal f_vec if choice == 0: None elif choice == 304: ## with varying weights FILENAMEZ = 'prop37' Macro_Accuracy = True gradient = True fig_label = 'Prop37' legend_location = 'lower right' n = 62000 d = 34.8 select_lambda_vec = [False] * 5 f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] elif choice == 305: # DCEr Only experiment choose(605) choose(304) select_lambda_vec = [False] * 6 elif choice == 306: choose(304) select_lambda_vec = [False] * 3 + [True] * 3 lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec learning_method_vec.append('Holdout') labels.append('Holdout') elif choice == 307: # heuristic comparison choose(304) select_lambda_vec = [False] * 3 + [True] * 3 lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec learning_method_vec.append('Heuristic') labels.append('Heuristic') H_heuristic = np.array([[.476, .0476, .476], [.476, .0476, .476], [.476, .476, .0476]]) # -- MovieLens dataset elif choice == 401: FILENAMEZ = 'movielens' Macro_Accuracy = True gradient = True fig_label = 'MovieLens' 
legend_location = 'upper left' n = 26850 d = 25.0832029795 elif choice == 402: choose(401) select_lambda_vec = [False] * 3 + [ True ] * 3 # allow to choose lambda for different f in f_vec lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 403: choose(402) ymin = 0.3 ymax = 1.0 learning_method_vec.append('Holdout') labels.append('Holdout') elif choice == 404: choose(401) select_lambda_vec = [ True ] * 3 # allow to choose lambda for different f in f_vec lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec labels = ['GS', 'DCEr', 'Homophily'] facecolor_vec = ['black', "#C44E52", "#64B5CD"] draw_std_vec = [False, True, False] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [4, 2, 2, 2, 2] marker_vec = [None, '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8] weight_vec = [None, 10, None] option_vec = ['opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6'] randomize_vec = [False, True, False] learning_method_vec = ['GT', 'DHE'] #TODO elif choice == 405: # DCEr ONLY experiment choose(605) choose(401) learning_method_vec += ['Holdout'] labels += ['Holdout'] elif choice == 406: # comparison with a static heuristic matrix choose(402) learning_method_vec += ['Heuristic'] labels += ['Heuristic'] H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476], [.476, .476, .0476]]) elif choice == 407: choose(402) ymin = 0.3 ymax = 1.0 lambda_vec = [1] * 21 # same length as f_vec elif choice == 408: choose(402) ymin = 0.3 ymax = 1.0 lambda_vec = [10] * 21 # same length as f_vec # DO NOT RUN WITH CREATE_DATA=True, if you do please restore the data from # data/sigmod-movielens-fig.csv elif choice == 409: choose(402) facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#8172B2", "#C44E52", "#C44E52", "#CCB974", "#64B5CD" ] labels = [ 'GS', 'LCE', 'MCE', 'DCE1', 'DCE10', 'DCEr1', 'DCEr10', 'Holdout' ] draw_std_vec = [False] * 5 + [True] * 2 + [False] linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [2, 2, 2, 2, 2, 2, 2, 2] 
marker_vec = [None, 'o', 'x', 's', 'p', '^', 'v', '+'] markersize_vec = [0, 8, 8, 8, 8, 8, 8, 8] option_vec = [ 'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8' ] legend_location = 'upper left' ymin = 0.3 ymax = 1.0 lambda_vec = [10] * 21 # same length as f_vec # -- Yelp dataset elif choice == 501: FILENAMEZ = 'yelp' Macro_Accuracy = True weight_vec = [None] * 3 + [10, 10] gradient = True ymin = 0.1 ymax = 0.75 fig_label = 'Yelp' legend_location = 'upper left' n = 4301900 # for figure d = 6.56 # for figure # -- Flickr dataset elif choice == 601: FILENAMEZ = 'flickr' Macro_Accuracy = True fig_label = 'Flickr' legend_location = 'lower right' ymin = 0.3 ymax = 0.7 n = 2007369 d = 18.1 elif choice == 602: ## with varying weights choose(601) select_lambda_vec = [False] * 4 + [ True ] * 2 # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 603: ## with varying weights choose(602) select_lambda_vec = [False] * 3 + [ True ] * 2 # allow to choose lambda for different f in f_vec # lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [1] * 6 # same length as f_vec elif choice == 604: ## with weight = 1 choose(603) lambda_vec = [0.5] * 21 # same length as f_vec elif choice == 605: choose(601) facecolor_vec = [ 'black', "#55A868", "#4C72B0", "#8172B2", "#C44E52", "#CCB974", "#64B5CD", 'orange' ] draw_std_vec = [False] + [True] * 10 linestyle_vec = ['dashed'] + ['solid'] * 10 linewidth_vec = [3] * 10 marker_vec = [None, 'o', 'x', '^', 'v', '+', 'o', 'x'] markersize_vec = [0] + [8] * 10 randomize_vec = [True] * 8 option_vec = [ 'opt1', 'opt2', 'opt3', 'opt4', 'opt5', 'opt6', 'opt7', 'opt8' ] learning_method_vec = [ 'GT', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE', 'DHE' ] select_lambda_vec = [False] * 8 f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec weight_vec = [0, 0, 1, 2, 5, 10, 15] labels = 
['GT'] + [ i + ' {}'.format(weight_vec[ix]) for ix, i in enumerate(['DCEr'] * 6) ] elif choice == 606: # heuristic experiment choose(602) labels.append('Heuristic') learning_method_vec.append('Heuristic') H_heuristic = np.array([[.0476, .476, .476], [.476, .0476, .476], [.476, .476, .0476]]) # -- DBLP dataset elif choice == 701: FILENAMEZ = 'dblp' Macro_Accuracy = True ymin = 0.2 ymax = 0.5 fig_label = 'DBLP' legend_location = 'lower right' n = 2241258 # for figure d = 26.11 # for figure # -- ENRON dataset elif choice == 801: FILENAMEZ = 'enron' Macro_Accuracy = True ymin = 0.3 ymax = 0.75 fig_label = 'Enron' f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] legend_location = 'upper left' n = 46463 # for figures d = 23.4 # for figures elif choice == 802: ### WITH ADAPTIVE WEIGHTS choose(801) select_lambda_vec = [False] * 4 + [ True ] * 2 # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 803: ### WITH ADAPTIVE WEIGHTS choose(802) lambda_vec = [1] * 5 + [5] * 5 + [10] * 5 + [ 1 ] * 6 # same length as f_vec elif choice == 804: choose(803) elif choice == 805: choose(605) choose(801) #learning_method_vec += ['Holdout'] #labels += ['Holdout'] elif choice == 806: # Heuristic experiment choose(802) learning_method_vec += ['Heuristic'] labels += ['Heuristic'] H_heuristic = np.array([[0.76, 0.08, 0.08, 0.08], [0.08, 0.08, 0.76, 0.08], [0.08, 0.76, 0.08, 0.76], [0.08, 0.08, 0.76, 0.08]]) # MASC Dataset elif choice == 901: FILENAMEZ = 'masc' Macro_Accuracy = False fig_label = 'MASC' legend_location = 'lower right' n = 0 d = 0 ymin = 0 num_restarts = 100 select_lambda_vec = [False] * 4 + [ True ] # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec # MASC collapsed Dataset elif choice == 1001: FILENAMEZ = 'masc-collapsed' fig_label = 'MASC 
Collapsed' legend_location = 'lower right' n = 43724 d = 7.2 ymin = 0 num_restarts = 20 select_lambda_vec = [False] * 4 + [ True ] # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 1002: choose(1001) Macro_Accuracy = True # MASC Reduced dataset elif choice == 1101: FILENAMEZ = 'masc-reduced' fig_label = 'MASC Reduced' legend_location = 'lower right' n = 31000 d = 8.3 ymin = 0 select_lambda_vec = [False] * 4 + [ True ] # allow to choose lambda for different f in f_vec f_vec = [0.9 * pow(0.1, 1 / 5)**x for x in range(21)] lambda_vec = [1] * 11 + [10] * 10 # same length as f_vec elif choice == 1102: choose(1101) Macro_Accuracy = True else: raise Warning("Incorrect choice!") def _f_worker_(X0, W, f, f_index): RANDOMSEED = None # For repeatability random.seed(RANDOMSEED) # seeds some other python random generator np.random.seed( seed=RANDOMSEED ) # seeds the actually used numpy random generator; both are used and thus needed X1, ind = replace_fraction_of_rows(X0, 1 - f, avoidNeighbors=avoidNeighbors, W=W, stratified=stratified) X2 = introduce_errors(X1, ind, err) for option_index, (label, select_lambda, learning_method, alpha, beta, gamma, s, numMaxIt, weights, randomize) in \ enumerate(zip(labels, select_lambda_vec, learning_method_vec, alpha_vec, beta_vec, gamma_vec, s_vec, numMaxIt_vec, weight_vec, randomize_vec)): learn_time = -1 # -- Learning if learning_method == 'GT': H2c = H0c elif learning_method == 'Heuristic': # print('Heuristic') H2c = H_heuristic elif learning_method == 'Holdout': # print('Holdout') H2 = estimateH_baseline_serial( X2, ind, W, numMax=numMaxIt, # ignore_rows=ind, numberOfSplits=numberOfSplits, # method=learning_method, variant=1, # distance=length, EC=EC, alpha=alpha, beta=beta, gamma=gamma, doubly_stochastic=doubly_stochastic) H2c = to_centering_beliefs(H2) else: if "DCEr" in learning_method: learning_method = "DCEr" 
elif "DCE" in learning_method: learning_method = "DCE" # -- choose optimal lambda: allows to specify different lambda for different f # print("option: ", option_index) if select_lambda == True: weight = lambda_vec[f_index] # print("weight : ", weight) else: weight = weights # -- learn H learn_start = time.time() H2 = estimateH(X2, W, method=learning_method, variant=1, distance=length, EC=EC, weights=weight, randomrestarts=num_restarts, randomize=randomize, constraints=constraints, gradient=gradient, doubly_stochastic=doubly_stochastic) learn_time = time.time() - learn_start H2c = to_centering_beliefs(H2) # if learning_method not in ['GT', 'GS']: # print(FILENAMEZ, f, learning_method) # print(H2c) # -- Propagation prop_start = time.time() # X2c = to_centering_beliefs(X2, ignoreZeroRows=True) # try without eps_max = eps_convergence_linbp_parameterized(H2c, W, method='noecho', alpha=alpha, beta=beta, gamma=gamma, X=X2) eps = s * eps_max # print("Max eps: {}, eps: {}".format(eps_max, eps)) # eps = 1 try: F, actualIt, actualPercentageConverged = \ linBP_symmetric_parameterized(X2, W, H2c * eps, method='noecho', alpha=alpha, beta=beta, gamma=gamma, numMaxIt=numMaxIt, convergencePercentage=convergencePercentage_W, debug=2) prop_time = time.time() - prop_start if Macro_Accuracy: accuracy_X = matrix_difference_classwise(X0, F, ignore_rows=ind) precision = matrix_difference_classwise( X0, F, similarity='precision', ignore_rows=ind) recall = matrix_difference_classwise(X0, F, similarity='recall', ignore_rows=ind) else: accuracy_X = matrix_difference(X0, F, ignore_rows=ind) precision = matrix_difference(X0, F, similarity='precision', ignore_rows=ind) recall = matrix_difference(X0, F, similarity='recall', ignore_rows=ind) result = [str(datetime.datetime.now())] text = [ label, f, accuracy_X, precision, recall, learn_time, prop_time ] result.extend(text) # print("method: {}, f: {}, actualIt: {}, accuracy: {}, precision:{}, recall: {}, learning time: {}, propagation time: 
{}".format(label, f, actualIt, accuracy_X, precision, recall, learn_time, prop_time)) save_csv_record(join(data_directory, csv_filename), result) except ValueError as e: print("ERROR: {} with {}: d={}, h={}".format( e, learning_method, d, h)) raise e return 'success' def multi_run_wrapper(args): """Wrapper to unpack arguments passed to the pool worker. NOTE: This method could be removed by upgrading to Python>=3.3, which includes the multiprocessing.starmap_async() function, which allows multiple arguments to be passed to the map function. """ return _f_worker_(*args) for choice in experiments: choose(choice) filename = 'Fig_End-to-End_accuracy_realData_{}_{}'.format( choice, FILENAMEZ) csv_filename = '{}.csv'.format(filename) header = [ 'currenttime', 'method', 'f', 'accuracy', 'precision', 'recall', 'learntime', 'proptime' ] if CREATE_DATA: save_csv_record(join(data_directory, csv_filename), header, append=False) # print("choice: {}".format(choice)) # --- print data statistics if CALCULATE_DATA_STATISTICS: Xd, W = load_Xd_W_from_csv( join(realDataDir, FILENAMEZ) + '-classes.csv', join(realDataDir, FILENAMEZ) + '-neighbors.csv') X0 = from_dictionary_beliefs(Xd) n = len(Xd.keys()) d = (len(W.nonzero()[0]) * 2) / n k = len(X0[0]) print("FILENAMEZ:", FILENAMEZ) print("k:", k) print("n:", n) print("d:", d) # -- Graph statistics n_vec = calculate_nVec_from_Xd(Xd) print("n_vec:\n", n_vec) d_vec = calculate_average_outdegree_from_graph(W, Xd=Xd) print("d_vec:\n", d_vec) P = calculate_Ptot_from_graph(W, Xd) print("P:\n", P) for i in range(k): Phi = calculate_degree_correlation(W, X0, i, NB=True) print("Degree Correlation, Class {}:\n{}".format(i, Phi)) # -- Various compatibilities H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC, weights=1, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) print("H0 w/ constraints:\n", np.round(H0, 2)) #raw_input() # Why? 
H2 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC, weights=1, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) H4 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC, weights=2, randomize=False, gradient=gradient, doubly_stochastic=doubly_stochastic) H5 = estimateH(X0, W, method='DHE', variant=1, distance=1, EC=EC, weights=2, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) H6 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC, weights=10, randomize=False, gradient=gradient, doubly_stochastic=doubly_stochastic) H7 = estimateH(X0, W, method='DHE', variant=1, distance=2, EC=EC, weights=10, randomize=False, constraints=True, gradient=gradient, doubly_stochastic=doubly_stochastic) print() # print("H MCE w/o constraints:\n", np.round(H0, 3)) print("H MCE w/ constraints:\n", np.round(H2, 3)) # print("H DCE 2 w/o constraints:\n", np.round(H4, 3)) print("H DCE 2 w/ constraints:\n", np.round(H5, 3)) # print("H DCE 10 w/o constraints:\n", np.round(H6, 3)) print("H DCE 20 w/ constraints:\n", np.round(H7, 3)) print() H_row_vec = H_observed(W, X0, 3, NB=True, variant=1) print("H_est_1:\n", np.round(H_row_vec[0], 3)) print("H_est_2:\n", np.round(H_row_vec[1], 3)) print("H_est_3:\n", np.round(H_row_vec[2], 3)) # --- Create data if CREATE_DATA or ADD_DATA: Xd, W = load_Xd_W_from_csv( join(realDataDir, FILENAMEZ) + '-classes.csv', join(realDataDir, FILENAMEZ) + '-neighbors.csv') X0 = from_dictionary_beliefs(Xd) n = len(Xd.keys()) ## number of nodes in graph k = len(X0[0]) d = (len(W.nonzero()[0]) * 2) / n #print(n) #print(d) #print("contraint = {}".format(constraints)) #print('select lambda: {}'.format(len(select_lambda_vec))) #print('learning method: {}'.format(len(learning_method_vec))) #print('alpha: {}'.format(len(alpha_vec))) #print('beta: {}'.format(len(beta_vec))) #print('gamma: {}'.format(len(gamma_vec))) #print('s: {}'.format(len(s_vec))) #print('maxit: 
{}'.format(len(numMaxIt_vec))) #print('weight: {}'.format(len(weight_vec))) #print('randomize: {}'.format(len(randomize_vec))) # --- Calculating True Compatibility matrix H0 = estimateH(X0, W, method='MHE', variant=1, distance=1, EC=EC, weights=1, randomize=False, constraints=constraints, gradient=gradient, doubly_stochastic=doubly_stochastic) # print(H0) H0c = to_centering_beliefs(H0) num_results = len(f_vec) * len(learning_method_vec) * rep_SameGraph # Starts a thread pool with 10 fewer than the max number your computer # has available assuming one thread per cpu - this is meant for # supercomputer. #pool = multiprocessing.Pool(int(multiprocessing.cpu_count()-10)) # Use this for a reasonably powerful home computer #pool = multiprocessing.Pool(int(multiprocessing.cpu_count()/2)) # Use this for anything else pool = multiprocessing.Pool(2) f_processes = f_vec * rep_SameGraph workers = [] results = [(X0, W, f, ix) for ix, f in enumerate(f_vec)] * rep_SameGraph # print('Expected results: {}'.format(num_results)) try: # hacky fix due to a bug in 2.7 multiprocessing # Distribute work for evaluating accuracy over the thread pool using # a hacky method due to python 2.7 multiprocessing not being fully # featured pool.map_async(multi_run_wrapper, results).get(num_results * 2) except multiprocessing.TimeoutError as e: continue finally: pool.close() pool.join() # -- Read data for all options and plot df1 = pd.read_csv(join(data_directory, csv_filename)) acc_filename = '{}_accuracy_plot.pdf'.format(filename) pr_filename = '{}_PR_plot.pdf'.format(filename) if TIMING: print('=== {} Timing Results ==='.format(FILENAMEZ)) print('Prop Time:\navg: {}\nstddev: {}'.format( np.average(df1['proptime'].values), np.std(df1['proptime'].values))) for learning_method in labels: rs = df1.loc[df1["method"] == learning_method] avg = np.average(rs['learntime']) std = np.std(rs['learntime']) print('{} Learn Time:\navg: {}\nstd: {}'.format( learning_method, avg, std)) sslhv.plot(df1, 
join(figure_directory, acc_filename), n=n, d=d, k=k, labels=labels, dataset=FILENAMEZ, line_styles=linestyle_vec, xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax, marker_sizes=markersize_vec, draw_stds=draw_std_vec, markers=marker_vec, line_colors=facecolor_vec, line_widths=linewidth_vec, legend_location=legend_location, show=SHOW_PDF, save=CREATE_PDF, show_plot=SHOW_PLOT)