def compare_node_embedding(vary_q=True):
    """Plot how one node's embedding changes while p or q is perturbed.

    A random node of a fixed benign call graph is chosen, then for every
    offset in the module-level ``ran`` a node2vec mapping is computed with
    either ``q`` (``vary_q=True``) or ``p`` (``vary_q=False``) shifted by
    that offset.  The embeddings of the chosen node are stacked into a
    heatmap (one row per offset) and written to a PNG in the current
    working directory.

    Args:
        vary_q: When true the offsets are added to ``q``; otherwise to ``p``.
    """
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    # The graph file lives two directory levels above the working directory.
    base_dir = Path(os.getcwd()).parent.parent
    dot_path = str(base_dir) + os.sep + 'data/graphs/benign/3abfa08b4e1de7195c8e9fe52796a37f9a275cb47f6d0fc904eed172061cd56a.apk.top.dot'
    G = Graph(dot_file=dot_path, config=default_config)

    # Pick one node uniformly at random; it stays fixed for the whole sweep.
    nodes = list(G.nodes)
    target = nodes[np.random.randint(len(nodes))]

    embeddings = []
    for offset in ran:
        p = par.p if vary_q else par.p + offset
        q = par.q + offset if vary_q else par.q
        shifted = parSet(dim=par.dim, walk=par.walk, num_walk=par.num_walk,
                         p=p, q=q)
        mapping = node2vec_mapping(FILE, G, shifted)
        embeddings.append(mapping[target])

    sns.set()
    if vary_q:
        y_label = np.array(par.q + np.array(ran))
    else:
        y_label = [round(offset + par.p, 2) for offset in ran]

    pl = sns.heatmap(np.array(embeddings), yticklabels=y_label)
    pl.set(xlabel=str(par))
    plt.title('Different node embedding on the same node')
    plt.ylabel('q' if vary_q else 'p')
    plt.xlabel('dimension')

    fig = pl.get_figure()
    prefix = 'q: ' if vary_q else 'p: '
    fig.savefig(prefix + str(par) + '.png')
    fig.clf()
def main():
    """Measure embedding distances from node '1' on a fabricated graph.

    Builds the synthetic graph, embeds it, then writes the Euclidean
    distance between node '1' and each candidate node to
    ``distance_test/result.txt`` (one ``candidate: distance`` line each).
    """
    # Arbitrary parameter set.  An earlier experiment used
    # dim=250, walk=15, num_walk=30, p=5.0, q=0.05 instead.
    par = parSet(dim=250, walk=15, num_walk=100, p=0.5, q=0.8)

    fabricate()
    to_vector(par)
    embedding = read_p()

    candidates = [2, 5, 10, 15, 20]
    reference = np.array(embedding['1'])
    distances = [
        np.linalg.norm(reference - np.array(embedding[str(cand)]))
        for cand in candidates
    ]

    with open('distance_test/result.txt', 'w') as filehandle:
        for cand, dist in zip(candidates, distances):
            filehandle.write('%s: %s\n' % (cand, dist))
def grid_search():
    """Sweep the embedding dimension and plot accuracy mean / std.

    For every dimension in ``dimSet`` a parameter set is built and passed
    to ``test`` together with the ground truth; the two values ``test``
    returns are collected as mean and standard deviation of accuracy and
    plotted against the dimension.  Both lists are printed at the end.
    """
    t = true_val(src='metadata', merge=True)
    dimSet = [5, 10, 25, 50, 70, 100, 128, 200, 250]

    mean = []
    std = []
    for dim in dimSet:
        # Use the swept dimension; the previous version hard-coded dim=25,
        # so every iteration evaluated the very same configuration.
        par = parSet(dim=dim, walk=15, num_walk=30, p=0.2, q=0.5)
        mean_t, std_t = test(par, t, src="metadata")
        mean.append(mean_t)
        std.append(std_t)

    plt.figure(1)
    plt.plot(dimSet, mean)
    plt.xlabel('dimension')
    plt.ylabel('accuracy')
    plt.title('dimension vs accuracy')
    plt.show()

    plt.figure(2)
    plt.plot(dimSet, std)
    plt.xlabel('dimension')
    plt.ylabel('standard deviation of accuracy')
    plt.title('dimension vs standard deviation of accuracy')
    plt.show()

    print(mean)
    print(std)
def compare_graph(vary_q=True):
    """Run ``compare`` with the default parameter set.

    Args:
        vary_q: Forwarded to ``compare``; selects whether ``q`` (true) or
            ``p`` (false) is the parameter being varied.
    """
    default_params = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    compare(default_params, vary_q=vary_q)
def dataset_test_binary(src='metadata', fn=tru_bin, cv=10, name='Binary',
                        ran=(50, 100, 150, 200, 250, 314)):
    """Compare the new and previous featurizations across dataset sizes.

    For each dataset size in ``ran`` a sample is drawn 100 times, a random
    forest is cross-validated on both feature sets, and the average of the
    per-draw mean scores is plotted against the dataset size.

    Args:
        src: Data source passed to ``main``, ``prev_method`` and ``fn``.
        fn: Callable returning the ground-truth mapping; called as
            ``fn(src=src, merge=False)``.
        cv: Number of cross-validation folds.
        name: Prefix used in the plot title.
        ran: Dataset sizes to evaluate.  The default is an immutable tuple
            so it cannot be shared and mutated across calls.
    """
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    # Presumably these two calls (re)generate the pickles read below --
    # TODO confirm against their definitions.
    main(par, src=src)
    prev_method(src=src)

    t = fn(src=src, merge=False)
    lib_prev = reading_lib('final_result_prev.pickle')
    lib_new = reading_lib('final_result.pickle')

    # Drop every sample that has no ground-truth entry.  The keys come from
    # lib_prev, so lib_new is cleaned with pop() to tolerate a missing key.
    for key in set(lib_prev.keys()) - set(t.keys()):
        del lib_prev[key]
        lib_new.pop(key, None)

    # Materialise once so the same sizes drive both the loop and the plot,
    # even if the caller passed a one-shot iterable.
    sizes = list(ran)
    repeats = 100

    new = []
    prev = []
    for size in sizes:
        new_scores = []
        prev_scores = []
        for _ in range(repeats):
            selected_vecs, selected_prev, selected_tru = draw(
                size=size, lib_new=lib_new, lib_prev=lib_prev, trueval=t)
            selected_vecs, selected_prev, selected_tru_w = process_data(
                selected_vecs, selected_tru, selected_prev)

            clf = RandomForestClassifier(n_estimators=100, max_depth=50,
                                         random_state=0)
            scores_new = cross_val_score(clf, selected_vecs, selected_tru_w,
                                         cv=cv)
            clf_2 = RandomForestClassifier(n_estimators=100, max_depth=50,
                                           random_state=0)
            scores_prev = cross_val_score(clf_2, selected_prev,
                                          selected_tru_w, cv=cv)

            new_scores.append(scores_new.mean())
            prev_scores.append(scores_prev.mean())

        new.append(sum(new_scores) / len(new_scores))
        prev.append(sum(prev_scores) / len(prev_scores))

    plt.plot(sizes, new, '-g', label='new method')
    plt.plot(sizes, prev, '-b', label='previous method')
    plt.legend()
    plt.xlabel("size of dataset")
    plt.ylabel(str(cv) + "-fold cross validation accuracy")
    plt.title(name + " Classification")
    plt.show()
def compare(src='data/graphs'):
    """Evaluate the new and previous methods and record their statistics.

    Runs both pipelines on ``src``, evaluates the resulting pickles against
    the ground truth and writes the four statistics to
    ``compare_result.txt``.

    Args:
        src: Data source passed to ``true_val``, ``main`` and
            ``prev_method``.
    """
    # An arbitrary good parameter set.
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    truth = true_val(src=src)
    main(par, src=src)
    prev_method(src=src)

    new_mean, new_std = evaluate('final_result.pickle', truth)
    prev_mean, prev_std = evaluate('final_result_prev.pickle', truth)

    template = "new_mean: {}, new_std: {}, prev_mean: {}, prev_std: {}"
    summary = template.format(new_mean, new_std, prev_mean, prev_std)
    with open('compare_result.txt', 'w') as handle:
        handle.write(summary)
def main():
    """Measure embedding distances from node '0' on the adjacent graph.

    Builds the fabricated adjacent graph, embeds it, then writes the
    Euclidean distance between node '0' and each candidate node to
    ``distance_test/result_2.txt`` (one ``candidate: distance`` line each).
    """
    # Arbitrary parameter set.
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    fabricate_adjacent()
    to_vector(par)
    embedding = read_p()

    candidates = [1, 2, 3, 4]
    origin = np.array(embedding['0'])
    distances = [
        np.linalg.norm(origin - np.array(embedding[str(cand)]))
        for cand in candidates
    ]

    with open('distance_test/result_2.txt', 'w') as filehandle:
        for cand, dist in zip(candidates, distances):
            filehandle.write('%s: %s\n' % (cand, dist))
def adj_distribution():
    """Plot the distribution of the node '0' - node '1' embedding distance.

    Repeats the fabricate / embed / read cycle 10000 times, collecting the
    Euclidean distance between the embeddings of nodes '0' and '1' each
    time, prints how many samples were collected and shows a 20-bin
    histogram of them.
    """
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)

    samples = []
    for _ in range(10000):
        fabricate_adjacent("distance_test/adj_dist.edgelist")
        to_vector(par,
                  edgelist="distance_test/adj_dist.edgelist",
                  output="distance_test/adj_dist.emb")
        embedding = read_p("distance_test/adj_dist.emb")
        samples.append(
            np.linalg.norm(np.array(embedding['0']) - np.array(embedding['1'])))

    print(len(samples))

    plt.hist(samples, 20, facecolor='blue', alpha=0.5)
    plt.xlabel('distance')
    plt.ylabel('count')
    plt.title("adjacent distance distribution")
    plt.show()
def compare(params, vary_q=True):
    """Plot how a graph's node-distance vector reacts to varying p or q.

    For every offset in the module-level ``ran`` the library is regenerated
    with either ``q`` (``vary_q=True``) or ``p`` (``vary_q=False``) shifted
    by that offset, and the distance vector of the graph ``FILE`` is
    collected.  The vectors are drawn as a heatmap (one row per offset) and
    saved as a PNG in the current working directory.

    Args:
        params: Base parameter set; its ``dim``, ``walk``, ``num_walk``,
            ``p`` and ``q`` attributes are read.
        vary_q: When true the offsets are added to ``q``; otherwise to ``p``.
    """
    res = []
    for ran_1 in ran:
        if vary_q:
            p, q = params.p, params.q + ran_1
        else:
            p, q = params.p + ran_1, params.q
        par = parSet(dim=params.dim, walk=params.walk,
                     num_walk=params.num_walk, q=q, p=p)

        # Original author's note: the number of node distances is 2485 in
        # this dataset.
        d, node_lib, _edge_lib = lib_gen(par)
        g = d[FILE]
        vec3 = g.distance(node_lib)
        # Entries equal to 2.0 are discarded -- presumably a sentinel or
        # maximum distance; TODO confirm against Graph.distance.
        vec = [x for x in vec3 if x != 2.0]
        res.append(vec)

    # Original author's note: 71 nodes.
    sns.set()
    # Label each row with the value of the parameter actually being swept.
    # Previously the labels were always offset by params.q, which was wrong
    # when p was the varied parameter.
    base = params.q if vary_q else params.p
    ylabel = np.array(ran) + base
    pl = sns.heatmap(np.array(res), yticklabels=ylabel, xticklabels=False)

    fig = pl.get_figure()
    plt.xlabel('dimension')
    plt.title('Different featurization on the same graph')
    # NOTE: `par` is the parameter set of the last loop iteration, so the
    # file name reflects the final offset rather than the base `params`.
    if vary_q:
        plt.ylabel('q')
        fig.savefig('q: ' + str(par) + '.png')
    else:
        plt.ylabel('p')
        fig.savefig('p: ' + str(par) + '.png')
    fig.clf()
def adjacent_test():
    """Plot distances from node '0' to nodes 1-4 over 50 repeated trials.

    Each trial fabricates the adjacent graph, embeds it and records the
    Euclidean distance from node '0' to every candidate node.  The result
    is saved as a heatmap (candidates on the y axis, trials on the x axis)
    to ``distance_test/adjacent_graph.png``.
    """
    # Arbitrary parameter set.
    par = parSet(dim=25, walk=15, num_walk=30, p=0.2, q=0.5)
    candidates = [1, 2, 3, 4]

    distances = np.zeros((50, 4))
    for trial in range(50):
        # Assumes fabricate_adjacent() writes the edge list read below by
        # default -- TODO confirm.
        fabricate_adjacent()
        to_vector(par,
                  edgelist="distance_test/adjacent.edgelist",
                  output="distance_test/adjacent.emb")
        embedding = read_p("distance_test/adjacent.emb")

        origin = np.array(embedding['0'])
        for col, cand in enumerate(candidates):
            distances[trial, col] = np.linalg.norm(
                origin - np.array(embedding[str(cand)]))

    sns.set()
    # Transpose so that each candidate becomes one heatmap row.
    pl = sns.heatmap(distances.T, yticklabels=candidates)
    plt.title("Distance between adjacent nodes under same setting")
    plt.ylabel('Distance from node 0')
    plt.xlabel('trials')

    fig = pl.get_figure()
    fig.savefig("distance_test/adjacent_graph.png")
def distance_test():
    """Plot distances from node '0' to distant nodes over 50 trials.

    Each trial fabricates the distance graph, embeds it and records the
    Euclidean distance from node '0' to every candidate node.  The result
    is saved as a heatmap (candidates on the y axis, trials on the x axis)
    to ``distance_test/distance_graph.png``.
    """
    par = parSet(dim=250, walk=15, num_walk=100, p=0.5, q=0.8)
    candidates = [1, 5, 10, 15, 20]
    trials = 50

    # Width follows the candidate list so the two cannot drift apart.
    distances = np.zeros((trials, len(candidates)))
    for i in range(trials):
        # Assumes fabricate_distance() writes the edge list read below by
        # default -- TODO confirm.
        fabricate_distance()
        to_vector(par,
                  edgelist="distance_test/distance.edgelist",
                  output="distance_test/distance.emb")
        d = read_p(read_path="distance_test/distance.emb")

        origin = np.array(d['0'])
        for index, cand in enumerate(candidates):
            distances[i][index] = np.linalg.norm(
                origin - np.array(d[str(cand)]))

    sns.set()
    # Swap axes so that each candidate becomes one heatmap row.
    pl = sns.heatmap(np.moveaxis(distances, 0, -1), yticklabels=candidates)
    # Title typo ("conneceted") corrected.
    plt.title("Distance between loosely connected nodes under same setting")
    plt.ylabel('Distance from node 0')
    plt.xlabel('trials')

    fig = pl.get_figure()
    fig.savefig("distance_test/distance_graph.png")