def splitDiGraphToTrainTest2(di_graph, train_ratio=0.5, is_undirected=True, file_name=None):
    """Split a graph into train/test graphs and optionally dump them to disk.

    The split itself is delegated to
    ``evaluation_util.splitDiGraphToTrainTest``.  If the undirected view of
    the resulting train graph is not connected, the train graph is reduced
    to its largest weakly connected component, its nodes are relabelled to
    ``0 .. n-1`` and the test graph is restricted to the same nodes and
    relabelled with the same mapping.

    Args:
        di_graph: graph to split (passed straight to the split helper).
        train_ratio: forwarded to the split helper.
        is_undirected: forwarded to the split helper.
        file_name: optional path prefix.  When truthy, the train and test
            edges are written to ``file_name + "_train"`` and
            ``file_name + "_test"`` as ``"<src> <dst> <weight>"`` lines
            (weight defaults to 1 when an edge has none).

    Returns:
        Tuple ``(train_digraph, test_digraph)``.
    """
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        di_graph, train_ratio=train_ratio, is_undirected=is_undirected)

    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph), key=len)
        # Materialise the node list before relabelling so the mapping and the
        # test-graph restriction both use the pre-relabel labels.
        tdl_nodes = list(train_digraph.nodes())
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)

    if file_name:
        # The original referenced undefined ``train_graph``/``test_graph`` and
        # supplied only two values for a three-field format string.
        for suffix, graph in (("_train", train_digraph), ("_test", test_digraph)):
            with open(file_name + suffix, 'w') as f:
                for (st, ed, w) in graph.edges(data='weight', default=1):
                    f.write('%d %d %f\n' % (st, ed, w))

    return (train_digraph, test_digraph)
def evaluateStaticLinkPrediction(digraph, graph_embedding, train_ratio=0.8,
                                 n_sample_nodes=None, sample_ratio_e=None,
                                 no_python=False, is_undirected=True):
    """Evaluate link prediction of an embedding method on a single graph.

    The graph is split into train/test graphs, the embedding is learned on
    the train graph, an adjacency estimate is reconstructed from it, and
    predicted edges that are not already train edges are scored against the
    test graph.

    Args:
        digraph: graph to evaluate on.
        graph_embedding: object providing ``learn_embedding(graph=...,
            no_python=...)`` and ``get_reconstructed_adj(X, node_l)``.
        train_ratio: forwarded to ``evaluation_util.splitDiGraphToTrainTest``.
        n_sample_nodes: when truthy, the test graph is sampled with
            ``graph_util.sample_graph`` and only the sampled rows of the
            embedding are evaluated.
        sample_ratio_e: when truthy, evaluation is restricted to edge pairs
            drawn by ``evaluation_util.getRandomEdgePairs``.
        no_python: forwarded to ``learn_embedding``.
        is_undirected: forwarded to the split and edge-list helpers.

    Returns:
        Tuple ``(MAP, prec_curv)`` from ``metrics.computeMAP`` and the first
        element returned by ``metrics.computePrecisionCurve``.
    """
    node_num = digraph.number_of_nodes()

    # Separate train and test graph.
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph, train_ratio=train_ratio, is_undirected=is_undirected)
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph), key=len)
        tdl_nodes = list(train_digraph.nodes())
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        # copy=True: relabelling in place onto 0..n-1 can fail when the old
        # and new label sets overlap, and a subgraph view cannot be modified
        # in place.
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)

    # Learn the graph embedding on the train graph only.
    X, _ = graph_embedding.learn_embedding(graph=train_digraph,
                                           no_python=no_python)
    node_l = None
    if n_sample_nodes:
        test_digraph, node_l = graph_util.sample_graph(test_digraph,
                                                       n_sample_nodes)
        X = X[node_l]

    # Evaluation.
    if sample_ratio_e:
        eval_edge_pairs = evaluation_util.getRandomEdgePairs(
            node_num, sample_ratio_e, is_undirected)
    else:
        eval_edge_pairs = None
    estimated_adj = graph_embedding.get_reconstructed_adj(X, node_l)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj, is_undirected=is_undirected, edge_pairs=eval_edge_pairs)

    if node_l is None:
        node_l = list(range(train_digraph.number_of_nodes()))
    # node_l is a list mapping row index -> node id; the original called it
    # like a function (``node_l(e[0])``), which raises TypeError.
    filtered_edge_list = [
        e for e in predicted_edge_list
        if not train_digraph.has_edge(node_l[e[0]], node_l[e[1]])
    ]

    MAP = metrics.computeMAP(filtered_edge_list, test_digraph)
    prec_curv, _ = metrics.computePrecisionCurve(filtered_edge_list,
                                                 test_digraph)
    return (MAP, prec_curv)
def evaluate_supervised(di_graph, graph_embedding, is_undirected=True):
    """Train an edge classifier on embedding features and score link prediction.

    The graph is split with ``train_ratio=0.6``; the remainder is split again
    with ``train_ratio=0.5`` into a classifier-training part and a test part.
    Edge features are built with ``hadamard2`` from the learned embedding and
    standardised with the training mean/std before the classifier is fit.
    The classifier-training edges are then merged into the train graph, the
    embedding is re-learned, and AP/ROC and MAP are computed on the test part.

    Args:
        di_graph: graph to evaluate on.
        graph_embedding: object providing ``learn_embedding(graph=...,
            no_python=...)``.
        is_undirected: forwarded to the second split only; the first split
            and the adjacency-to-edge-list conversion always pass ``True``.

    Returns:
        Tuple ``(AP, ROC, MAP)``.
    """
    train_digraph, held_out = train_test_split.splitDiGraphToTrainTest2(
        di_graph, train_ratio=0.6, is_undirected=True)
    train_digraph1, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        held_out, train_ratio=0.5, is_undirected=is_undirected)

    # Features for the classifier come from an embedding of the first split.
    X, _ = graph_embedding.learn_embedding(graph=train_digraph,
                                           no_python=False)
    pos_edges, neg_edges = create_edge_dataset(train_digraph, train_digraph1)
    features, labels = create_vector_dataset(pos_edges, neg_edges,
                                             hadamard2, X)
    mean = np.mean(features, axis=0)
    std = np.std(features, axis=0)
    features = (features - mean) / std
    classifier = train_classifier(features, labels)

    # Fold the classifier-training edges back into the train graph.
    for (src, dst) in train_digraph1.edges():
        train_digraph.add_edge(src, dst)

    sample_edges = sample_edge_new(train_digraph, test_digraph, 0.5)
    X, _ = graph_embedding.learn_embedding(graph=train_digraph,
                                           no_python=False)
    scored_edges = getscore5(train_digraph, sample_edges, classifier,
                             hadamard2, X, mean, std)
    AP, ROC = scores.computeAP_ROC(scored_edges, test_digraph)

    # MAP is computed on a 1024-node sample of the test graph.
    test_digraph, node_l = graph_util.sample_graph(test_digraph, 1024)
    X = X[node_l]
    estimated_adj = getscore2(train_digraph, node_l, classifier,
                              hadamard2, X, mean, std)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj, is_undirected=True)

    unseen_edges = []
    for e in predicted_edge_list:
        if train_digraph.has_edge(node_l[e[0]], node_l[e[1]]):
            continue
        unseen_edges.append(e)

    MAP = scores.computeMAP(unseen_edges, test_digraph)
    print(MAP)
    return AP, ROC, MAP
# For every input graph and every sample round: split the graph, learn
# embeddings on the training part and pickle them under SAVER_SUP/.
# ``range`` replaces ``xrange`` so the loop runs on Python 2 and Python 3.
for grp in range(len(list_graphs)):
    for x in range(num_samples):
        # load the graph as a networkx graph
        G = graph_util.loadGraphFromEdgeListTxt(list_graphs[grp], directed=list_directed[grp])
        G = G.to_directed()
        if not os.path.exists('SAVER_SUP/' + fig_name[grp] + str(x + 1)):
            os.makedirs('SAVER_SUP/' + fig_name[grp] + str(x + 1))

        # split the graph into 60-20-20 ratio, 60% for calculating the edge
        # features, 20% for training the classifier, 20% for evaluating the
        # model.
        train_digraph, test_digraph = train_test_split.splitDiGraphToTrainTest2(G, train_ratio=0.6, is_undirected=True)
        train_digraph1, test_digraph = evaluation_util.splitDiGraphToTrainTest(test_digraph, train_ratio=0.5, is_undirected=True)

        # embeddings without relearning
        print ("saving for LE")
        for dim in dimensions:
            embedding = LaplacianEigenmaps(d=dim)
            X, _ = embedding.learn_embedding(graph=train_digraph, no_python=False)
            file_name = 'SAVER_SUP/' + fig_name[grp] + str(x + 1) + '/LE1_' + str(dim)
            # ``with`` guarantees the file is closed even if pickling fails.
            with open(file_name, 'wb') as parameter_file:
                pickle.dump(X, parameter_file)

        print ("saving for DEEPWALK")
        for dim in dimensions:
            # NOTE(review): the visible fragment ends after constructing the
            # embedding object; the learn/save steps are presumably missing
            # or live outside this excerpt -- confirm.
            embedding = node2vec(d=dim, max_iter=1, walk_len=80, num_walks=10, con_size=10, ret_p=1, inout_p=1)
def expLP(digraph, graph_embedding, n_sample_nodes_l, rounds, res_pre, m_summ,
          train_ratio=0.8, no_python=True, K=32768, is_undirected=True,
          sampling_scheme="u_rand"):
    """Run the link-prediction experiment and persist its results.

    The graph is split into train/test graphs; if the train graph is not
    connected it is reduced to its largest weakly connected component and
    both graphs are relabelled to ``0 .. n-1``.  The embedding is learned
    once on the train graph.  For every sample size and round a node sample
    is drawn, the matching embedding rows and test subgraph are selected and
    ``evaluateStaticLinkPrediction`` is called on them.

    Side effects (all paths are taken verbatim from the code):
        * pickles the relabel maps to ``gem/nodeListMap/lp_lcc.pickle`` and
          ``gem/nodeListMap/lp_lcc_samp.pickle``;
        * writes ``<res_pre>_<m_summ>_<sampling_scheme>.lpsumm``;
        * pickles ``[MAP, prec_curv, n_sample_nodes_l]`` to
          ``<res_pre>_<m_summ>_<sampling_scheme>_<train_ratio>.lp``.

    Args:
        digraph: graph to evaluate on.
        graph_embedding: object providing ``learn_embedding``.
        n_sample_nodes_l: iterable of sample sizes; when empty or ``None``
            the full node count is used.
        rounds: number of sampling rounds per sample size.
        res_pre: prefix of the result file names.
        m_summ: method tag used in the result file names.
        train_ratio: forwarded to the split helper.
        no_python: forwarded to ``learn_embedding``.
        K: precision curves are truncated to their first ``K`` entries.
        is_undirected: forwarded to the split and evaluation helpers.
        sampling_scheme: ``"u_rand"`` uses ``graph_util.sample_graph``; any
            other value uses ``graph_util.sample_graph_rw``.

    Returns:
        The list of per-round MAP values for the first sample size.
    """
    print('\tLink Prediction')
    MAP = {}
    prec_curv = {}
    # ``or []`` lets a None argument reach the "use all nodes" fallback below
    # instead of failing while iterating.
    n_sample_nodes_l = [min(int(n), digraph.number_of_nodes())
                        for n in (n_sample_nodes_l or [])]

    # Randomly hide (1-train_ratio)*100% of links
    node_num = digraph.number_of_nodes()
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph,
        train_ratio=train_ratio,
        is_undirected=is_undirected
    )

    # Ensure the resulting train subgraph is connected
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph),
            key=len
        )
        tdl_nodes = list(train_digraph.nodes())
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        # Build a fresh graph from the subgraph so it can be relabelled.
        test_digraph = nx.Graph(test_digraph)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)
        with open('gem/nodeListMap/lp_lcc.pickle', 'wb') as map_file:
            pickle.dump(nodeListMap, map_file)

    t1 = time()
    # learn graph embedding on train subgraph
    print(
        'Link Prediction train graph n_nodes: %d, n_edges: %d' % (
            train_digraph.number_of_nodes(),
            train_digraph.number_of_edges())
    )
    X, _ = graph_embedding.learn_embedding(
        graph=train_digraph,
        no_python=no_python
    )
    if X is not None and X.shape[0] != train_digraph.number_of_nodes():
        # Debugging hook kept from the original: drops into pdb when the
        # embedding row count does not match the train graph.
        pdb.set_trace()
    print('Time taken to learn the embedding: %f sec' % (time() - t1))

    # sample test graph for evaluation and store results
    node_l = None
    if not n_sample_nodes_l:
        n_sample_nodes_l = [node_num]

    summ_path = '%s_%s_%s.lpsumm' % (res_pre, m_summ, sampling_scheme)
    with open(summ_path, 'w') as summ_file:
        summ_file.write('Method\t%s\n' % metrics.getMetricsHeader())
        for n_s in n_sample_nodes_l:
            n_s = int(n_s)
            n_s = min(n_s, train_digraph.number_of_nodes())
            MAP[n_s] = [None] * rounds
            prec_curv[n_s] = [None] * rounds
            for round_id in range(rounds):
                if sampling_scheme == "u_rand":
                    train_digraph_s, node_l = graph_util.sample_graph(
                        train_digraph,
                        n_s
                    )
                else:
                    train_digraph_s, node_l = graph_util.sample_graph_rw(
                        train_digraph,
                        n_s
                    )
                if X is not None:
                    X_sub = X[node_l]
                else:
                    X_sub = None
                test_digraph_s = test_digraph.subgraph(node_l)
                nodeListMap = dict(zip(node_l, range(len(node_l))))
                with open('gem/nodeListMap/lp_lcc_samp.pickle', 'wb') as map_file:
                    pickle.dump(nodeListMap, map_file)
                test_digraph_s = nx.relabel_nodes(test_digraph_s, nodeListMap,
                                                  copy=True)
                # NOTE(review): this call passes (train, test, embedding, X,
                # node_l=...); confirm the evaluateStaticLinkPrediction in
                # scope accepts that signature.
                MAP[n_s][round_id], prec_curv[n_s][round_id] = \
                    evaluateStaticLinkPrediction(train_digraph_s,
                                                 test_digraph_s,
                                                 graph_embedding, X_sub,
                                                 node_l=node_l,
                                                 is_undirected=is_undirected)
                prec_curv[n_s][round_id] = prec_curv[n_s][round_id][:K]
            summ_file.write('\tn_s:%d, %f/%f\t%s\n' % (
                n_s,
                np.mean(MAP[n_s]),
                np.std(MAP[n_s]),
                metrics.getPrecisionReport(
                    prec_curv[n_s][0],
                    len(prec_curv[n_s][0])
                )
            ))

    result_path = '%s_%s_%s_%s.lp' % (res_pre, m_summ, sampling_scheme,
                                      str(train_ratio))
    with open(result_path, 'wb') as result_file:
        pickle.dump([MAP, prec_curv, n_sample_nodes_l], result_file)
    print('Link prediction evaluation complete. Time: %f sec' % (time() - t1))
    return MAP[list(MAP.keys())[0]]
def evaluateStaticLinkPrediction(digraph, graph_embedding, train_ratio=0.8,
                                 n_sample_nodes=None, sample_ratio_e=None,
                                 no_python=False, is_undirected=True):
    """Evaluate link prediction and also return the learned node vectors.

    The graph is split into train/test graphs, the embedding is learned on
    the train graph, predicted edges that are not train edges are scored
    against the test graph, and a dict from original node label to embedding
    row is built.  Several debug ``print`` calls report intermediate state.

    Args:
        digraph: graph to evaluate on.
        graph_embedding: object providing ``learn_embedding(graph=...,
            no_python=...)`` and ``get_reconstructed_adj(X, node_l)``.
        train_ratio: forwarded to ``evaluation_util.splitDiGraphToTrainTest``.
        n_sample_nodes: when truthy, the test graph is sampled with
            ``graph_util.sample_graph`` and only the sampled embedding rows
            are kept.
        sample_ratio_e: when truthy, evaluation is restricted to edge pairs
            drawn by ``evaluation_util.getRandomEdgePairs``.
        no_python: forwarded to ``learn_embedding``.
        is_undirected: forwarded to the split and edge-list helpers.

    Returns:
        Tuple ``(MAP, prec_curv, node2vec_dict)``.
    """
    node_num = digraph.number_of_nodes()
    print('eslp graph')
    # list(...) keeps the slice working when edges() returns a view.
    print(list(digraph.edges())[:3])

    # Separate train and test graph.
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph,
        train_ratio=train_ratio,
        is_undirected=is_undirected
    )
    print('eslp training graph')
    print(list(train_digraph.edges())[:3])

    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph),
            key=len
        )
        tdl_nodes = list(train_digraph.nodes())
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        reversedNodeListMap = dict(zip(range(len(tdl_nodes)), tdl_nodes))
        print(nodeListMap)
        # copy=True: relabelling in place onto 0..n-1 can fail when the old
        # and new label sets overlap, and a subgraph view cannot be modified
        # in place.
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)
    else:
        # The original read ``tdl_nodes`` here before it was ever assigned;
        # no relabelling happens on this path, so the reverse map is the
        # identity over the train nodes.
        tdl_nodes = list(train_digraph.nodes())
        reversedNodeListMap = dict(zip(tdl_nodes, tdl_nodes))
    print('elsp training graph after largest cc')
    print(list(train_digraph.edges())[:3])

    # Learn the graph embedding on the train graph only.
    X, _ = graph_embedding.learn_embedding(
        graph=train_digraph,
        no_python=no_python
    )
    node_l = None
    if n_sample_nodes:
        test_digraph, node_l = graph_util.sample_graph(
            test_digraph,
            n_sample_nodes
        )
        X = X[node_l]

    # Map each embedding row back to the original (pre-relabel) node label.
    # NOTE(review): when n_sample_nodes is set, X holds only the sampled
    # rows, so row i may no longer belong to the i-th train node -- confirm.
    node2vec_dict = {}
    print('GUESS embedding node2vc train result')
    train_nodes = list(train_digraph.nodes())
    for i, vector in enumerate(X):
        node2vec_dict[reversedNodeListMap[train_nodes[i]]] = vector

    # Evaluation.
    if sample_ratio_e:
        eval_edge_pairs = evaluation_util.getRandomEdgePairs(
            node_num,
            sample_ratio_e,
            is_undirected
        )
    else:
        eval_edge_pairs = None
    estimated_adj = graph_embedding.get_reconstructed_adj(X, node_l)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj,
        is_undirected=is_undirected,
        edge_pairs=eval_edge_pairs
    )
    if node_l is None:
        node_l = list(range(train_digraph.number_of_nodes()))
    filtered_edge_list = [
        e for e in predicted_edge_list
        if not train_digraph.has_edge(node_l[e[0]], node_l[e[1]])
    ]

    MAP = metrics.computeMAP(filtered_edge_list, test_digraph)
    prec_curv, _ = metrics.computePrecisionCurve(
        filtered_edge_list,
        test_digraph
    )
    return (MAP, prec_curv, node2vec_dict)
def evaluate_supervised_new(train_digraph, embeddings, hads, is_undirected=True):
    """Score link prediction with a classifier over several embeddings.

    The input graph is split with ``train_ratio=0.6``; edges present in the
    train part are removed from the remainder, which is then split again
    with ``train_ratio=0.5`` into a classifier-training part and a test part.
    One embedding is learned per entry of ``embeddings`` and combined via the
    operators selected by ``hads``.  A classifier is trained on the
    standardised features and AP/ROC is computed on sampled edges.  A second
    classifier over ``allh`` score features is trained and its AP/ROC is
    printed only.

    Args:
        train_digraph: graph to evaluate on (rebound to the train split).
        embeddings: iterable of objects providing ``learn_embedding``.
        hads: iterable of flags; ``1`` selects ``hadamard1`` and ``0``
            selects ``hadamard2``; any other value adds no operator.
        is_undirected: forwarded to the second split only.

    Returns:
        Tuple ``(AP, ROC)`` of the embedding-based classifier.  Note this is
        a 2-tuple; the original's trailing ``return AP, ROC, MAP`` was
        unreachable and has been removed.
    """
    train_digraph, test_digraph = train_test_split.splitDiGraphToTrainTest2(
        train_digraph, train_ratio=0.6, is_undirected=True)
    # Make sure no train edge is left in the held-out graph.
    for (st, ed) in train_digraph.edges():
        if test_digraph.has_edge(st, ed):
            test_digraph.remove_edge(st, ed)
    train_digraph1, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        test_digraph, train_ratio=0.5, is_undirected=is_undirected)

    l_emb = []
    for emb in embeddings:
        X, _ = emb.learn_embedding(graph=train_digraph, no_python=False)
        l_emb.append(X)

    combine = []
    for had in hads:
        if had == 1:
            combine.append(hadamard1)
        elif had == 0:
            combine.append(hadamard2)
    print("embeddings learned")

    # Classifier over the combined embedding features.
    trp, trn = create_edge_dataset(train_digraph, train_digraph1)
    trd, trl = create_mix_dataset(trp, trn, train_digraph, l_emb, combine)
    mean = np.mean(trd, axis=0)
    std = np.std(trd, axis=0)
    trd = (trd - mean) / std
    clasifier = train_classifier(trd, trl)

    # Candidate edges are sampled against train + classifier-training edges.
    train_digraph_temp = train_digraph.copy()
    for (st, ed) in train_digraph1.edges():
        train_digraph_temp.add_edge(st, ed)
    sample_edges = sample_edge_new(train_digraph_temp, test_digraph, -1,
                                   num_edges=500000)
    print("embeddings learned")

    filtered_edge_list = getscore9(train_digraph_temp, sample_edges,
                                   clasifier, l_emb, combine, mean, std)
    AP, ROC = scores.computeAP_ROC(filtered_edge_list, test_digraph)
    print(AP, ROC)

    # Second classifier over the ``allh`` score features; reported only.
    trd, trl = create_score_dataset(trp, trn, allh, train_digraph)
    mean = np.mean(trd, axis=0)
    std = np.std(trd, axis=0)
    trd = (trd - mean) / std
    clasifier = train_classifier(trd, trl)
    filtered_edge_list = getscore7(train_digraph_temp, sample_edges,
                                   clasifier, allh, mean, std)
    AP2, ROC2 = scores.computeAP_ROC(filtered_edge_list, test_digraph)
    print(AP2, ROC2)

    return AP, ROC