def splitDiGraphToTrainTest2(di_graph,
                             train_ratio=0.5,
                             is_undirected=True,
                             file_name=None):

    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        di_graph, train_ratio=train_ratio, is_undirected=is_undirected)
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph), key=len)
        tdl_nodes = train_digraph.nodes()
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)

    if file_name:
        with open(file_name + "_train", 'w') as f:
            for (st, ed, w) in train_digraph.edges(data='weight', default=1):
                f.write('%d %d %f\n' % (st, ed, w))

        with open(file_name + "_test", 'w') as f:
            for (st, ed, w) in test_digraph.edges(data='weight', default=1):
                f.write('%d %d %f\n' % (st, ed, w))

    return (train_digraph, test_digraph)
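# --- Usage sketch (not part of the original example) ---
# A minimal sketch of how splitDiGraphToTrainTest2 could be called, assuming
# the GEM-style evaluation_util module it relies on is importable; the toy
# graph and variable names below are illustrative only.
import networkx as nx

toy_graph = nx.gnp_random_graph(50, 0.2, seed=42, directed=True)
train_digraph, test_digraph = splitDiGraphToTrainTest2(
    toy_graph, train_ratio=0.5, is_undirected=True, file_name=None)
print(train_digraph.number_of_nodes(), train_digraph.number_of_edges())
print(test_digraph.number_of_edges())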
Example #2
def evaluateStaticLinkPrediction(digraph,
                                 graph_embedding,
                                 train_ratio=0.8,
                                 n_sample_nodes=None,
                                 sample_ratio_e=None,
                                 no_python=False,
                                 is_undirected=True):
    node_num = digraph.number_of_nodes()
    # separate train and test graphs
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph, train_ratio=train_ratio, is_undirected=is_undirected)
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph), key=len)
        tdl_nodes = train_digraph.nodes()
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        nx.relabel_nodes(train_digraph, nodeListMap, copy=False)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        nx.relabel_nodes(test_digraph, nodeListMap, copy=False)

    # learning graph embedding
    X, _ = graph_embedding.learn_embedding(graph=train_digraph,
                                           no_python=no_python)
    node_l = None
    if n_sample_nodes:
        test_digraph, node_l = graph_util.sample_graph(test_digraph,
                                                       n_sample_nodes)
        X = X[node_l]

    # evaluation
    if sample_ratio_e:
        eval_edge_pairs = evaluation_util.getRandomEdgePairs(
            node_num, sample_ratio_e, is_undirected)
    else:
        eval_edge_pairs = None
    estimated_adj = graph_embedding.get_reconstructed_adj(X, node_l)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj, is_undirected=is_undirected, edge_pairs=eval_edge_pairs)

    if node_l is None:
        node_l = list(range(train_digraph.number_of_nodes()))

    filtered_edge_list = [
        e for e in predicted_edge_list
        if not train_digraph.has_edge(node_l[e[0]], node_l[e[1]])
    ]

    MAP = metrics.computeMAP(filtered_edge_list, test_digraph)
    prec_curv, _ = metrics.computePrecisionCurve(filtered_edge_list,
                                                 test_digraph)
    return (MAP, prec_curv)
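# --- Usage sketch (not part of the original example) ---
# Hedged illustration of calling evaluateStaticLinkPrediction with a
# GEM-style model; LaplacianEigenmaps and graph_util appear in later
# examples, and the edge-list path here is hypothetical.
G = graph_util.loadGraphFromEdgeListTxt('data/example.edgelist', directed=False)
G = G.to_directed()
embedding = LaplacianEigenmaps(d=8)
MAP, prec_curv = evaluateStaticLinkPrediction(
    G, embedding, train_ratio=0.8, is_undirected=True)
print('MAP: %f' % MAP)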
Example #3
def evaluate_supervised(di_graph, graph_embedding, is_undirected=True):

    train_digraph, test_digraph = train_test_split.splitDiGraphToTrainTest2(
        di_graph, train_ratio=0.6, is_undirected=True)
    train_digraph1, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        test_digraph, train_ratio=0.5, is_undirected=is_undirected)

    X, _ = graph_embedding.learn_embedding(graph=train_digraph,
                                           no_python=False)

    trp, trn = create_edge_dataset(train_digraph, train_digraph1)
    trd, trl = create_vector_dataset(trp, trn, hadamard2, X)
    mean = np.mean(trd, axis=0)
    std = np.std(trd, axis=0)
    trd = (trd - mean) / std

    clasifier = train_classifier(trd, trl)

    for (st, ed) in train_digraph1.edges():
        train_digraph.add_edge(st, ed)

    sample_edges = sample_edge_new(train_digraph, test_digraph, 0.5)

    X, _ = graph_embedding.learn_embedding(graph=train_digraph,
                                           no_python=False)

    filtered_edge_list = getscore5(train_digraph, sample_edges, clasifier,
                                   hadamard2, X, mean, std)
    AP, ROC = scores.computeAP_ROC(filtered_edge_list, test_digraph)

    test_digraph, node_l = graph_util.sample_graph(test_digraph, 1024)
    X = X[node_l]
    estimated_adj = getscore2(train_digraph, node_l, clasifier, hadamard2, X,
                              mean, std)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj, is_undirected=True)
    filtered_edge_list = [
        e for e in predicted_edge_list
        if not train_digraph.has_edge(node_l[e[0]], node_l[e[1]])
    ]
    MAP = scores.computeMAP(filtered_edge_list, test_digraph)

    print(MAP)

    return AP, ROC, MAP
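# --- Helper sketch (not part of the original example) ---
# hadamard2, create_edge_dataset and create_vector_dataset are not defined in
# these snippets; the function below is only an assumption of what a
# Hadamard-style edge feature could look like, given the (n_nodes x d)
# embedding matrix X learned above.
import numpy as np

def hadamard_edge_feature(st, ed, X):
    # element-wise product of the two endpoint embedding vectors
    return np.multiply(X[st], X[ed])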
Example #4
for grp in range(len(list_graphs)):
	for x in range(num_samples):

		# load the graph as a networkx graph

		G = graph_util.loadGraphFromEdgeListTxt(list_graphs[grp], directed=list_directed[grp])
		G = G.to_directed()
		
		if not os.path.exists('SAVER_SUP/'+fig_name[grp]+str(x+1)):
			os.makedirs('SAVER_SUP/'+fig_name[grp]+str(x+1))
		
		# split the graph into 60-20-20 ratio, 60% for calculating the edge features, 20% for training the classifier, 20% for evaluating the model.

		train_digraph, test_digraph = train_test_split.splitDiGraphToTrainTest2(G, train_ratio = 0.6, is_undirected=True)
		train_digraph1, test_digraph = evaluation_util.splitDiGraphToTrainTest(test_digraph, train_ratio=0.5, is_undirected=True)

		# embeddings without relearning

		print ("saving for LE")
		for dim in dimensions:
			embedding=LaplacianEigenmaps(d=dim)
			X, _ = embedding.learn_embedding(graph=train_digraph, no_python=False)
			file_name='SAVER_SUP/'+fig_name[grp]+str(x+1)+'/LE1_'+str(dim)
			parameter_file=open(file_name, 'wb')
			pickle.dump(X,parameter_file)
			parameter_file.close()

		print ("saving for DEEPWALK")
		for dim in dimensions:
			embedding=node2vec(d=dim, max_iter=1, walk_len=80, num_walks=10, con_size=10, ret_p=1, inout_p=1)
Example #5
def expLP(digraph, graph_embedding,
          n_sample_nodes_l, rounds,
          res_pre, m_summ, train_ratio=0.8,
          no_python=True, K=32768,
          is_undirected=True, sampling_scheme="u_rand"):
    print('\tLink Prediction')
    MAP = {}
    prec_curv = {}
    n_sample_nodes_l = [min(int(n), digraph.number_of_nodes()) for n in n_sample_nodes_l]

    # Randomly hide (1-train_ratio)*100% of links
    node_num = digraph.number_of_nodes()
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph,
        train_ratio=train_ratio,
        is_undirected=is_undirected
    )

    # Ensure the resulting train subgraph is connected
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph),
            key=len
        )
        tdl_nodes = train_digraph.nodes()
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        train_digraph = nx.relabel_nodes(train_digraph, nodeListMap, copy=True)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        # unfreeze the frozen subgraph view so it can be relabelled
        test_digraph = nx.Graph(test_digraph)
        test_digraph = nx.relabel_nodes(test_digraph, nodeListMap, copy=True)
    else:
        # identity mapping so the dump below is always defined
        nodeListMap = dict(zip(train_digraph.nodes(), train_digraph.nodes()))

    pickle.dump(nodeListMap, open('gem/nodeListMap/lp_lcc.pickle', 'wb'))

    t1 = time()
    # learn graph embedding on train subgraph
    print(
        'Link Prediction train graph n_nodes: %d, n_edges: %d' % (
            train_digraph.number_of_nodes(),
            train_digraph.number_of_edges())
    )
    X, _ = graph_embedding.learn_embedding(
        graph=train_digraph,
        no_python=no_python
    )
    if X is not None and X.shape[0] != train_digraph.number_of_nodes():
        pdb.set_trace()
    print('Time taken to learn the embedding: %f sec' % (time() - t1))

    # sample test graph for evaluation and store results
    node_l = None
    if not n_sample_nodes_l:
        n_sample_nodes_l = [node_num]
    summ_file = open('%s_%s_%s.lpsumm' % (res_pre, m_summ, sampling_scheme), 'w')
    summ_file.write('Method\t%s\n' % metrics.getMetricsHeader())
    for n_s in n_sample_nodes_l:
        n_s = int(n_s)
        n_s = min(n_s, train_digraph.number_of_nodes())
        MAP[n_s] = [None] * rounds
        prec_curv[n_s] = [None] * rounds
        for round_id in range(rounds):
            if sampling_scheme == "u_rand":
                train_digraph_s, node_l = graph_util.sample_graph(
                    train_digraph,
                    n_s
                )
            else:
                train_digraph_s, node_l = graph_util.sample_graph_rw(
                    train_digraph,
                    n_s
                )
            if X is not None:
                X_sub = X[node_l]
            else:
                X_sub = None
            test_digraph_s = test_digraph.subgraph(node_l)
            nodeListMap = dict(zip(node_l, range(len(node_l))))
            pickle.dump(nodeListMap, open('gem/nodeListMap/lp_lcc_samp.pickle', 'wb'))
            test_digraph_s = nx.relabel_nodes(test_digraph_s, nodeListMap, copy=True)
            MAP[n_s][round_id], prec_curv[n_s][round_id] = \
                evaluateStaticLinkPrediction(train_digraph_s, test_digraph_s,
                                             graph_embedding, X_sub,
                                             node_l=node_l,
                                             is_undirected=is_undirected)
            prec_curv[n_s][round_id] = prec_curv[n_s][round_id][:K]
        summ_file.write('\tn_s:%d, %f/%f\t%s\n' % (
            n_s,
            np.mean(MAP[n_s]),
            np.std(MAP[n_s]),
            metrics.getPrecisionReport(
                prec_curv[n_s][0],
                len(prec_curv[n_s][0])
            )
        ))
    summ_file.close()
    #if len(prec_curv[-1][0]) < 100:
        #pdb.set_trace()
    pickle.dump([MAP, prec_curv, n_sample_nodes_l],
                open('%s_%s_%s_%s.lp' % (res_pre, m_summ, sampling_scheme, str(train_ratio)),
                     'wb'))
    print('Link prediction evaluation complete. Time: %f sec' % (time() - t1))
    # prec_curv2 = [p[4096] for p in prec_curv[prec_curv.keys()[0]]]
    return MAP[list(MAP.keys())[0]]  # prec_curv2
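# --- Usage sketch (not part of the original example) ---
# Hedged illustration of calling expLP; the result prefix, summary tag,
# embedding model and sample sizes are illustrative, and expLP expects the
# gem/nodeListMap/ directory to exist for its pickle dumps.
G = graph_util.loadGraphFromEdgeListTxt('data/example.edgelist', directed=True)
embedding = LaplacianEigenmaps(d=16)
map_per_round = expLP(G, embedding,
                      n_sample_nodes_l=[1024], rounds=3,
                      res_pre='results/example', m_summ='lap_16',
                      train_ratio=0.8, is_undirected=True)
print(map_per_round)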
Example #6
def evaluateStaticLinkPrediction(digraph, graph_embedding,
                                 train_ratio=0.8,
                                 n_sample_nodes=None,
                                 sample_ratio_e=None,
                                 no_python=False,
                                 is_undirected=True):
    node_num = digraph.number_of_nodes()
    print('eslp graph')
    print(digraph.edges()[:3])
    # separate train and test graphs
    train_digraph, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        digraph,
        train_ratio=train_ratio,
        is_undirected=is_undirected
    )
    print('eslp training graph')
    print(train_digraph.edges()[:3])
    if not nx.is_connected(train_digraph.to_undirected()):
        train_digraph = max(
            nx.weakly_connected_component_subgraphs(train_digraph),
            key=len
        )
        tdl_nodes = train_digraph.nodes()
        nodeListMap = dict(zip(tdl_nodes, range(len(tdl_nodes))))
        reversedNodeListMap = dict(zip(range(len(tdl_nodes)),tdl_nodes))
        print(nodeListMap)
        nx.relabel_nodes(train_digraph, nodeListMap, copy=False)
        test_digraph = test_digraph.subgraph(tdl_nodes)
        nx.relabel_nodes(test_digraph, nodeListMap, copy=False)
    else:
        nodes = list(train_digraph.nodes())
        reversedNodeListMap = dict(zip(nodes, nodes))

    print('eslp training graph after largest cc')
    print(train_digraph.edges()[:3])
    # learning graph embedding
    X, _ = graph_embedding.learn_embedding(
        graph=train_digraph,
        no_python=no_python
    )
    node_l = None
    if n_sample_nodes:
        test_digraph, node_l = graph_util.sample_graph(
            test_digraph,
            n_sample_nodes
        )
        X = X[node_l]

    # print('len graph edges')
    # print(len(graph.nodes()))
    # print('embedding vectors number')
    # print(len(self._X))
    node2vec_dict = {}
    print('GUESS embedding node2vec train result')
    for i in range(len(X)):
        node2vec_dict[reversedNodeListMap[train_digraph.nodes()[i]]] = X[i]
        # print(str(train_digraph.nodes()[i])+" "+str(reversedNodeListMap[train_digraph.nodes()[i]]) + " "+ str(X[i]))
    # evaluation
    if sample_ratio_e:
        eval_edge_pairs = evaluation_util.getRandomEdgePairs(
            node_num,
            sample_ratio_e,
            is_undirected
        )
    else:
        eval_edge_pairs = None
    estimated_adj = graph_embedding.get_reconstructed_adj(X, node_l)
    predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(
        estimated_adj,
        is_undirected=is_undirected,
        edge_pairs=eval_edge_pairs
    )
    if node_l is None:
        node_l = list(range(train_digraph.number_of_nodes()))
    filtered_edge_list = [e for e in predicted_edge_list if not train_digraph.has_edge(node_l[e[0]], node_l[e[1]])]
    MAP = metrics.computeMAP(filtered_edge_list, test_digraph)
    prec_curv, _ = metrics.computePrecisionCurve(
        filtered_edge_list,
        test_digraph
    )
    return (MAP, prec_curv, node2vec_dict)
Example #7
def evaluate_supervised_new(train_digraph,
                            embeddings,
                            hads,
                            is_undirected=True):

    train_digraph, test_digraph = train_test_split.splitDiGraphToTrainTest2(
        train_digraph, train_ratio=0.6, is_undirected=True)
    for (st, ed) in train_digraph.edges():
        if (test_digraph.has_edge(st, ed)):
            test_digraph.remove_edge(st, ed)

    train_digraph1, test_digraph = evaluation_util.splitDiGraphToTrainTest(
        test_digraph, train_ratio=0.5, is_undirected=is_undirected)

    l_emb = []
    combine = []
    for emb in embeddings:
        X, _ = emb.learn_embedding(graph=train_digraph, no_python=False)
        l_emb.append(X)

    for had in hads:
        if (had == 1):
            combine.append(hadamard1)
        elif (had == 0):
            combine.append(hadamard2)

    # combine.append(dotp1)

    print("embeddings learned")

    trp, trn = create_edge_dataset(train_digraph, train_digraph1)
    trd, trl = create_mix_dataset(trp, trn, train_digraph, l_emb, combine)
    mean = np.mean(trd, axis=0)
    std = np.std(trd, axis=0)
    trd = (trd - mean) / std
    clasifier = train_classifier(trd, trl)
    # print (clasifier.coef_)
    # print (clasifier.intercept_)

    train_digraph_temp = train_digraph.copy()
    for (st, ed) in train_digraph1.edges():
        train_digraph_temp.add_edge(st, ed)

    sample_edges = sample_edge_new(train_digraph_temp,
                                   test_digraph,
                                   -1,
                                   num_edges=500000)

    # co=0
    # for (st,ed) in sample_edges:
    #     for (st1,ed1) in trn:
    #         if(st==st1 and ed==ed1):
    #             if(test_digraph.has_edge(st,ed)):
    #                 print ("1")

    #     for (st1,ed1) in trp:
    #         if(st==st1 and ed==ed1):
    #             if(test_digraph.has_edge(st,ed)):
    #                 print ("2")
    #             else:
    #                 print ("3")

    # l_emb1 = []
    # for emb in embeddings:
    #     X, _ = emb.learn_embedding(graph=train_digraph_temp, no_python=False)
    #     l_emb1.append(X)
    #     break
    # l_emb1.append(l_emb[1])

    print("embeddings learned")

    # filtered_edge_list = getscore9(train_digraph, sample_edges, clasifier, l_emb1, combine, mean, std)
    # AP, ROC = scores.computeAP_ROC(filtered_edge_list, test_digraph)
    # print (AP,ROC)

    filtered_edge_list = getscore9(train_digraph_temp, sample_edges, clasifier,
                                   l_emb, combine, mean, std)
    AP, ROC = scores.computeAP_ROC(filtered_edge_list, test_digraph)
    print(AP, ROC)

    trd, trl = create_score_dataset(trp, trn, allh, train_digraph)
    mean = np.mean(trd, axis=0)
    std = np.std(trd, axis=0)
    trd = (trd - mean) / std
    clasifier = train_classifier(trd, trl)
    filtered_edge_list = getscore7(train_digraph_temp, sample_edges, clasifier,
                                   allh, mean, std)
    AP2, ROC2 = scores.computeAP_ROC(filtered_edge_list, test_digraph)
    print(AP2, ROC2)

    # G11 = train_digraph.to_undirected()
    # f1=[]
    # f2=[]
    # for (st,ed,w) in filtered_edge_list:
    #     f1.append(w)
    #     f2.append(cn(G11,st,ed))

    # f1=np.array(f1)
    # f2=np.array(f2)
    # ind1 = np.argsort(-1*f1)
    # ind2 = np.argsort(-1*f2)
    # print (ind1[:1000])
    # print (ind2[:1000])
    # print (f1[ind1[:1000]])
    # print (f2[ind1[:1000]])

    # filtered_edge_list = getscore3(train_digraph_temp, sample_edges, aa)
    # AP, ROC = scores.computeAP_ROC(filtered_edge_list, test_digraph)
    # print (AP,ROC)

    return AP, ROC

    # labels=[]
    # score=[]
    # dist=[]
    # G=train_digraph.to_undirected()
    # print (len(filtered_edge_list))
    # for (st,ed,w) in filtered_edge_list:
    #     # if not(nx.shortest_path_length(G,source=st,target=ed)==2):
    #     #     continue
    #     if(test_digraph.has_edge(st,ed)):
    #         labels.append(1)
    #     else:
    #         labels.append(0)
    #     score.append(w)
    # ap = average_precision_score(labels, score)
    # print (ap)

    # ind = np.argsort(-1*np.asarray(score))
    # labels = np.array(labels)
    # print (labels[ind[:1000]])

    # labels=[]
    # score=[]
    # dist=[]
    # G=train_digraph.to_undirected()
    # for (st,ed,w) in filtered_edge_list:
    # if (nx.shortest_path_length(G,source=st,target=ed)==2):
    #     continue
    #     if(test_digraph.has_edge(st,ed)):
    #         labels.append(1)
    #     else:
    #         labels.append(0)
    #     score.append(w)
    # ap = average_precision_score(labels, score)
    # print (ap)

    # ind = np.argsort(-1*np.asarray(score))
    # labels = np.array(labels)
    # print (labels[ind[:1000]])

    # test_digraph, node_l = graph_util.sample_graph(test_digraph, 1024)
    # estimated_adj = getscore8(train_digraph, node_l, clasifier, l_emb1, combine)
    # predicted_edge_list = evaluation_util.getEdgeListFromAdjMtx(estimated_adj,is_undirected=True)
    # filtered_edge_list = [e for e in predicted_edge_list if not train_digraph.has_edge(node_l[e[0]], node_l[e[1]])]
    # MAP = scores.computeMAP(filtered_edge_list, test_digraph)

    # print (MAP)
    MAP = 0

    return AP, ROC, MAP