def test_non_edges(self): # All possible edges exist graph = nx.complete_graph(5) nedges = list(nx.non_edges(graph)) assert_equal(len(nedges), 0) graph = nx.path_graph(4) expected = [(0, 2), (0, 3), (1, 3)] nedges = list(nx.non_edges(graph)) for (u, v) in expected: assert_true((u, v) in nedges or (v, u) in nedges) graph = nx.star_graph(4) expected = [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)] nedges = list(nx.non_edges(graph)) for (u, v) in expected: assert_true((u, v) in nedges or (v, u) in nedges) # Directed graphs graph = nx.DiGraph() graph.add_edges_from([(0, 2), (2, 0), (2, 1)]) expected = [(0, 1), (1, 0), (1, 2)] nedges = list(nx.non_edges(graph)) for e in expected: assert_true(e in nedges)
def __init__(self,infile, readFileLite = False): # create a new graph G = nx.DiGraph() self.allParts = [] self.partsManager = partsManager() self.infile = infile if(self.infile): self.readFile(self.infile,readFileLite); # edge to feature list # add all nodes and edges G.add_nodes_from(range(self.n+1)) G.add_edges_from(nx.non_edges(G)) # remove incoming edges to 0 for node in range(1,self.n+1): G.remove_edge(node, 0) # remove pruned edges G = self.removePrunedEdges(G); # save graph self.graph = G
def jaccard_predictions(G): """ Create a ranked list of possible new links based on the Jaccard similarity, defined as the intersection of nodes divided by the union of nodes parameters G: Directed or undirected nx graph returns list of linkbunches with the score as an attribute """ potential_edges = [] G_undirected = nx.Graph(G) for non_edge in nx.non_edges(G_undirected): u = set(G.neighbors(non_edge[0])) v = set(G.neighbors(non_edge[1])) uv_un = len(u.union(v)) uv_int = len(u.intersection(v)) if uv_int == 0 or uv_un == 0: continue else: s = (1.0*uv_int)/uv_un potential_edges.append(non_edge + ({'score': s},)) return potential_edges
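# Added usage sketch (not from the original source): rank the candidate links
# returned by jaccard_predictions above on a small random digraph and print the
# three highest-scoring pairs; assumes networkx is imported as nx as in the
# surrounding snippets.
import networkx as nx

toy_graph = nx.gnp_random_graph(20, 0.15, seed=1, directed=True)
candidates = jaccard_predictions(toy_graph)
candidates.sort(key=lambda edge: edge[2]['score'], reverse=True)  # highest Jaccard score first
print(candidates[:3])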
def fuzz_network(G_orig, threshold, b, edge_frac=1.0, nonedge_mult=5.0): G = G_orig.copy() n = len(G.nodes()) H = Graph() H.add_nodes_from(range(n)) pairs = n * (n - 1) / 2 actual_edges = len(G.edges()) edges = int(edge_frac * actual_edges) nonedges = int(edges * nonedge_mult) a = b / nonedge_mult # though these distributions are normalized to one, by selecting the appropriate number of edges # and nonedges, we make these 'distributions' correct edge_probs = np.random.beta(a + 1, b, edges) nonedge_probs = np.random.beta(a, b + 1, nonedges) # picking the right number of edges from the appropriate list edge_list = G.edges() nonedge_list = list(non_edges(G)) shuffle(edge_list) shuffle(nonedge_list) for i in range(len(edge_probs)): G[edge_list[i][0]][edge_list[i][1]]["weight"] = edge_probs[i] if edge_probs[i] > threshold: H.add_edge(edge_list[i][0], edge_list[i][1]) for i in range(len(nonedge_probs)): G.add_edge(nonedge_list[i][0], nonedge_list[i][1], weight=nonedge_probs[i]) if nonedge_probs[i] > threshold: H.add_edge(nonedge_list[i][0], nonedge_list[i][1]) return G, H
def add_remove_random_edges(G, pct_add, pct_remove): """Randomly add edges to and remove edges from G Parameters ---------- G : a networkx.Graph the network pct_add : float A percentage (between 0 and 1) pct_remove : float A percentage (between 0 and 1) """ assert_is_percentage(pct_add) assert_is_percentage(pct_remove) edges = G.edges() m = len(edges) to_add = int(m * pct_add) to_remove = int(m * pct_remove) log.debug("Will add %d (%f) edges to and remove %d (%f) edges of %d", to_add, pct_add, to_remove, pct_remove, m) new_edges = set(nx.non_edges(G)) G.remove_edges_from(random.sample(edges, to_remove)) G.add_edges_from(random.sample(new_edges, to_add))
def show_graph(g, vertex_color='typeof', size=15, vertex_label=None): """show_graph.""" degrees = [len(list(g.neighbors(u))) for u in g.nodes()] print(('num nodes=%d' % len(g))) print(('num edges=%d' % len(g.edges()))) print(('num non edges=%d' % len(list(nx.non_edges(g))))) print(('max degree=%d' % max(degrees))) print(('median degree=%d' % np.percentile(degrees, 50))) draw_graph(g, size=size, vertex_color=vertex_color, vertex_label=vertex_label, vertex_size=200, edge_label=None) # display degree distribution size = int((max(degrees) - min(degrees)) / 1.5) plt.figure(figsize=(size, 3)) plt.title('Degree distribution') _bins = np.arange(min(degrees), max(degrees) + 2) - .5 n, bins, patches = plt.hist(degrees, _bins, alpha=0.3, facecolor='navy', histtype='bar', rwidth=0.8, edgecolor='k') labels = np.array([str(int(i)) for i in n]) for xi, yi, label in zip(bins, n, labels): plt.text(xi + 0.5, yi, label, ha='center', va='bottom') plt.xticks(bins + 0.5) plt.xlim((min(degrees) - 1, max(degrees) + 1)) plt.ylim((0, max(n) * 1.1)) plt.xlabel('Node degree') plt.ylabel('Counts') plt.grid(linestyle=":") plt.show()
def adamic_adar_index(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): return sum(1 / math.log(G.degree(w)) for w in nx.common_neighbors(G, u, v)) return ((u, v, predict(u, v)) for u, v in ebunch)
def common_neighbor(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) return len(cnbors) return ((u, v, predict(u, v)) for u, v in ebunch)
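# Added usage sketch (not part of the source): the predictors above
# (adamic_adar_index, common_neighbor) are generators over all non-edges, so
# they can be materialised and ranked directly; assumes networkx as nx.
import networkx as nx

demo = nx.karate_club_graph()
top_by_cn = sorted(common_neighbor(demo), key=lambda triple: triple[2], reverse=True)[:5]
print(top_by_cn)  # five non-adjacent pairs with the most common neighbours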
def get_unknown_edges(_G): _unknown = list() _edges = nx.non_edges(_G) for e in _edges: _unknown.append(e) return _unknown
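# Added note: the helper above simply materialises the non_edges generator; a
# minimal equivalent sketch, assuming networkx is imported as nx.
import networkx as nx

def get_unknown_edges_oneliner(graph):
    """Return every node pair of `graph` that is not an edge."""
    return list(nx.non_edges(graph))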
def make_train_test_set(graph, radius, test_proportion=.3, ratio_neg_to_pos=10): """make_train_test_set.""" pos = [(u, v) for u, v in graph.edges()] neg = [(u, v) for u, v in nx.non_edges(graph)] random.shuffle(pos) random.shuffle(neg) pos_dim = len(pos) neg_dim = len(neg) max_n_neg = min(pos_dim * ratio_neg_to_pos, neg_dim) neg = neg[:max_n_neg] neg_dim = len(neg) tr_pos = pos[:-int(pos_dim * test_proportion)] te_pos = pos[-int(pos_dim * test_proportion):] tr_neg = neg[:-int(neg_dim * test_proportion)] te_neg = neg[-int(neg_dim * test_proportion):] # remove edges tr_graph = graph.copy() tr_graph.remove_edges_from(te_pos) tr_pos_graphs = list(_make_subgraph_set(tr_graph, radius, tr_pos)) tr_neg_graphs = list(_make_subgraph_set(tr_graph, radius, tr_neg)) te_pos_graphs = list(_make_subgraph_set(tr_graph, radius, te_pos)) te_neg_graphs = list(_make_subgraph_set(tr_graph, radius, te_neg)) tr_graphs = tr_pos_graphs + tr_neg_graphs te_graphs = te_pos_graphs + te_neg_graphs tr_targets = [1] * len(tr_pos_graphs) + [0] * len(tr_neg_graphs) te_targets = [1] * len(te_pos_graphs) + [0] * len(te_neg_graphs) tr_graphs, tr_targets = paired_shuffle(tr_graphs, tr_targets) te_graphs, te_targets = paired_shuffle(te_graphs, te_targets) return (tr_graphs, np.array(tr_targets)), (te_graphs, np.array(te_targets))
def common_neighbors(G, fn, t = 0.5): G = G.to_undirected() if os.path.isfile(fn) : H = G.copy() found = nx.read_edgelist(fn, nodetype=int, data=False) H.add_edges_from(found.edges_iter()) jacc_iter = nx.jaccard_coefficient(G, nx.non_edges(H)) print "Appending to %s" % fn outfile = open(fn,'a',1) i = found.number_of_nodes() else: jacc_iter = nx.jaccard_coefficient(G) outfile = open(fn,'w',1) i = 0 outfile.write("#vertex u; vertex v; their jaccard coef\n") cur = -1 print "Starting jacc loop %s with threshold %s" % (time.strftime("%H:%M:%S"), t) for pair in jacc_iter: if pair[2] >= t: outfile.write("%s %s %f\n" % (pair[0],pair[1],pair[2])) if pair[0] != cur: cur = pair[0] i += 1 print "%s: %s" % (i, cur) outfile.close() print "Done writing %s" % (fn)
def jaccard_coefficient(G, ebunch=None): r"""Compute the Jaccard coefficient of all node pairs in ebunch. Jaccard coefficient of nodes `u` and `v` is defined as .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u) \cup \Gamma(v)|} where :math:`\Gamma(u)` denotes the set of neighbors of `u`. Parameters ---------- G : graph A NetworkX undirected graph. ebunch : iterable of node pairs, optional (default = None) Jaccard coefficient will be computed for each pair of nodes given in the iterable. The pairs must be given as 2-tuples (u, v) where u and v are nodes in the graph. If ebunch is None then all non-existent edges in the graph will be used. Default value: None. Returns ------- piter : iterator An iterator of 3-tuples in the form (u, v, p) where (u, v) is a pair of nodes and p is their Jaccard coefficient. Examples -------- >>> import networkx as nx >>> G = nx.complete_graph(5) >>> preds = nx.jaccard_coefficient(G, [(0, 1), (2, 3)]) >>> for u, v, p in preds: ... '(%d, %d) -> %.8f' % (u, v, p) ... '(0, 1) -> 0.60000000' '(2, 3) -> 0.60000000' References ---------- .. [1] D. Liben-Nowell, J. Kleinberg. The Link Prediction Problem for Social Networks (2004). http://www.cs.cornell.edu/home/kleinber/link-pred.pdf """ if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) union_size = len(set(G[u]) | set(G[v])) if union_size == 0: return 0 else: return len(cnbors) / union_size return ((u, v, predict(u, v)) for u, v in ebunch)
def graph_distance(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): if(nx.has_path(G, u, v)): s_path_length = nx.shortest_path_length(G, source = u, target = v) return (-1) * s_path_length else: return -100 return ((u, v, predict(u, v)) for u, v in ebunch)
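# Added usage sketch (not from the source): on a path graph the scores produced
# by graph_distance above are simply negated shortest-path lengths, with -100
# returned for unreachable pairs; assumes networkx as nx.
import networkx as nx

path = nx.path_graph(5)
print(sorted(graph_distance(path), key=lambda t: t[2], reverse=True))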
def resource_allocation_index(G, ebunch=None): r"""Compute the resource allocation index of all node pairs in ebunch. Resource allocation index of `u` and `v` is defined as .. math:: \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{|\Gamma(w)|} where :math:`\Gamma(u)` denotes the set of neighbors of `u`. Parameters ---------- G : graph A NetworkX undirected graph. ebunch : iterable of node pairs, optional (default = None) Resource allocation index will be computed for each pair of nodes given in the iterable. The pairs must be given as 2-tuples (u, v) where u and v are nodes in the graph. If ebunch is None then all non-existent edges in the graph will be used. Default value: None. Returns ------- piter : iterator An iterator of 3-tuples in the form (u, v, p) where (u, v) is a pair of nodes and p is their resource allocation index. Examples -------- >>> import networkx as nx >>> G = nx.complete_graph(5) >>> preds = nx.resource_allocation_index(G, [(0, 1), (2, 3)]) >>> for u, v, p in preds: ... '(%d, %d) -> %.8f' % (u, v, p) ... '(0, 1) -> 0.75000000' '(2, 3) -> 0.75000000' References ---------- .. [1] T. Zhou, L. Lu, Y.-C. Zhang. Predicting missing links via local information. Eur. Phys. J. B 71 (2009) 623. http://arxiv.org/pdf/0901.0553.pdf """ if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): return sum(1 / G.degree(w) for w in nx.common_neighbors(G, u, v)) return ((u, v, predict(u, v)) for u, v in ebunch)
def cosine_similarity(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) cosine_val = math.sqrt(G.degree(u) * G.degree(v)) if cosine_val == 0: return 0 else: return len(cnbors) / cosine_val return ((u, v, predict(u, v)) for u, v in ebunch)
def adamic_adar_index(G, ebunch=None): r"""Compute the Adamic-Adar index of all node pairs in ebunch. Adamic-Adar index of `u` and `v` is defined as .. math:: \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\log |\Gamma(w)|} where :math:`\Gamma(u)` denotes the set of neighbors of `u`. Parameters ---------- G : graph NetworkX undirected graph. ebunch : iterable of node pairs, optional (default = None) Adamic-Adar index will be computed for each pair of nodes given in the iterable. The pairs must be given as 2-tuples (u, v) where u and v are nodes in the graph. If ebunch is None then all non-existent edges in the graph will be used. Default value: None. Returns ------- piter : iterator An iterator of 3-tuples in the form (u, v, p) where (u, v) is a pair of nodes and p is their Adamic-Adar index. Examples -------- >>> import networkx as nx >>> G = nx.complete_graph(5) >>> preds = nx.adamic_adar_index(G, [(0, 1), (2, 3)]) >>> for u, v, p in preds: ... '(%d, %d) -> %.8f' % (u, v, p) ... '(0, 1) -> 2.16404256' '(2, 3) -> 2.16404256' References ---------- .. [1] D. Liben-Nowell, J. Kleinberg. The Link Prediction Problem for Social Networks (2004). http://www.cs.cornell.edu/home/kleinber/link-pred.pdf """ if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): return sum(1 / math.log(G.degree(w)) for w in nx.common_neighbors(G, u, v)) return ((u, v, predict(u, v)) for u, v in ebunch)
def lhn(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) mult_val = G.degree(u) * G.degree(v) if mult_val == 0: return 0 else: return len(cnbors)/ mult_val return ((u, v, predict(u, v)) for u, v in ebunch)
def hdi(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) max_val = max(G.degree(u), G.degree(v)) if max_val == 0: return 0 else: return len(cnbors) / max_val return ((u, v, predict(u, v)) for u, v in ebunch)
def sorensen(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors_len = len(list(nx.common_neighbors(G, u, v))) denomi = G.degree(u) + G.degree(v) if denomi == 0: return 0 else: return (2*cnbors_len) / denomi return ((u, v, predict(u, v)) for u, v in ebunch)
def jaccard_coefficient(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) union_size = len(set(G[u]) | set(G[v])) if union_size == 0: return 0 else: return len(cnbors) / union_size return ((u, v, predict(u, v)) for u, v in ebunch)
def resource_allocation_index(G, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) def predict(u, v): cnbors = list(nx.common_neighbors(G, u, v)) sum_cn = 0 for w in cnbors: if not G.degree(w) == 0: #print("debug") sum_cn += 1/math.fabs(G.degree(w)) return sum_cn return ((u, v, predict(u, v)) for u, v in ebunch)
def excluded(self): """Get set of links that should not be predicted""" exclude = self.config['exclude'] if not exclude: return set() # No nodes are excluded elif exclude == 'old': return set(self.training.edges_iter()) elif exclude == 'new': return set(nx.non_edges(self.training)) raise LinkPredError("Value '{}' for exclude is unexpected. Use either " "'old', 'new' or empty string '' (for no " "exclusions)".format(exclude))
def jaccard_mp_predictions(G): """ Create a ranked list of possible new links based on the Jaccard similarity, defined as the intersection of nodes divided by the union of nodes parameters G: Directed or undirected nx graph returns list of linkbunches with the score as an attribute """ pool = mp.Pool(processes=4) G_undirected = nx.Graph(G) results = pool.map(jaccard_prediction, nx.non_edges(G_undirected)) return results
def preferential_attachment(G, ebunch=None): r"""Compute the preferential attachment score of all node pairs in ebunch. Preferential attachment score of `u` and `v` is defined as .. math:: |\Gamma(u)| |\Gamma(v)| where :math:`\Gamma(u)` denotes the set of neighbors of `u`. Parameters ---------- G : graph NetworkX undirected graph. ebunch : iterable of node pairs, optional (default = None) Preferential attachment score will be computed for each pair of nodes given in the iterable. The pairs must be given as 2-tuples (u, v) where u and v are nodes in the graph. If ebunch is None then all non-existent edges in the graph will be used. Default value: None. Returns ------- piter : iterator An iterator of 3-tuples in the form (u, v, p) where (u, v) is a pair of nodes and p is their preferential attachment score. Examples -------- >>> import networkx as nx >>> G = nx.complete_graph(5) >>> preds = nx.preferential_attachment(G, [(0, 1), (2, 3)]) >>> for u, v, p in preds: ... '(%d, %d) -> %d' % (u, v, p) ... '(0, 1) -> 16' '(2, 3) -> 16' References ---------- .. [1] D. Liben-Nowell, J. Kleinberg. The Link Prediction Problem for Social Networks (2004). http://www.cs.cornell.edu/home/kleinber/link-pred.pdf """ if ebunch is None: ebunch = nx.non_edges(G) return ((u, v, G.degree(u) * G.degree(v)) for u, v in ebunch)
def randomAnony(g, k, *li): """Randomly delete and add k edges in g""" import random if g.number_of_edges() >= k: delEdges = random.sample(g.edges(), k) outStr = "Randomly delete " + str(k) + " edges:" + "\n" + str(delEdges) + "\n" g.remove_edges_from(delEdges) noEdges = list(nx.non_edges(g)) # This is an inefficient method!!! if len(noEdges) > k: addEdges = random.sample(noEdges, k) g.add_edges_from(addEdges) outStr = outStr + "Randomly add " + str(k) + " edges:" + "\n" + str(addEdges) + "\n" if li: # Display the del/add edges on TxtCtr sc = li[0] sc.SetValue(outStr)
def Prediction_Experiment(G, Predictor, Probe_Set, Top_L, Deleted_Ratio): print "Prediction_Experiment!" #Get Evaluation Link Set-------- #Top_L = (G.number_of_edges() - 0) / Top_k #The top proportion 1/Top_k of edges are considered #Probe_Set = Probe_Set_Correspond_Training(G, Top_L, fpname) #****Get the probe set for evaluation***** #Get Ranking List with different deleted links ratio---------- Edge_Num = float(G.number_of_edges()) '''AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set)''' Unobserved_links = nx.non_edges(G) Non_existing_links = list(set(Unobserved_links).difference(set(Probe_Set))) AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set, Non_existing_links) Rank_List_Set = Prediction_LinkScores_Ratio(G, Predictor, Deleted_Ratio, 50, 30) #Prediction_LinkScores_Ratio(G, Predictor, Proportion, Toleration, Predict_Gap) #----Performance Evaluation with Precision under different Training Data Ratio---- Precision_Set = [] X_Set = [] Coefficient_Set = [] Avg_PathLen_Set = [] for key in sorted(Rank_List_Set.keys()): Rank_List_Sorted = sorted(Rank_List_Set[key][0], key=lambda edge: edge[2], reverse=True) Top_L_Rank_List = Rank_List_Sorted[0:Top_L] Coefficient_Set.append(Rank_List_Set[key][1]) Avg_PathLen_Set.append(Rank_List_Set[key][2]) #AUC_Set.append(Rank_List_Set[key][3]) #print key, Performance_Evaluation_Precision(Top_L_Rank_List, Probe_Set) X_Set.append(float(key)/Edge_Num) Precision_Set.append(Performance_Evaluation_Precision(Top_L_Rank_List, Probe_Set)) ''' #Draw Curve Graph if key%100 == 0: data = [] for edge in Rank_List_Sorted: data.append(edge[2]) matploit(data) ''' #end for print "*Different deleted links ratio:", X_Set print "*Precision_Set with different deleted links ratio:", Precision_Set print "*Coefficient_Set:", Coefficient_Set print "*Avg_PathLen_Set:", Avg_PathLen_Set print "*AUC Value:", AUC return 1
def add_random_edges(G, pct): """Add `n` random edges to G (`n` = fraction of current edge count) Parameters ---------- G : a networkx.Graph the network pct : float A percentage (between 0 and 1) """ assert_is_percentage(pct) m = G.size() to_add = int(m * pct) log.debug("Will add %d edges to %d (%f)", to_add, m, pct) new_edges = set(nx.non_edges(G)) G.add_edges_from(random.sample(new_edges, to_add), weight=1)
def _apply_prediction(G, func, ebunch=None): """Applies the given function to each edge in the specified iterable of edges. `G` is an instance of :class:`networkx.Graph`. `func` is a function on two inputs, each of which is a node in the graph. The function can return anything, but it should return a value representing a prediction of the likelihood of a "link" joining the two nodes. `ebunch` is an iterable of pairs of nodes. If not specified, all non-edges in the graph `G` will be used. """ if ebunch is None: ebunch = nx.non_edges(G) return ((u, v, func(u, v)) for u, v in ebunch)
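# Added illustration (an assumption, not from the source): with the
# _apply_prediction helper above, a predictor such as the Jaccard coefficient
# reduces to a small scoring function over node pairs; `jaccard_via_apply` is a
# hypothetical name introduced here, and networkx is assumed imported as nx.
import networkx as nx

def jaccard_via_apply(G, ebunch=None):
    def score(u, v):
        union_size = len(set(G[u]) | set(G[v]))
        return 0 if union_size == 0 else len(list(nx.common_neighbors(G, u, v))) / union_size
    return _apply_prediction(G, score, ebunch)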
def Drift_Prediction_Experiment(G, Predictor, Probe_Set, Top_L, Deleted_Ratio): print "Drift_Prediction_Experiment!" Edge_Num = float(G.number_of_edges()) #AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set) Unobserved_links = nx.non_edges(G) #Unobserved_links = list(Unobserved_links) #print Unobserved_links #print Probe_Set Non_existing_links = list(set(Unobserved_links).difference(set(Probe_Set))) AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set, Non_existing_links) #***Prediction with different training set proportion*** t1 = time.time() Rank_List_Set = Prediction_LinkScores_Ratio(G, Predictor, Deleted_Ratio, 50, 30) #Prediction_LinkScores_Ratio(G, Predictor, Proportion, Toleration, Predict_Gap) t2 = time.time() print "Prediction index time",t2-t1 #----Performance Evaluation with Precision under different Training Data Ratio---- Precision_Set = [] X_Set = [] Coefficient_Set = [] Avg_PathLen_Set = [] for key in sorted(Rank_List_Set.keys()): Rank_List_Sorted = sorted(Rank_List_Set[key][0], key=lambda edge: edge[2], reverse=True) Top_L_Rank_List = Rank_List_Sorted[0:Top_L] Coefficient_Set.append(Rank_List_Set[key][1]) Avg_PathLen_Set.append(Rank_List_Set[key][2]) X_Set.append(float(key)/Edge_Num) Precision_Set.append(Performance_Evaluation_Precision(Top_L_Rank_List, Probe_Set)) #end for print "*Drift_Different deleted links ratio:", X_Set print "*Drift_Precision_Set with different deleted links ratio:", Precision_Set print "*Drift_Coefficient_Set:", Coefficient_Set print "*Drift_Avg_PathLen_Set:", Avg_PathLen_Set print "*Drift_AUC Value:", AUC return 1
def jaccard_predictions(G): """ Create a ranked list of possible new links based on the Jaccard similarity, defined as the intersection of nodes divided by the union of nodes parameters G: Directed or undirected nx graph returns list of linkbunches with the score as an attribute """ potential_edges = [] for non_edge in nx.non_edges(G): u = set(G.neighbors(non_edge[0])) v = set(G.neighbors(non_edge[1])) if len(u.union(v)) == 0: s = 0.0 else: s = (1.0*len(u.intersection(v)))/len(u.union(v)) non_edge = non_edge + ({'score': s},) potential_edges.append(non_edge) return potential_edges
def prediction(self): highest_betweenness = dict() if not self.betweeness_value: for community in self.communities: print("Getting betweenness for community {}".format(community)) subgraph = nx.subgraph(self.graph, self.communities[community]) highest_betweenness[ community] = self._LinkWithBetweenness__get_betweenness( subgraph) print("Betweenness done") write_dict_to_json(highest_betweenness, self.filename, "../betweenness/") print("Betweenness values written") else: print("Betweenness provided") highest_betweenness = self.betweeness_value highest_betweenness_left = highest_betweenness["0"] highest_betweenness_right = highest_betweenness["1"] non_connected_nodes = list(nx.non_edges(self.graph)) n_possible_new_connections = len(non_connected_nodes) non_connected_nodes = list( filter( lambda x: (x[0] in highest_betweenness_right and x[1] in highest_betweenness_left) or (x[0] in highest_betweenness_left and x[1] in highest_betweenness_right), non_connected_nodes)) ranked_betweenness_nodes = self._LinkWithBetweenness__get_highest_betweenness( non_connected_nodes, highest_betweenness_left, highest_betweenness_right) if not self.values: algorithm = None print("Combining betweenness with {}".format( self.algorithm.lower())) if self.algorithm.upper() == TypeOfAlgorithm.ADAMIC_ADAR.value: algorithm = nx.adamic_adar_index elif self.algorithm == TypeOfAlgorithm.JACCARD_COEFFICIENT.value: algorithm = nx.jaccard_coefficient elif self.algorithm == TypeOfAlgorithm.RESOURCE_ALLOCATION.value: algorithm = nx.resource_allocation_index elif self.algorithm == TypeOfAlgorithm.PREFERENTIAL_ATTACHMENT.value: algorithm = nx.preferential_attachment ranked_similarity_nodes = list( sorted(algorithm(self.graph, non_connected_nodes), key=lambda element: element[2], reverse=True)) write_dict_to_json({"values": ranked_similarity_nodes}, self.filename, f"../{self.algorithm.lower()}/") print(f"{self.algorithm} values written") else: print(f"{self.algorithm.lower()} provided") ranked_similarity_nodes = list(self.values.values())[0] ranked_similarity_nodes = list(map(tuple, ranked_similarity_nodes)) scores = self.__combine_scores(ranked_betweenness_nodes, ranked_similarity_nodes) scores = { k: v for k, v in sorted( scores.items(), key=lambda item: item[1], reverse=True) } max_links = len(scores) number_edges = round(self.k, self.n_edges) all_possible_new_edges = scores.keys() if number_edges < max_links: edges_to_add = islice(all_possible_new_edges, number_edges) else: edges_to_add = all_possible_new_edges self.percentage_edges_added = self.k print("% of edges added: {}".format(self.percentage_edges_added)) print("Adding {} edges".format(number_edges)) for edge in edges_to_add: self.link_nodes(edge[0], edge[1])
def MI3(G): #G = nx.read_edgelist(graph_file) #G = nx.read_edgelist(graph_file, nodetype=int)# cast node labels to int so labels and indices correspond node_num = nx.number_of_nodes(G) edge_num = nx.number_of_edges(G) nodes = nx.nodes(G) beta = -math.log2(0.0001) # First compute $P(L^1_{xy})$; it is not needed for every node pair, only for every distinct pair of degrees nodes_Degree_dict = {} degree_list = [] for v in nodes: nodes_Degree_dict[v] = nx.degree(G, v) degree_list.append(nx.degree(G, v)) #degree_list = [nx.degree(G, v) for v in range(node_num)]# indices correspond one-to-one to node labels distinct_degree_list = list(set(degree_list)) size = len(distinct_degree_list) self_Connect_dict = {} for x in range(size): k_x = distinct_degree_list[x] for y in range(x, size): k_y = distinct_degree_list[y] p0 = 1 (k_n, k_m) = pair(k_x, k_y) a = edge_num + 1 b = edge_num - k_m + 1 for i in range(1, k_n + 1): p0 *= (b - i) / (a - i) # end for if p0 == 1: self_Connect_dict[(k_n, k_m)] = beta self_Connect_dict[(k_m, k_n)] = beta else: self_Connect_dict[(k_n, k_m)] = -math.log2(1 - p0) self_Connect_dict[(k_m, k_n)] = -math.log2(1 - p0) # Mutual information that two vertices sharing z as a common neighbor are linked #mutual_info_list = [0 for z in range(node_num)] self_Conditional_dict = {} for z in nodes: k_z = nodes_Degree_dict[z] if k_z > 1: alpha = 2 / (k_z * (k_z - 1)) cc_z = nx.clustering(G, z) if cc_z == 0: log_c = beta else: log_c = -math.log2(cc_z) # end if s = 0 neighbor_list = nx.neighbors(G, z) size = len(neighbor_list) for i in range(size): m = neighbor_list[i] for j in range(i + 1, size): n = neighbor_list[j] if i != j: s += (self_Connect_dict[(nodes_Degree_dict[m], nodes_Degree_dict[n])] - log_c) self_Conditional_dict[z] = alpha * s sim_dict = {} # dictionary storing the similarity scores ebunch = nx.non_edges(G) i = 0 for x, y in ebunch: s = 0 #(k_x, k_y) = pair(degree_list[x], degree_list[y]) for z in nx.common_neighbors(G, x, y): s += self_Conditional_dict[z] sim_dict[(x, y)] = s - self_Connect_dict[(nodes_Degree_dict[x], nodes_Degree_dict[y])] #sim_dict[(y, x)] = s - self_Connect_dict[(degree_list[x], degree_list[y])] # end if # end for print(sim_dict) return sim_dict
def new_connections_predictions(): # Your Code Here #edges metrics: common neighbors, #get common neighbors n_common_neighbors = [((e[0], e[1]), len(sorted(nx.common_neighbors(G, e[0], e[1])))) for e in nx.non_edges(G)] #Jaccard coefficient jaccard_coe = [((e[0], e[1]), e[2]) for e in nx.jaccard_coefficient(G)] #research allocation resource_allocation = [((e[0], e[1]), e[2]) for e in nx.resource_allocation_index(G)] #adamic_adar index adami_adar = [((e[0], e[1]), e[2]) for e in nx.adamic_adar_index(G)] #preferential attachement pref_attachement = [((e[0], e[1]), e[2]) for e in nx.preferential_attachment(G)] def convert_score_to_series(tupples): index = [edge[0] for edge in tupples] scores = [edge[1] for edge in tupples] scores = pd.Series(scores, index=index) return scores n_common_neighbors = convert_score_to_series(n_common_neighbors) jaccard_coe = convert_score_to_series(jaccard_coe) resource_allocation = convert_score_to_series(resource_allocation) adami_adar = convert_score_to_series(adami_adar) pref_attachement = convert_score_to_series(pref_attachement) non_edges_df = pd.concat([ n_common_neighbors, jaccard_coe, resource_allocation, adami_adar, pref_attachement ], axis=1) non_edges_df.columns = [ 'n_common_neighbors', 'jaccard_coe', 'resource_allocation', 'adami_adar', 'pref_attachement' ] non_edges_df = non_edges_df.join(future_connections, how='outer') validation = non_edges_df[non_edges_df['Future Connection'].isnull()] training = non_edges_df[non_edges_df['Future Connection'].notnull()] y = training['Future Connection'] x = training.drop(['Future Connection'], axis=1) validation = validation.drop(['Future Connection'], axis=1) from sklearn.ensemble import GradientBoostingClassifier from sklearn.model_selection import GridSearchCV from sklearn.linear_model import LogisticRegression from sklearn.metrics import auc gbc = GradientBoostingClassifier() lr = LogisticRegression() # parameters = {'n_estimators' : [100, 200, 300], # 'max_depth' : [3,5,10], # 'random_state' : [42] # } parameters = {'penalty': ['l1', 'l2'], 'C': [1, 2], 'random_state': [42]} gs = GridSearchCV(lr, parameters, scoring='roc_auc', cv=10) gs.fit(x, y) prediction = gs.predict_proba(validation)[:, 1] prediction = pd.Series(prediction, index=validation.index) return prediction # Your Answer Here
def _bonds_from_names(graph, resname, nodes, force_field): """Add edges between `nodes` in `graph` based on atom names. Adds edges to `graph`, assuming the nodes in `nodes` constitute a residue with residue name `resname`, which can be found among the `force_field` blocks. Edges will be added as they are in the reference Block. In addition, all non-edges in the Block will be generated and returned. Parameters ---------- graph: networkx.Graph resname: str nodes: collections.abc.Iterable[collections.abc.Hashable] Should be node keys in `graph` force_field: vermouth.forcefield.ForceField Force field in which to look for the block with name `resname` Raises ------ KeyError If `resname` is not one of the blocks known to `force_field`; or when a residue contains duplicate atom names. Returns ------- Set[Frozenset[collections.abc.Hashable, collections.abc.Hashable]] All non-edges found in the block, with node keys from `graph`. """ block = force_field.blocks.get(resname) if not block: raise KeyError("Residue {} is not known to force field {}" "".format(resname, force_field.name)) mol_name_to_idx = defaultdict(set) for graph_idx in nodes: if 'atomname' in graph.nodes[graph_idx]: mol_name_to_idx[graph.nodes[graph_idx]['atomname']].add(graph_idx) mol_name_to_idx = dict(mol_name_to_idx) for name, graph_idxs in mol_name_to_idx.items(): if len(graph_idxs) > 1: raise KeyError("Residue has multiple atoms with atom name {}" "".format(name)) mol_name_to_idx[name] = mol_name_to_idx[name].pop() for block_idx, block_jdx in block.edges: block_idx_name = block.nodes[block_idx]['atomname'] block_jdx_name = block.nodes[block_jdx]['atomname'] if block_idx_name in mol_name_to_idx and block_jdx_name in mol_name_to_idx: graph_idx = mol_name_to_idx[block_idx_name] graph_jdx = mol_name_to_idx[block_jdx_name] pos1 = np.array(graph.nodes[graph_idx].get('position', np.full(3, np.nan))) pos2 = np.array(graph.nodes[graph_jdx].get('position', np.full(3, np.nan))) dist = np.sqrt(np.sum((pos1 - pos2)**2)) graph.add_edge(graph_idx, graph_jdx, distance=dist) non_edges = set() for block_idx, block_jdx in nx.non_edges(block): block_idx_name = block.nodes[block_idx]['atomname'] block_jdx_name = block.nodes[block_jdx]['atomname'] if block_idx_name in mol_name_to_idx and block_jdx_name in mol_name_to_idx: non_edges.add(frozenset((mol_name_to_idx[block_idx_name], mol_name_to_idx[block_jdx_name]))) return non_edges
def devide(category, dataname, ratio): print dataname G = nx.read_weighted_edgelist('./data/' + category + '/' + dataname + '.txt', nodetype=int) nonit = nx.non_edges(G) n = nx.number_of_nodes(G) n = n * (n - 1) / 2 nonedge = n - nx.number_of_edges(G) e = nx.number_of_edges(G) e = long(ratio * e) count = 0 nonedgechoose = [] while (True): tmp = np.random.random_integers(0, nonedge) if tmp not in nonedgechoose: nonedgechoose.append(tmp) count = count + 1 if count >= e: break nonedgechoose.sort() count = 0 G_neg = nx.Graph() for i in nonedgechoose: while count < i: next(nonit) count = count + 1 G_neg.add_edge(*next(nonit)) count = count + 1 it = nx.edges(G) n = nx.number_of_edges(G) n = long(n * ratio) count = 0 edgechoose = [] while (True): tmp = np.random.random_integers(0, nx.number_of_edges(G)) if tmp not in edgechoose: edgechoose.append(tmp) count = count + 1 if count >= n: break edgechoose.sort() G_train = nx.Graph() G_pos = nx.Graph() count = 0 index = 0 print len(edgechoose) for edge in it.data(False): if index >= len(edgechoose): G_train.add_edge(*edge) continue if count != edgechoose[index]: G_train.add_edge(*edge) count = count + 1 continue G_pos.add_edge(*edge) count = count + 1 index = index + 1 G_train = nx.DiGraph(G_train) nx.write_edgelist(G_train, 'dividedata/' + category + '/' + dataname + '.txt', data=False) nx.write_edgelist(G_pos, 'dividedata/' + category + '/' + dataname + '_pos.txt', data=False) nx.write_edgelist(G_neg, 'dividedata/' + category + '/' + dataname + '_neg.txt', data=False) print 'end'
#Graph edges in list form #Medges = [i for i in M.edges()] #Layout #pos=nx.fruchterman_reingold_layout(M, dim=2) N = len(M.nodes()) labels = [i[1]['name'] for i in M.nodes(data=True)] # ###################### Evolution ######################### import operator # Common Neighbors CN = [(e[0], e[1], len(list(nx.common_neighbors(M, e[0], e[1])))) for e in nx.non_edges(M)] CN.sort(key=operator.itemgetter(2), reverse=True) # Jaccard coef jaccard = list(nx.jaccard_coefficient(M)) jaccard.sort(key=operator.itemgetter(2), reverse=True) # Resource Allocation index RA = list(nx.resource_allocation_index(M)) RA.sort(key=operator.itemgetter(2), reverse=True) # Adamic-Adar index AA = list(nx.adamic_adar_index(M)) AA.sort(key=operator.itemgetter(2), reverse=True) # Preferential Attachement
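# Presumed continuation (added): the snippet above breaks off after the
# "Preferential Attachement" heading; completing it in the same pattern as the
# other measures would look like this.
PA = list(nx.preferential_attachment(M))
PA.sort(key=operator.itemgetter(2), reverse=True)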
def main(): print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #specify initial arguments for all functions manager = multiprocessing.Manager() network = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() nonexist_edges = manager.list(list(nx.non_edges(network))) nonexist_edges = list(nx.non_edges(network)) # non-existent edges in graph iterations = 100 steps = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95] file = 'Results/Trametinib/NETS_Tram_' pool = multiprocessing.Pool(processes=4) # set up pool #Degree Product func = partial(DPFracAUC, network, nonexist_edges, iterations) DPres = pool.map(func, steps) print 'Finished running Degree Product' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'DP.json', 'w') as fout: json.dump(DPres, fout) #Shortest Path func2 = partial(SPFracAUC, network, nonexist_edges, iterations) SPres = pool.map(func2, steps) print 'Finished running Shortest Path' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'SP.json', 'w') as fout: json.dump(SPres, fout) #Common Neighbors func3 = partial(CNFracAUC, network, nonexist_edges, iterations) CNres = pool.map(func3, steps) print 'Finished running Common Neighbors' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'CN.json', 'w') as fout: json.dump(CNres, fout) #Jaccard func4 = partial(JFracAUC, network, nonexist_edges, iterations) Jres = pool.map(func4, steps) print 'Finished running Jaccard Index' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'J.json', 'w') as fout: json.dump(Jres, fout) #Sorensen Similarity func5 = partial(SSFracAUC, network, nonexist_edges, iterations) SSres = pool.map(func5, steps) print 'Finished running Sorensen Similarity' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'SS.json', 'w') as fout: json.dump(SSres, fout) #Leicht-Holme-Newman func6 = partial(LHNFracAUC, network, nonexist_edges, iterations) LHNres = pool.map(func6, steps) print 'Finished running Leicht-Holme-Newman' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'LHN.json', 'w') as fout: json.dump(LHNres, fout) #Adamic Adar func7 = partial(AAFracAUC, network, nonexist_edges, iterations) AAres = pool.map(func7, steps) print 'Finished running Adamic Adar' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'AA.json', 'w') as fout: json.dump(AAres, fout) #Resource Allocation func8 = partial(RAFracAUC, network, nonexist_edges, iterations) RAres = pool.map(func8, steps) print 'Finished running Resource Allocation' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'RA.json', 'w') as fout: json.dump(RAres, fout) #Katz func9 = partial(KFracAUC, network, nonexist_edges, iterations) Kres = pool.map(func9, steps) print 'Finished running Katz' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) #write dictionary to json file with open(str(file) + 'K.json', 'w') as fout: json.dump(Kres, fout) # #Simrank # func10 = partial(SFracAUC, network, nonexist_edges, iterations) # Sres = pool.map(func10, steps) # print 'Finished running SimRank' # # write dictionary to json file # with open(str(file) + 'SR.json', 'w') as fout: # json.dump(Sres, fout) # Rooted Page Rank func11 = partial(PRFracAUC, network, nonexist_edges, iterations) RPRres = pool.map(func11, steps) print 'Finished running Rooted Page Rank' print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) # write dictionary to json file with open(str(file) + 'RPR.json', 'w') as fout: json.dump(RPRres, fout) pool.close() pool.join() print str('Finished running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
def preferential_attachment_score(graph): non_edges = nx.non_edges(graph) return ((u, v, graph.degree(u) * graph.degree(v)) for u, v in non_edges)
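# Added usage sketch (not from the source): score every missing edge of a small
# graph with preferential_attachment_score above and keep the three strongest
# candidates; assumes networkx is imported as nx.
import networkx as nx

g_demo = nx.path_graph(6)
print(sorted(preferential_attachment_score(g_demo), key=lambda t: t[2], reverse=True)[:3])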
def star_with_extra_edges(N, M): g = nx.star_graph(N) g.add_edges_from(random.sample(list(nx.non_edges(g)), M - len(g.edges()))) return g
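# Added usage sketch: pad a 10-leaf star (11 nodes, 10 edges) up to 15 edges in
# total; assumes `random` and networkx (as nx) are imported, as the function
# above requires.
import random
import networkx as nx

g_star = star_with_extra_edges(10, 15)
assert g_star.number_of_edges() == 15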
data1 = data[data[:][6] > 0] data2 = data1.iloc[:, [0, 1]] data3 = data2.drop_duplicates() # data3.to_csv('./edges.csv',index=False) # In[4]: G = nx.read_edgelist('./edges.csv', delimiter=',', create_using=nx.Graph()) # nodes=pd.DataFrame(list(G.nodes())) # # nodes.to_csv('./all_nodes.csv',index=False) G08 = nx.read_edgelist('./edges08.csv', delimiter=',', create_using=nx.Graph()) G09 = nx.read_edgelist('./edges09.csv', delimiter=',', create_using=nx.Graph()) G08.add_nodes_from(G.nodes(data=True)) G09.add_nodes_from(G.nodes(data=True)) edges08 = pd.DataFrame(list(G08.edges())) edges09 = pd.DataFrame(list(G09.edges())) non_edges08 = pd.DataFrame(nx.non_edges(G08)) non_edges09 = pd.DataFrame(nx.non_edges(G09)) edges08.columns = ['Departure', 'locationID'] edges09.columns = ['Departure', 'locationID'] non_edges08.columns = ['Departure', 'locationID'] non_edges09.columns = ['Departure', 'locationID'] edges08['label'] = 1 edges09['label'] = 1 non_edges08['label'] = 0 non_edges09['label'] = 0 train08 = pd.concat([edges08, non_edges08]) test09 = pd.concat([edges09, non_edges09]) # In[5]: train_data = np.array(train08)
def make_pairs_with_edges(self, label_graph, target_positive_ratio=.5, enforce_non_edge=True, enforce_has_embeddings=False): """ Generate a dataframe with a fixed ratio of positives to negatives by requiring all new edges in label_graph to appear in the dataframe. :param label_graph: The graph to check for new edges :param target_positive_ratio: Ratio of positive to negative (default=.5) :return: A list of tuples containing target_positive_ratio edges to non-edges """ pairs = [] pairs_dict = defaultdict(bool) edges = 0 if target_positive_ratio == 0: # We want all the pairs from label_graph # todo: do we need pairs_dict for this part for u, v in label_graph.nx_graph.edges_iter(): if enforce_has_embeddings: if u not in self.embeddings or v not in self.embeddings: continue edges += 1 pairs.append((u, v)) for u, v in nx.non_edges(label_graph.nx_graph): if enforce_has_embeddings: if u not in self.embeddings or v not in self.embeddings: continue pairs.append((u, v)) print("\t%d edges out of %d pairs" % (edges, len(pairs))) return pairs for u, v in label_graph.nx_graph.edges_iter(): if enforce_has_embeddings and not self.embeddings: print("No embeddings found! Error!") return if enforce_has_embeddings: if u not in self.embeddings or v not in self.embeddings: continue if (enforce_non_edge and not self.nx_graph.has_edge(u, v)) or not enforce_non_edge: u, v = sorted((u, v)) if not pairs_dict[(u, v)]: pairs_dict[(u, v)] = True pairs.append((u, v)) edges += 1 nodes = self.embeddings.keys() added = 0 rejected = 0 while float(edges) / len(pairs) > target_positive_ratio: u = nodes[int(random.random() * len(nodes))] v = nodes[int(random.random() * len(nodes))] if label_graph.nx_graph.has_edge(u, v) or u == v: rejected += 1 continue if enforce_has_embeddings: if u not in self.embeddings or v not in self.embeddings: rejected += 1 continue (u, v) = sorted((u, v)) if not pairs_dict[(u, v)]: pairs_dict[(u, v)] = True pairs.append((u, v)) added += 1 return pairs
def LP(graph_file, out_file, sim_method, t, p): G = nx.read_edgelist(graph_file, nodetype=int) #G = G.to_undirected() #G = nx.convert_node_labels_to_integers(G) # for debug # print(nx.nodes(G)) node_num = nx.number_of_nodes(G) edge_num = nx.number_of_edges(G) # list all non-existent links and store them in non_edge_list # non_edge_num = (node_num * (node_num - 1)) / 2 - edge_num non_edge_list = [pair(u, v) for u, v in nx.non_edges(G)] non_edge_num = len(non_edge_list) # for debug print("V: %d\tE: %d\tNon: %d" % (node_num, edge_num, non_edge_num)) # for debug # print(len(non_edge_list)) # print(non_edge_list) # run t independent trials; each time pick p*100% of the links of G as the test set and use the rest as the training set test_num = int(edge_num * p) pre_num = 0 for l in range(2, 101, 2): if l < 20: pre_num += 1 else: break # end if # end for pre_num += 1 # for debug print('test_edge_num: %d' % test_num) # arrays holding the performance values auc_list = [] rs_list = [] time_list = [] pre_matrix = [[0 for it in range(t)] for num in range(pre_num)] # run the test t times for it in range(t): if it % 10 == 0: print('turn: %d' % it) # end if # first generate a batch of random numbers seed = math.sqrt(edge_num * node_num) + math.pow( (1 + it) * 10, 3) # random seed random.seed(seed) rand_set = set(random.sample(range(edge_num), test_num)) # rand_set = set() # i = 0 # while (i < test_num): # r = random.randint(0, edge_num - 1) # if (r not in rand_set): # rand_set.add(r) # i += 1 # # end if # # end while # for debug # print(rand_set) # print(len(rand_set)) # walk over the links of G and split them into training and test sets according to rand_set training_graph = nx.Graph() training_graph.add_nodes_from(range(node_num)) test_edge_list = [] r = 0 for u, v in nx.edges_iter(G): u, v = pair(u, v) # for debug # print(u, v) if r in rand_set: # test link test_edge_list.append((u, v)) else: training_graph.add_edge(u, v) # training network # end if r += 1 # end for training_graph.to_undirected() # for debug # print(len(test_edge_list)) # print(test_edge_list) # print(nx.number_of_edges(training_graph)) # print(nx.number_of_nodes(training_graph)) # print(nx.nodes(training_graph)) # print(nx.edges(training_graph)) # compute similarities # if (it % 10 == 0): # print('computing similarities') start = datetime.datetime.now() sim_dict = similarities(training_graph, sim_method) end = datetime.datetime.now() # 0. running time time_list.append((end - start).microseconds) # 1. compute AUC auc_value = AUC(sim_dict, test_edge_list, non_edge_list) auc_list.append(auc_value) # for debug # print(auc_value) # build a list holding the similarity of each node pair sim_list = [((u, v), s) for (u, v), s in sim_dict.items()] # sim_dict is no longer needed sim_dict.clear() # sort sim_list by similarity in descending order sim_list.sort(key=lambda x: (x[1], x[0]), reverse=True) # 2. compute Ranking Score rank_score = Ranking_score(sim_list, test_edge_list, non_edge_num) rs_list.append(rank_score) # for debug # print(rank_score) # 3. compute the precision list pre_list = Precision(sim_list, test_edge_list, test_num) for num in range(pre_num): pre_matrix[num][it] = pre_list[num] # end for # end for # compute mean and standard deviation and write the results to the output file auc_avg, auc_std = stats(auc_list) print('AUC: %.4f(%.4f)' % (auc_avg, auc_std)) out_file.write('%.4f(%.4f)\t' % (auc_avg, auc_std)) rs_avg, rs_std = stats(rs_list) print('Ranking_Score: %.4f(%.4f)' % (rs_avg, rs_std)) out_file.write('%.4f(%.4f)\t' % (rs_avg, rs_std)) time_avg, time_std = stats(time_list) print('Time: %.4f(%.4f)' % (time_avg, time_std)) out_file.write('%.4f(%.4f)\t' % (time_avg, time_std)) pre_avg_list = [] pre_std_list = [] for num in range(pre_num): pre_avg, pre_std = stats(pre_matrix[num]) pre_avg_list.append(pre_avg) pre_std_list.append(pre_std) # end for print('Precision: ') # out_file.write('\nPrecision: ') for num in range(pre_num): print('%.4f(%.4f)\t' % (pre_avg_list[num], pre_std_list[num])) out_file.write('%.4f(%.4f)\t' % (pre_avg_list[num], pre_std_list[num])) # end for out_file.write('%d\n' % test_num)
def sample_subgraph(graph, offset=0, use_precomp_sizes=False, filter_negs=False, supersample_small_graphs=False, neg_target=None, hard_neg_idxs=None): if neg_target is not None: graph_idx = graph.G.graph["idx"] use_hard_neg = (hard_neg_idxs is not None and graph.G.graph["idx"] in hard_neg_idxs) done = False n_tries = 0 while not done: if use_precomp_sizes: size = graph.G.graph["subgraph_size"] else: if train and supersample_small_graphs: sizes = np.arange(self.min_size + offset, len(graph.G) + offset) ps = (sizes - self.min_size + 2)**(-1.1) ps /= ps.sum() size = stats.rv_discrete(values=(sizes, ps)).rvs() else: d = 1 if train else 0 size = random.randint(self.min_size + offset - d, len(graph.G) - 1 + offset) start_node = random.choice(list(graph.G.nodes)) neigh = [start_node] frontier = list( set(graph.G.neighbors(start_node)) - set(neigh)) visited = set([start_node]) while len(neigh) < size: new_node = random.choice(list(frontier)) assert new_node not in neigh neigh.append(new_node) visited.add(new_node) frontier += list(graph.G.neighbors(new_node)) frontier = [x for x in frontier if x not in visited] if self.node_anchored: anchor = neigh[0] for v in graph.G.nodes: graph.G.nodes[v]["node_feature"] = ( torch.ones(1) if anchor == v else torch.zeros(1)) #print(v, graph.G.nodes[v]["node_feature"]) neigh = graph.G.subgraph(neigh) if use_hard_neg and train: neigh = neigh.copy() if random.random( ) < 1.0 or not self.node_anchored: # add edges non_edges = list(nx.non_edges(neigh)) if len(non_edges) > 0: for u, v in random.sample( non_edges, random.randint(1, min(len(non_edges), 5))): neigh.add_edge(u, v) else: # perturb anchor anchor = random.choice(list(neigh.nodes)) for v in neigh.nodes: neigh.nodes[v]["node_feature"] = (torch.ones(1) if anchor == v else torch.zeros(1)) if (filter_negs and train and len(neigh) <= 6 and neg_target is not None): matcher = nx.algorithms.isomorphism.GraphMatcher( neg_target[graph_idx], neigh) if not matcher.subgraph_is_isomorphic(): done = True else: done = True return graph, DSGraph(neigh)
def generate_pos_neg_links(self): # Select n edges at random (positive samples) n_edges = self.G.number_of_edges() n_nodes = self.G.number_of_nodes() npos = int(self.prop_pos * n_edges) nneg = int(self.prop_neg * n_edges) n_neighbors = [len(list(self.G.neighbors(v))) for v in self.G.nodes()] n_non_edges = n_nodes - 1 - np.array(n_neighbors) non_edges = [e for e in nx.non_edges(self.G)] if VERBOSE: print("\tFinding %d of %d non-edges" % (nneg, len(non_edges))) # Select m pairs of non-edges (negative samples) rnd_inx = self._rnd.choice(len(non_edges), nneg, replace=False) neg_edge_list = [non_edges[ii] for ii in rnd_inx] if len(neg_edge_list) < nneg: raise RuntimeWarning("\tOnly %d negative edges found" % (len(neg_edge_list))) if VERBOSE: print("\tFinding %d positive edges of %d total edges" % (npos, n_edges)) # Find positive edges, and remove them. edges = self.G.edges() edges = list(edges) pos_edge_list = [] n_count = 0 n_ignored_count = 0 rnd_inx = self._rnd.permutation(n_edges) for eii in rnd_inx.tolist(): edge = edges[eii] # Remove edge from graph data = self.G[edge[0]][edge[1]] self.G.remove_edge(*edge) reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0]) if edge[1] not in reachable_from_v1: self.G.add_edge(*edge, **data) n_ignored_count += 1 else: pos_edge_list.append(edge) if VERBOSE: sys.stdout.write("\r" + "\tFound: {} edges".format(n_count + 1)) n_count += 1 if n_count >= npos: break if VERBOSE: sys.stdout.write("\n") edges_num = len(pos_edge_list) self._pos_edge_list = pos_edge_list self._neg_edge_list = neg_edge_list # print('pos_edge_list', len(self._pos_edge_list)) # print('neg_edge_list', len(self._neg_edge_list)) if VERBOSE: print("\tEdge list lengths: Pos: {} Neg: {}".format( len(self._pos_edge_list), len(self._neg_edge_list)))
def generate_pos_neg_links(nx_graph, merge_network, test_para): '''Generate positive and negative sample edges''' Multi_Networks = copy.deepcopy(nx_graph) # train_g = copy.deepcopy(merge_network) selected_layer = random.randint(0, len(Multi_Networks)) train_g = copy.deepcopy(Multi_Networks[selected_layer]) train_ng = Multi_Networks.remove(train_g) # edges that exist in the network exit_edges = list(train_g.edges()) num_exit = len(exit_edges) # edges that do not exist in the network noexit_edges = list(nx.non_edges(train_g)) num_noexit = len(noexit_edges) # shuffle both lists random.shuffle(exit_edges) random.shuffle(noexit_edges) # sampling of positive edges pos_edge_list = [] n_count = 0 edges = exit_edges rnd = np.random.RandomState(seed=None) rnd_inx = rnd.permutation(edges) # generate a permutation of indices from the random seed for eii in rnd_inx: edge = eii # remove this edge data = train_g[edge[0]][edge[1]] train_g.remove_edge(*edge) # test whether the whole network stays connected after removing this existing edge if nx.is_connected(train_g): flag = True for g in Multi_Networks: if edge in g.edges(): gt = copy.deepcopy(g) gt.remove_edge(*edge) if nx.is_connected(gt) == False: del gt flag = False break if flag: for g in Multi_Networks: if edge in g.edges(): g.remove_edge(*edge) pos_edge_list.append(tuple(edge)) n_count += 1 else: train_g.add_edge(*edge, **data) else: train_g.add_edge(*edge, **data) # positively sampled edges if not len(pos_edge_list): # if nothing could be removed this way the split is meaningless, so fall back to randomly selecting a certain number of edges pos_edge_list = exit_edges[:int(len(exit_edges) * test_para)] [ g.remove_edge(*e) for g in Multi_Networks for e in pos_edge_list if e in g.edges() ] [train_g.remove_edge(*e) for e in pos_edge_list] nneg = npos = len(pos_edge_list) else: # determine the number of test edges if len(pos_edge_list) < num_noexit: npos = int(test_para * len(pos_edge_list)) # number of positive samples else: npos = int(test_para * num_noexit) nneg = npos # number of negative samples pos_edge_list = pos_edge_list[:nneg] # negatively sampled edges neg_edge_list = noexit_edges[:nneg] # test edge set and labels test_edges, labels = get_selected_edges(pos_edge_list, neg_edge_list) return Multi_Networks, train_g, pos_edge_list, neg_edge_list, test_edges, labels
def MI(graph_file): G = nx.read_edgelist(graph_file) node_num = nx.number_of_nodes(G) edge_num = nx.number_of_edges(G) print(node_num) print(edge_num) sim_dict = {} # dictionary storing the similarity scores I_pConnect_dict = {} pDisConnect = 1 edges = nx.edges(G) ebunch = nx.non_edges(G) nodes = nx.nodes(G) nodes_Degree_dict = {} for v in nodes: nodes_Degree_dict[v] = nx.degree(G, v) # node degrees are needed repeatedly, so cache them up front # degree_list = [nx.degree(G, v) for v in range(G.number_of_nodes())] # The two loops below compute $P(L^1_{xy})$; in fact only the values $P(L^1_{k_x k_y})$ for distinct degree pairs are needed # ============================================================================= # degree_I_pConnect = {} # for u, v in edges: # ============================================================================= for u, v in edges: uDegree = nodes_Degree_dict[u] vDegree = nodes_Degree_dict[v] for i in range(1, vDegree + 1): pDisConnect = pDisConnect * (((edge_num - uDegree) - i + 1) / (edge_num - i + 1)) pConnect = 1 - pDisConnect if pConnect == 0: I_pConnect = -math.log2(0.0001) else: I_pConnect = -math.log2(pConnect) I_pConnect_dict[(u, v)] = I_pConnect I_pConnect_dict[(v, u)] = I_pConnect pDisConnect = 1 for m, n in ebunch: # ============================================================================= # mDegree = nx.degree(G, m) # nDegree = nx.degree(G, n) # ============================================================================= mDegree = nodes_Degree_dict[m] nDegree = nodes_Degree_dict[n] for i in range(1, nDegree + 1): pDisConnect = pDisConnect * (((edge_num - mDegree) - i + 1) / (edge_num - i + 1)) pConnect = 1 - pDisConnect if pConnect == 0: I_pConnect = -math.log2(0.0001) else: I_pConnect = -math.log2(pConnect) I_pConnect_dict[(m, n)] = I_pConnect I_pConnect_dict[(n, m)] = I_pConnect pDisConnect = 1 ebunchs = nx.non_edges(G) i = 0 # $I(L^1_{xy};z) = I(L^1;z)$ is independent of x and y, so it could be precomputed for u, v in ebunchs: pMutual_Information = 0 I_pConnect = I_pConnect_dict[(u, v)] for z in nx.common_neighbors(G, u, v): neighbor_num = len(list(nx.neighbors(G, z))) neighbor_list = nx.neighbors(G, z) for m in range(len(neighbor_list)): for n in range(m + 1, len(neighbor_list)): if m != n: I_ppConnect = I_pConnect_dict[(neighbor_list[m], neighbor_list[n])] if nx.clustering(G, z) == 0: pMutual_Information = pMutual_Information + ( 2 / (neighbor_num * (neighbor_num - 1))) * ((I_ppConnect) - (-math.log2(0.0001))) else: pMutual_Information = pMutual_Information + ( 2 / (neighbor_num * (neighbor_num - 1))) * ( (I_ppConnect) - (-math.log2(nx.clustering(G, z)))) sim_dict[(u, v)] = -(I_pConnect - pMutual_Information) i = i + 1 #print(i) print(str(u) + "," + str(v)) print(sim_dict[(u, v)]) return sim_dict
sampleTest=sample(g_training.edges(), sizeTestSet) #Graph for the test sample g_test=nx.Graph() g_test.add_edges_from(sampleTest) #remove from g_training the edges in sampleTest g_training.remove_edges_from(sampleTest) print(g_training.edges()) #Finally, convert the remaining edges as a series. samplePositiveTraining=pd.Series(data=g_training.edges()) #3/ To balance the training set, we will randomly pick pairs of unconnected vertices (negative class). #The number of pairs should be equal to the number of considered connections (positive class) in the training set. Find a way to generate this negative training set and name it sampleNegativeTraining. import numpy as np non_edges = list(nx.non_edges(g_training)) sample_num = len(g_training.edges()) sample = sample(non_edges, sample_num) sampleNegativeTraining=pd.Series(data=sample) #add new edges in the training graph based on the negative sample g_training.add_edges_from(sampleNegativeTraining) #5/ Use the following code (and modify it if necessary) to create 2 empty data frames (one for the training set and the other for the test set). import numpy as np sampleTraining = pd.concat([samplePositiveTraining,sampleNegativeTraining,],ignore_index=True) dfTraining_1 = pd.DataFrame((list(sampleTraining)), columns=["target","source"]) dfTraining_2 = pd.DataFrame(np.zeros((sizeTrainingSet, 11)), columns=features)
def read_csv_files(): """ Read data and create MultiDiGraph.Each Node has an id and All edges have 2 attributes. The first is Timestamp and the second is the type of edge (Attacks, Trades, Messages) :return: G, all_dfs, labels """ file_names = glob.glob("../data_users_moves/*.csv") all_dfs = pd.DataFrame(columns=['Timestamp', 'id1', 'id2', 'label']) for file in file_names: print(str(file)) df = pd.read_csv(file, header=None) df.columns = ['Timestamp', 'id1', 'id2'] df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s') # df['date'] = [d.date() for d in df['Timestamp']] # df['time'] = [d.time() for d in df['Timestamp']] if 'attack' in file: rel_type = 'attacks' elif 'trade' in file: rel_type = 'trades' else: rel_type = 'messages' df['type'] = rel_type df['weight'] = 1 df['label'] = 1 all_dfs = pd.concat([all_dfs, df]) graph = nx.from_pandas_edgelist(df=all_dfs, source='id1', target='id2', edge_attr=True, create_using=nx.MultiDiGraph(name='Travian_Graph')) g_undirected = nx.from_pandas_edgelist(df=all_dfs, source='id1', target='id2', edge_attr=True, create_using=nx.Graph(name='Travian_Graph')) # Create negative samples ---! source = all_dfs['id1'].tolist() destination = all_dfs['id2'].tolist() # combine all nodes in a list node_list = source + destination # remove duplicate items from the list node_list = list(dict.fromkeys(node_list)) adj_G = nx.to_numpy_matrix(graph, nodelist=node_list) # get unconnected node-pairs all_unconnected_pairs = [] # print(nx.non_edges(G)) ommisible_links_data = pd.DataFrame(nx.non_edges(graph)).sample(frac=1).reset_index(drop=True) dates = pd.date_range('2009-12-01 00:00:00', '2009-12-31 23:59:59', periods=200000) gen_df = ommisible_links_data.iloc[:200000, :] gen_df.columns = ['id1', 'id2'] gen_df[['id1', 'id2']] = gen_df[['id1', 'id2']].applymap(np.int64) gen_df['Timestamp'] = dates gen_df['label'] = 0 gen_df['weight'] = 1 gen_df['type'] = random.choices(['attacks', 'messages', 'trades'], weights=(50, 25, 25), k=200000) gen_df['Preferential_Attachment'] = 0 gen_df['Resource_allocation'] = 0 # Merge dataset with links that doesnt exist # print(gen_df) labels = {e: graph.edges[e]['type'] for e in graph.edges} return graph, all_dfs, labels, g_undirected, gen_df
for link_id in selected_links_id: selected_links.append(links[link_id]) network_train.remove_edges_from(selected_links) network_test.add_edges_from(selected_links) #####print("network_train.number_of_edges(), network_test.number_of_edges():",network_train.number_of_edges(), network_test.number_of_edges()) #####print("# ## Sampling negative links") k = 2 n_links_train_pos = network_train.number_of_edges() n_links_test_pos = network_test.number_of_edges() n_links_train_neg = k * n_links_train_pos n_links_test_neg = k * n_links_test_pos neg_network = nx.empty_graph(network.number_of_nodes()) links_neg = list(nx.non_edges(network)) neg_network.add_edges_from(links_neg) n_links_neg = neg_network.number_of_edges() ######print("n_links_neg:",n_links_neg) selected_links_neg_id = np.random.choice(np.arange(n_links_neg), size=n_links_train_neg + n_links_test_neg, replace=False) neg_network_train = nx.empty_graph(network.number_of_nodes()) neg_network_test = nx.empty_graph(network.number_of_nodes()) selected_links = [] for i in range(n_links_train_neg):
nx.draw_networkx(g)
d = g.degree()
h = pd.DataFrame(d)[1].hist()
nx.average_clustering(g)
nx.average_shortest_path_length(g)

# Connected Small World Network (retry the Watts-Strogatz construction up to t times
# until it returns a connected network)
g = nx.connected_watts_strogatz_graph(100, 6, 0.04, 50)
nx.draw_networkx(g)

# Newman-Watts (adds new edges instead of rewiring existing ones)
g = nx.newman_watts_strogatz_graph(100, 6, 0.04)
nx.draw_networkx(g)

# Link Prediction
# Common Neighbors
cn = [(x[0], x[1], len(list(nx.common_neighbors(g, x[0], x[1])))) for x in nx.non_edges(g)]

# Jaccard Coefficient (number of common neighbors / size of the union of the two neighborhoods)
jc = list(nx.jaccard_coefficient(g))

# Resource Allocation (sum over the common neighbors of 1 / their degree)
ra = list(nx.resource_allocation_index(g))

# Adamic-Adar Index (like Resource Allocation, but with 1 / log(degree))
aa = list(nx.adamic_adar_index(g))

# Preferential Attachment (product of the two nodes' degrees)
pa = list(nx.preferential_attachment(g))

# Community Common Neighbors (with a bonus for common neighbors in the same community)
g.nodes[0]['community'] = 0
g.nodes[1]['community'] = 1
g.nodes[2]['community'] = 0
g.nodes[3]['community'] = 1
g.nodes[4]['community'] = 1
g.nodes[5]['community'] = 0
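# A hedged aside (not in the original walkthrough): the Soundarajan-Hopcroft community-aware
# measures need a 'community' attribute on *every* node, whereas the lines above only label
# nodes 0-5. This sketch uses a separate graph and arbitrary community ids (node index mod 2).
import networkx as nx

g_demo = nx.newman_watts_strogatz_graph(100, 6, 0.04)
for n in g_demo.nodes():
    g_demo.nodes[n]['community'] = n % 2
# Common Neighbors with a bonus for common neighbors in the same community
cn_sh = list(nx.cn_soundarajan_hopcroft(g_demo))
# Resource Allocation restricted to common neighbors inside the same community
ra_sh = list(nx.ra_index_soundarajan_hopcroft(g_demo))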
# -*- coding: utf-8 -*- import networkx as nx import matplotlib.pyplot as plt G = nx.frucht_graph() G2 = nx.Graph() for e in nx.non_edges(G): G2.add_edge(*e) plt.figure(figsize=(8, 8)) pos = nx.spring_layout(G) nx.draw_networkx_nodes(G, pos=pos) # nx.draw_networkx_labels(G, pos, {0: "1", 1: "2", 2: "3"}) nx.draw_networkx_edges(G2, pos=pos, edge_color="red") nx.draw_networkx_edges(G, pos=pos, edge_color="blue") plt.tight_layout() plt.axis("off") plt.savefig("schema.png") G = nx.Graph() G.add_edge(1, 2) G.add_edge(1, 3) plt.figure(figsize=(3, 3)) pos = nx.spring_layout(G) nx.draw_networkx_nodes(G, pos=pos) nx.draw_networkx_labels(G, pos) nx.draw_networkx_edges(G, pos=pos, edge_color="blue") plt.tight_layout()
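# Side note (sketch): building G2 edge-by-edge from nx.non_edges(G) gives the same edge set
# as the graph complement, so the red edges above could also be drawn from nx.complement(G).
import networkx as nx

G = nx.frucht_graph()
G2 = nx.complement(G)
assert set(map(frozenset, G2.edges())) == set(map(frozenset, nx.non_edges(G)))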
# G = nx.Graph(sub_edges)
G.remove_edges_from(list(nx.selfloop_edges(G)))

# the one fold of YES edges
total_edges = list(G.edges())
np.random.shuffle(total_edges)
l = int(len(total_edges) * 0.7)  # keep 70% of the graph, hold out 30% as growth labels
# edges_0 exist only for computing common neighbors
edges_0, ETEs = total_edges[:l], total_edges[l:]
"""
Use all the edges present in the network as "YES", and randomly choose an equal number
of non-existing edges as "NO".
"""
# Randomly choose an equally sized fold of NO edges
nonETEs = random.sample(list(nx.non_edges(G)), len(ETEs))
total_edges = ETEs + nonETEs  # total pairs under consideration
xe, nxe = len(ETEs), len(nonETEs)
methods = ['CN', 'JC', 'AA', 'RA', 'PA']
##
## NOTE: haven't checked that connectivity is maintained (a minimal check is sketched below).
# len(nx.bfs_tree(G, nodelist[0]).edges())
#
# extract matrix in order, and convert to dense representation
# A = nx.adjacency_matrix(G, nodelist=nodelist).todense()
N = G.number_of_nodes()
# keep the node order for indexing
nodelist = list(G.nodes())
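# The NOTE above flags that connectivity is not verified after holding out the growth edges.
# A minimal sketch of such a check, under the assumption that "maintained" means the graph
# induced by the kept 70% of edges should stay connected over the full node set.
import networkx as nx

def holdout_keeps_connected(nodes, kept_edges):
    """Return True if the graph built from kept_edges over the full node set is connected."""
    H = nx.Graph()
    H.add_nodes_from(nodes)
    H.add_edges_from(kept_edges)
    return nx.is_connected(H)

# e.g. holdout_keeps_connected(G.nodes(), edges_0)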
def split_into_train_test_sets(self, ratio, max_trial_limit=10000): test_set_size = int(ratio * self.number_of_edges) train_set_size = self.number_of_edges - test_set_size # Generate the positive test edges test_pos_samples = [] residual_g = self.g.copy() num_of_ccs = nx.number_connected_components(residual_g) if num_of_ccs != 1: raise ValueError( "The graph contains more than one connected component!") num_of_pos_samples = 0 edges = list(residual_g.edges()) perm = np.arange(len(edges)) np.random.shuffle(perm) edges = [edges[inx] for inx in perm] for i in range(len(edges)): # Remove the chosen edge chosen_edge = edges[i] residual_g.remove_edge(chosen_edge[0], chosen_edge[1]) if chosen_edge[1] in nx.connected._plain_bfs( residual_g, chosen_edge[0]): num_of_pos_samples += 1 test_pos_samples.append(chosen_edge) print("\r{0} tp edges found out of {1}".format( num_of_pos_samples, test_set_size)), else: residual_g.add_edge(chosen_edge[0], chosen_edge[1]) if num_of_pos_samples == test_set_size: break if num_of_pos_samples != test_set_size: raise ValueError("Not pos edges found!") # Generate the negative samples test_neg_samples = [] non_edges = list(nx.non_edges(self.g)) perm = np.arange(len(non_edges)) np.random.shuffle(perm) non_edges = [non_edges[inx] for inx in perm] chosen_non_edge_inx = np.random.choice(perm, size=test_set_size, replace=False) test_neg_samples = [non_edges[perm[p]] for p in chosen_non_edge_inx] """ while num_of_removed_edges < test_set_size: # Randomly choose an edge index pos_inx = np.arange(residual_g.number_of_edges()) np.random.shuffle(pos_inx) edge_inx = np.random.choice(a=pos_inx) # Remove the chosen edge chosen_edge = list(residual_g.edges())[edge_inx] residual_g.remove_edge(chosen_edge[0], chosen_edge[1]) #reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0]) if chosen_edge[1] in nx.connected._plain_bfs(residual_g, chosen_edge[0]): num_of_removed_edges += 1 test_pos_samples.append(chosen_edge) trial_counter = 0 else: residual_g.add_edge(chosen_edge[0], chosen_edge[1]) trial_counter += 1 if trial_counter == max_trial_limit: raise ValueError("In {} trial, any possible edge for removing could not be found!") print("\r{0} tp edges found out of {1}".format(num_of_removed_edges, test_set_size)), # Generate the negative samples test_neg_samples = [] num_of_neg_samples = 0 while num_of_neg_samples < test_set_size: pos_inx = np.arange(self.g.number_of_nodes()) np.random.shuffle(pos_inx) # Self-loops are allowed u, v = np.random.choice(a=pos_inx, size=2) candiate_edge = (unicode(u), unicode(v)) if not self.g.has_edge(candiate_edge[0], candiate_edge[1]) and candiate_edge not in self.g.edges(): test_neg_samples.append(candiate_edge) num_of_neg_samples += 1 print("\r{0} fn edges found out of {1}".format(num_of_neg_samples, test_set_size)), """ return residual_g, test_pos_samples, test_neg_samples
import math

import networkx as nx


def MI(graph_file):
    G = nx.read_edgelist(graph_file)
    node_num = nx.number_of_nodes(G)
    edge_num = nx.number_of_edges(G)
    print(node_num)
    print(edge_num)
    sim_dict = {}  # dictionary storing the similarity scores
    I_pConnect_dict = {}
    pDisConnect = 1
    edges = nx.edges(G)
    ebunch = nx.non_edges(G)
    for u, v in edges:
        uDegree = nx.degree(G, u)
        vDegree = nx.degree(G, v)
        for i in range(1, vDegree + 1):
            pDisConnect = pDisConnect * (((edge_num - uDegree) - i + 1) / (edge_num - i + 1))
        pConnect = 1 - pDisConnect
        if pConnect == 0:
            I_pConnect = -math.log2(0.0001)
        else:
            I_pConnect = -math.log2(pConnect)
        I_pConnect_dict[(u, v)] = I_pConnect
        I_pConnect_dict[(v, u)] = I_pConnect
        pDisConnect = 1
    for m, n in ebunch:
        mDegree = nx.degree(G, m)
        nDegree = nx.degree(G, n)
        for i in range(1, nDegree + 1):
            pDisConnect = pDisConnect * (((edge_num - mDegree) - i + 1) / (edge_num - i + 1))
        pConnect = 1 - pDisConnect
        if pConnect == 0:
            I_pConnect = -math.log2(0.0001)
        else:
            I_pConnect = -math.log2(pConnect)
        I_pConnect_dict[(m, n)] = I_pConnect
        I_pConnect_dict[(n, m)] = I_pConnect
        pDisConnect = 1
    ebunchs = nx.non_edges(G)
    i = 0
    for u, v in ebunchs:
        pMutual_Information = 0
        I_pConnect = I_pConnect_dict[(u, v)]
        for z in nx.common_neighbors(G, u, v):
            neighbor_list = list(nx.neighbors(G, z))  # materialize: nx.neighbors returns an iterator
            neighbor_num = len(neighbor_list)
            for m in range(len(neighbor_list)):
                for n in range(m + 1, len(neighbor_list)):
                    I_ppConnect = I_pConnect_dict[(neighbor_list[m], neighbor_list[n])]
                    if nx.clustering(G, z) == 0:
                        pMutual_Information = pMutual_Information + (
                            2 / (neighbor_num * (neighbor_num - 1))) * ((I_ppConnect) - (-math.log2(0.0001)))
                    else:
                        pMutual_Information = pMutual_Information + (
                            2 / (neighbor_num * (neighbor_num - 1))) * (
                            (I_ppConnect) - (-math.log2(nx.clustering(G, z))))
        sim_dict[(u, v)] = -(I_pConnect - pMutual_Information)
        i = i + 1
        # print(i)
        print(str(u) + "," + str(v))
        print(sim_dict[(u, v)])
    return sim_dict
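# Hypothetical usage sketch for MI(): the edge-list path is a placeholder written out here
# from a toy graph, not a file that ships with this code.
import networkx as nx

nx.write_edgelist(nx.karate_club_graph(), "toy.edgelist", data=False)
mi_scores = MI("toy.edgelist")
ranked = sorted(mi_scores.items(), key=lambda kv: kv[1], reverse=True)
print(ranked[:10])  # the ten non-edges the MI score considers most likely to appear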
def new_connections_predictions(): import operator from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, recall_score, auc, roc_curve, precision_score from sklearn.ensemble import GradientBoostingClassifier df = future_connections common_neigh = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1])))) for e in nx.non_edges(G)] common_neigh = sorted(common_neigh, key=operator.itemgetter(0)) jaccard_coef = list(nx.jaccard_coefficient(G)) jaccard_coef = sorted(jaccard_coef, key=operator.itemgetter(0)) resource_alloc = list(nx.resource_allocation_index(G)) resource_alloc = sorted(resource_alloc, key=operator.itemgetter(0)) pref_attach = list(nx.preferential_attachment(G)) pref_attach = sorted(pref_attach, key=operator.itemgetter(0)) df["edge"] = df.index df = df.sort_values(by="edge") df = df.drop( ["edge"], axis=1 ) #do not understand why these columns were showing up without them being assigned df["common neighbors"] = list(common_neigh) df["common neighbors"] = df["common neighbors"].apply(lambda x: x[2]) df["jaccard"] = [x[2] for x in jaccard_coef] df["resource allocation"] = [x[2] for x in resource_alloc] df["preferential attachment"] = [x[2] for x in pref_attach] #Separate the data with future connection reported from the rows where no data is reported conn_data = df.dropna() no_conn_data = df[df["Future Connection"].isnull()] x = conn_data.drop(["Future Connection"], axis=1) y = conn_data["Future Connection"] test_df = no_conn_data.drop(["Future Connection"], axis=1) #print (df) #Training the gradient boosting model X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=0) gbm = GradientBoostingClassifier(random_state=0, learning_rate=0.1, n_estimators=45, max_depth=5).fit(X_train, y_train) y_score_eval = gbm.decision_function(X_test) y_proba_eval = gbm.predict_proba(X_test) y_score = gbm.decision_function(test_df) y_proba = gbm.predict_proba(test_df) fpr, tpr, _ = roc_curve(y_test, y_score_eval) roc_auc = auc(fpr, tpr) prob_edge = pd.Series(y_proba[:, 1]) prob_edge.index = test_df.index return prob_edge
def _apply_prediction(G, func, ebunch=None): if ebunch is None: ebunch = nx.non_edges(G) return ((u, v, func(u, v)) for u, v in ebunch)
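# A minimal sketch of how a scorer could be built on top of _apply_prediction, mirroring the
# pattern used for the library's own link-prediction functions. The name `jaccard_like` is
# illustrative only, not part of the library.
import networkx as nx

def jaccard_like(G, ebunch=None):
    def predict(u, v):
        union_size = len(set(G[u]) | set(G[v]))
        return 0.0 if union_size == 0 else len(list(nx.common_neighbors(G, u, v))) / union_size
    return _apply_prediction(G, predict, ebunch)

# e.g. list(jaccard_like(nx.path_graph(5)))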
import math

import networkx as nx

# `pair` and `wc2.weight_clustering2` are project helpers defined elsewhere: `pair(a, b)` is
# assumed to return the two values in a canonical order, and `weight_clustering2` is a
# weighted clustering coefficient.


def WMI(G):
    # G = nx.read_edgelist(graph_file)
    edges = nx.edges(G)
    nodes = nx.nodes(G)
    beta = -math.log2(0.0001)

    # Sum of the weights of all edges in the graph
    all_weight = 0
    for u, v in edges:
        all_weight = all_weight + G.get_edge_data(u, v)['weight']
    print(all_weight)

    # Mutual information of a link between nodes of different "node weights"
    nodes_Weight_dict = {}
    weight_list = []
    # Compute each node's "node weight" (the sum of its incident edge weights)
    for v in nodes:
        node_weight = 0
        v_neighbors = nx.neighbors(G, v)
        for u in v_neighbors:
            node_weight += G.get_edge_data(u, v)['weight']
        weight_list.append(node_weight)
        nodes_Weight_dict[v] = node_weight
    # print(weight_list)
    # print(nodes_Weight_dict)
    distinct_weight_list = list(set(weight_list))
    # print(distinct_weight_list)
    size = len(distinct_weight_list)
    # print(size)

    self_Connect_dict = {}
    # Mutual information of a link between nodes with the given pair of node weights
    for x in range(size):
        w_x = distinct_weight_list[x]
        for y in range(x, size):
            w_y = distinct_weight_list[y]
            p0 = 1
            (w_n, w_m) = pair(w_x, w_y)
            a = all_weight + 1
            b = all_weight - w_m + 1
            for i in range(1, int(w_n + 1)):
                p0 *= (b - i) / (a - i)
            if p0 == 1:
                self_Connect_dict[(w_n, w_m)] = beta
                # self_Connect_dict[(w_m, w_n)] = beta
            else:
                self_Connect_dict[(w_n, w_m)] = -math.log2(1 - p0)
                # self_Connect_dict[(w_m, w_n)] = -math.log2(1 - p0)
            # print(str(w_n) + "," + str(w_m))
            # print(self_Connect_dict[(w_n, w_m)])
    # print(self_Connect_dict)

    self_Conditional_dict = {}
    for z in nodes:
        w_z = nodes_Weight_dict[z]
        if w_z > 1:
            alpha = 2 / (w_z * (w_z - 1))
            cc_z = wc2.weight_clustering2(G, z)  # changed to the weighted clustering coefficient
            if cc_z == 0:
                log_c = beta
            else:
                log_c = -math.log2(cc_z)
            # end if
            s = 0
            neighbor_list = list(nx.neighbors(G, z))  # materialize: nx.neighbors returns an iterator
            size = len(neighbor_list)
            for i in range(size):
                m = neighbor_list[i]
                for j in range(i + 1, size):
                    n = neighbor_list[j]
                    (k_x, k_y) = pair(nodes_Weight_dict[m], nodes_Weight_dict[n])
                    if i != j:
                        s += (self_Connect_dict[(k_x, k_y)] - log_c)
            self_Conditional_dict[z] = alpha * s
    # print(self_Conditional_dict)

    sim_dict = {}  # dictionary storing the similarity scores
    ebunch = nx.non_edges(G)
    for x, y in ebunch:
        s = 0
        (k_x, k_y) = pair(nodes_Weight_dict[x], nodes_Weight_dict[y])
        for z in nx.common_neighbors(G, x, y):
            s += self_Conditional_dict[z]
        sim_dict[(x, y)] = s - self_Connect_dict[(k_x, k_y)]
    # end for
    print(sim_dict)
    return sim_dict
def generate_pos_neg_links(self):
    """
    Select random existing edges in the graph to be positive links,
    and random non-edges to be negative links.

    Modify the graph by removing the positive links.
    """
    # Select n edges at random (positive samples)
    n_edges = self.G.number_of_edges()
    n_nodes = self.G.number_of_nodes()
    npos = int(self.prop_pos * n_edges)
    nneg = int(self.prop_neg * n_edges)

    if not nx.is_connected(self.G):
        raise RuntimeError("Input graph is not connected")

    n_neighbors = [len(list(self.G.neighbors(v))) for v in list(self.G.nodes())]
    n_non_edges = n_nodes - 1 - np.array(n_neighbors)

    non_edges = [e for e in nx.non_edges(self.G)]
    print("Finding %d of %d non-edges" % (nneg, len(non_edges)))

    # Select m pairs of non-edges (negative samples)
    rnd_inx = self._rnd.choice(len(non_edges), nneg, replace=False)
    neg_edge_list = [non_edges[ii] for ii in rnd_inx]

    if len(neg_edge_list) < nneg:
        raise RuntimeWarning(
            "Only %d negative edges found" % (len(neg_edge_list))
        )

    print("Finding %d positive edges of %d total edges" % (npos, n_edges))

    # Find positive edges, and remove them.
    edges = list(self.G.edges())
    pos_edge_list = []
    n_count = 0
    n_ignored_count = 0
    rnd_inx = self._rnd.permutation(n_edges)
    for eii in rnd_inx:
        edge = edges[eii]

        # Remove edge from graph
        data = self.G[edge[0]][edge[1]]
        self.G.remove_edge(*edge)

        # Check if graph is still connected
        # TODO: We shouldn't be using a private function for bfs
        reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0])
        if edge[1] not in reachable_from_v1:
            self.G.add_edge(*edge, **data)
            n_ignored_count += 1
        else:
            pos_edge_list.append(edge)
            print("Found: %d " % (n_count), end="\r")
            n_count += 1

        # Exit if we've found npos edges or we have gone through the whole list
        if n_count >= npos:
            break

    if len(pos_edge_list) < npos:
        raise RuntimeWarning("Only %d positive edges found." % (n_count))

    self._pos_edge_list = pos_edge_list
    self._neg_edge_list = neg_edge_list
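# The TODO above points out that nx.connected._plain_bfs is a private helper. A hedged sketch
# of the same reachability test using only public API; `u` and `v` stand for the endpoints of
# the edge that has just been removed.
import networkx as nx

def endpoints_still_connected(G, u, v):
    """True if v is still reachable from u after the edge (u, v) has been removed."""
    return nx.has_path(G, u, v)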
def to_dataframe(self, pairs=False, sampling=None, label_graph=None, cheat=False, allow_hashtags=False, min_katz=0, verbose=True, katz=None): """ Get a dataframe for pairs of nodes in the graph :param pairs: True to consider all pairs, False to consider only non-edges, or a list of tuples to use as pairs :param sampling: Amount to sample (default=None, do not sample) :param label_graph: Graph to use to generate the true labels. Usually the next in the time series. :param cheat: Do not sample when label_graph has an edge for a given pair (default=False) :param allow_hashtags: Also predict links between users and hashtags (default=False) :param min_katz: Use a katz threshold to reduce numbers of pairs :param verbose: Display updates (default=True) :param katz: Precomputed katz centrality dictionary (default=None, compute katz before generating dataframe) :return: A pandas dataframe containing pairs and the various calculated metrics """ if not sampling: sampling = 2 u = [] v = [] has_links = [] jac_co = [] adam = [] att = [] nbrs = [] spl = [] katz_centralities = [] count = 0 labels = [] katzes = [] embeddings = [] if self.embeddings: for _ in self.emb_cols: embeddings.append([]) # degree = nx.degree(graph) if type(pairs) is bool and pairs: iter_set = self.all_pairs() elif type(pairs) is bool and not pairs: iter_set = nx.non_edges(self.nx_graph) else: iter_set = pairs if verbose and not katz: print("Precomputing katzes....") if not katz: katz = nx.katz_centrality(self.nx_graph, alpha=.005, beta=.1, tol=.00000001, max_iter=5000) elim = 0 for n1, n2 in iter_set: if random.random() < sampling or (cheat and label_graph and label_graph.nx_graph.has_edge(n1, n2)): count += 1 if verbose: if count % 10000 == 0: print("%d checked... " % count) # k_s = np.mean((katz[n1], katz[n2])) #if k_s < min_katz: # elim += 1 # continue u.append(n1) v.append(n2) # (jaccard, adamic, n_nbrs, attachment) = self.get_unsupported(n1, n2) # jac_co.append(jaccard) # adam.append(adamic) # nbrs.append(n_nbrs) # att.append(attachment) # spl.append(self.get_sp(n1, n2)) # katz_centralities.append(np.mean((katz[n1], katz[n2]))) labels.append(label_graph.nx_graph.has_edge(n1, n2)) #if self.katz: # katzes.append(self.katz[n1][n2]) if self.embeddings: for i in range(0, len(self.emb_cols)): embeddings[i].append(np.mean((self.embeddings[n1][i], self.embeddings[n2][i]))) # embeddings[i].append((self.embeddings[n1][i] * self.embeddings[n2][i])) df = pd.DataFrame() df['u'] = u df['v'] = v # df['jac'] = jac_co # df['adam'] = adam # df['nbrs'] = nbrs # df['att'] = att # df['spl'] = spl # df['katz_centrality'] = katz_centralities # if self.katz: # df['katz'] = katzes if self.embeddings: for i, col in enumerate(self.emb_cols): df[col] = embeddings[i] if verbose: print("\t%d pairs checked and %d pairs in dataframe" % (count, df.shape[0])) df.sample(frac=1) return df, labels
# G.add_edge(3,5) # G.add_edge(3,4) # G.add_edge(3,6) # G.add_edge(4,6) G = nx.Graph() G.add_edge(1, 2) G.add_edge(1, 3) G.add_edge(1, 4) G.add_edge(2, 4) G.add_edge(2, 5) G.add_edge(5, 6) G.add_edge(5, 7) G.add_edge(6, 7) G.add_edge(6, 8) G.add_edge(7, 8) ebunch = nx.non_edges(G) sim_dict = {} for u, v in ebunch: s = len(list(nx.common_neighbors(G, u, v))) sim_dict[(u, v)] = s print(sim_dict[(u, v)]) #MI(G) # ============================================================================= # G = nx.read_edgelist('J:\\Python\\LinkPrediction\\Networks\\test\\test.edgelist') # nx.draw_networkx(G) # plt.show() # =============================================================================
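# Cross-check sketch: the same common-neighbor counts as the loop above, produced with a dict
# comprehension; the toy graph is rebuilt here so the snippet stands on its own.
import networkx as nx

H = nx.Graph([(1, 2), (1, 3), (1, 4), (2, 4), (2, 5), (5, 6), (5, 7), (6, 7), (6, 8), (7, 8)])
cn_scores = {(u, v): len(list(nx.common_neighbors(H, u, v))) for u, v in nx.non_edges(H)}
print(cn_scores)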