def new_func(*args, **kw):
    """Validate the leading ``(G, partition)`` arguments, then call ``func``.

    The first two positional arguments are taken to be the graph and the
    candidate partition, in that order.  All arguments are forwarded to
    ``func`` unchanged.

    Raises
    ------
    NetworkXError
        If ``is_partition`` rejects the first two positional arguments.
    """
    graph_and_partition = args[:2]
    if is_partition(*graph_and_partition):
        return func(*args, **kw)
    raise nx.NetworkXError(
        "`partition` is not a valid partition of the nodes of G"
    )
def _require_partition(G, partition):
    """Return ``(G, partition)`` unchanged when `partition` partitions `G`.

    The arguments are checked with ``is_partition``; on success they are
    handed back as a pair, otherwise an error is raised.

    NOTE(review): returning the validated pair suggests this helper is meant
    to be wrapped into a ``require_partition`` decorator for functions whose
    first two arguments are a graph and a partition of its nodes (in that
    order); that wrapper is not visible here -- confirm.

    Raises
    ------
    NetworkXError
        If `partition` is not a valid partition of the nodes of `G`.

    Examples
    --------
    >>> G = nx.complete_graph(5)
    >>> _, checked = _require_partition(G, [{0, 1}, {2, 3}, {4}])
    >>> checked
    [{0, 1}, {2, 3}, {4}]
    >>> _require_partition(G, [{0}, {2, 3}, {4}])
    Traceback (most recent call last):
      ...
    networkx.exception.NetworkXError: `partition` is not a valid partition of the nodes of G
    >>> _require_partition(G, [{0, 1}, {1, 2, 3}, {4}])
    Traceback (most recent call last):
      ...
    networkx.exception.NetworkXError: `partition` is not a valid partition of the nodes of G

    """
    # Guard clause: reject first, so the success path is the fall-through.
    if not is_partition(G, partition):
        raise nx.NetworkXError(
            "`partition` is not a valid partition of the nodes of G"
        )
    return G, partition
def kernighan_lin_bisection(G, partition=None, max_iter=100, weight='weight'):
    """Run Kernighan-Lin style swap passes on `G` and save a plot per pass.

    NOTE(review): this block arrived with its line structure collapsed, so
    the nesting below is a best-effort reconstruction -- confirm it against
    the original file before relying on it.

    Parameters
    ----------
    G : graph
    partition : pair of iterables, optional
        Initial split.  When None, a shuffled node list is cut at
        ``len(nodes) // div``.
    max_iter : int
        Upper bound on the number of passes.
    weight : key
        Passed through to ``_kernighan_lin_pass``.

    Returns
    -------
    (A, B) : pair of sets

    Raises
    ------
    ValueError
        If `partition` cannot be turned into two sets.
    NetworkXError
        If ``is_partition`` rejects the pair.

    Notes
    -----
    The body reads ``input_nodes``, ``aa``, ``g``, ``p`` and ``tStart``,
    none of which are assigned in this function -- presumably module-level
    globals; verify against the rest of the file.  Figures are written to a
    hard-coded absolute path.
    """
    # If no partition is provided, split the nodes randomly into a
    # balanced partition.
    for div in range(2, 8):
        if partition is None:
            nodes = list(G)
            random.shuffle(nodes)
            h = len(nodes) // div
            partition = (nodes[:h], nodes[h:])
        # Make a copy of the partition as a pair of sets.
        try:
            A, B = set(partition[0]), set(partition[1])
        except:
            raise ValueError('partition must be two sets')
        if not is_partition(G, (A, B)):
            raise nx.NetworkXError('partition invalid')
        for i in range(max_iter):
            # `gains` is a list of triples of the form (g, u, v) for each
            # node pair (u, v), where `g` is the gain of that node pair.
            gains = _kernighan_lin_pass(G, A, B, weight)
            csum = list(nx.utils.accumulate(g for g, u, v in gains))
            max_cgain = max(csum)
            # Stop as soon as no prefix of swaps yields a positive gain.
            if max_cgain <= 0:
                break
            # Get the node pairs up to the index of the maximum cumulative
            # gain, and collect each `u` into `anodes` and each `v` into
            # `bnodes`, for each pair `(u, v)`.
            index = csum.index(max_cgain)
            nodesets = islice(zip(*gains[:index + 1]), 1, 3)
            anodes, bnodes = (set(s) for s in nodesets)
            A |= bnodes
            A -= anodes
            B |= anodes
            B -= bnodes
            # Progress report: current pass / maximum number of passes.
            print(str(i) + '/' + str((max_iter)))
            # Mark every entry of `input_nodes` that is currently in A with
            # the value `aa`; all other entries stay 0.
            color = np.zeros(len(input_nodes))
            for q in range(len(np.array(list(A)))):
                color[np.where(input_nodes == np.array(list(A))[q])] = aa
            nx.draw_networkx(g, with_labels=True, node_color=color, pos=p, node_size=100, font_size=3, font_color='w')
            tEnd = time.time()
            plt.title('ratio:' + str(div) + ' epoch:' + str(i) + ' time:' + str(int(tEnd - tStart)))
            plt.savefig(
                'H:/master/code/python/networkScience/week10/pic_kl/{:03d}{}.png'
                .format(i, aa), format='png')
            # Clear the figure so the next pass starts from an empty canvas.
            plt.clf()
            # plt.show()
    return A, B
def test_generator():
    """Check the size and community structure of an LFR benchmark graph.

    Builds a 250-node graph with a fixed seed and asserts that the per-node
    ``community`` attributes, collected as frozensets, form a partition of
    the node set.
    """
    n = 250
    tau1 = 3
    tau2 = 1.5
    mu = 0.1
    G = LFR_benchmark_graph(n, tau1, tau2, mu, average_degree=5,
                            min_community=20, seed=10)
    assert_equal(len(G), 250)
    # Use ``G.nodes[v]``: the ``G.node`` accessor was removed in NetworkX 2.4,
    # while ``G.nodes`` works on every 2.x release.
    C = {frozenset(G.nodes[v]['community']) for v in G}
    assert_true(is_partition(G.nodes(), C))
def test_generator():
    """Check the size and community structure of an LFR benchmark graph.

    The graph is generated with a fixed seed; the test asserts that it has
    250 nodes and that the ``community`` node attributes partition them.
    """
    graph = LFR_benchmark_graph(
        250, 3, 1.5, 0.1, average_degree=5, min_community=20, seed=10
    )
    assert_equal(len(graph), 250)

    # Each node stores the community it belongs to; identical communities
    # collapse to a single frozenset in the set below.
    communities = set()
    for node in graph:
        communities.add(frozenset(graph.nodes[node]['community']))
    assert_true(is_partition(graph.nodes(), communities))
def kernighan_lin_bisection(G, partition=None, max_iter=10, weight="weight", seed=None):
    """Split the nodes of `G` into two sets with Kernighan-Lin style sweeps.

    Parameters
    ----------
    G : graph
    partition : pair of iterables, optional
        Initial split ``(A, B)``.  When None, the shuffled node order is cut
        into a first block of ``n // 2`` nodes and a second block with the
        remaining nodes.
    max_iter : int
        Maximum number of sweeps.
    weight : key
        Edge attribute used as the weight; edges lacking it count as 1.
    seed : object with a ``shuffle`` method
        Used to shuffle the node order on every call.
        NOTE(review): the default of None has no ``shuffle``; presumably a
        decorator that is not visible here normalises this argument --
        confirm.

    Returns
    -------
    (A, B) : pair of sets

    Raises
    ------
    NetworkXError
        If `partition` cannot be unpacked into two items or is rejected by
        ``is_partition``.
    """
    total = len(G)
    order = list(G)
    seed.shuffle(order)
    # Position of each node in the shuffled order; `side` and `edges` are
    # addressed by these positions.
    position = {node: k for k, node in enumerate(order)}

    if partition is not None:
        try:
            A, B = partition
        except (TypeError, ValueError) as e:
            raise nx.NetworkXError("partition must be two sets") from e
        if not is_partition(G, (A, B)):
            raise nx.NetworkXError("partition invalid")
        side = [0] * total
        for a in A:
            side[position[a]] = 1
    else:
        half = total // 2
        side = [0] * half + [1] * (total - half)

    # Pick the per-neighbour weight function once: multigraphs add up the
    # weights of all parallel edges.
    if G.is_multigraph():
        def edge_cost(data):
            return sum(attrs.get(weight, 1) for attrs in data.values())
    else:
        def edge_cost(data):
            return data.get(weight, 1)

    edges = [
        [(position[nbr], edge_cost(data)) for nbr, data in G[node].items()]
        for node in order
    ]

    for _ in range(max_iter):
        costs = list(_kernighan_lin_sweep(edges, side))
        best_cost, best_step, _swap = min(costs)
        if best_cost >= 0:
            break
        # Apply every swap of the sweep up to and including the best step.
        for _cost, _step, (u, v) in costs[: best_step + 1]:
            side[u] = 1
            side[v] = 0

    A, B = set(), set()
    for node, flag in zip(order, side):
        if flag == 0:
            A.add(node)
        elif flag == 1:
            B.add(node)
    return A, B
def modularity(G, communities, weight='weight'):
    r"""Compute the modularity of a partition of the nodes of `G`.

    Following [1]_, modularity is

    .. math::

        Q = \frac{1}{2m} \sum_{ij} \left( A_{ij} - \frac{k_ik_j}{2m}\right)
            \delta(c_i,c_j)

    with $m$ the number of edges, $A$ the adjacency matrix of `G`, $k_i$
    the degree of $i$, and $\delta(c_i, c_j)$ equal to 1 when $i$ and $j$
    share a community and 0 otherwise.

    Parameters
    ----------
    G : NetworkX Graph

    communities : list
        List of sets of nodes of `G` representing a partition of the nodes.

    weight : key
        Edge attribute used as the weight; edges lacking it count as 1.

    Returns
    -------
    Q : float
        The modularity of the partition.

    Raises
    ------
    NotAPartition
        If `communities` is not a partition of the nodes of `G`.

    Examples
    --------
    >>> G = nx.barbell_graph(3, 0)
    >>> nx.algorithms.community.modularity(G, [{0, 1, 2}, {3, 4, 5}])
    0.35714285714285704

    References
    ----------
    .. [1] M. E. J. Newman *Networks: An Introduction*, page 224.
       Oxford University Press, 2011.

    """
    if not is_partition(G, communities):
        raise NotAPartition(G, communities)

    is_multi = G.is_multigraph()
    is_directed = G.is_directed()
    total_weight = G.size(weight=weight)

    if not is_directed:
        # Undirected: one degree table serves both roles.
        k_out = dict(G.degree(weight=weight))
        k_in = k_out
        scale = 1 / (2 * total_weight)
    else:
        k_out = dict(G.out_degree(weight=weight))
        k_in = dict(G.in_degree(weight=weight))
        scale = 1 / total_weight

    def pair_term(u, v):
        # A missing edge contributes zero weight.
        try:
            if is_multi:
                w = sum(attrs.get(weight, 1) for attrs in G[u][v].values())
            else:
                w = G[u][v].get(weight, 1)
        except KeyError:
            w = 0
        # Double count self-loops if the graph is undirected.
        if not is_directed and u == v:
            w *= 2
        return w - k_in[u] * k_out[v] * scale

    pairs = ((u, v) for block in communities for u, v in product(block, repeat=2))
    Q = sum(pair_term(u, v) for u, v in pairs)
    return Q * scale
def main():
    """Cluster node embeddings per model and report modularity.

    Reads the node table from ``../data/cmty_nodes.csv`` (falling back to
    ``../data/nodes.csv``), loads one ``*_node_embeddings.npy`` file per
    model, assigns every node to a community, writes the assignments back to
    ``../data/cmty_nodes.csv`` and prints modularity, cluster count and
    elapsed time for each model.

    Raises
    ------
    FileNotFoundError
        If neither node file exists.
    """
    # Column name
    col_name = "ALGORITHM_cmty"

    # Load data
    if path.exists("../data/cmty_nodes.csv"):
        node_upload = "../data/cmty_nodes.csv"
    elif path.exists("../data/nodes.csv"):
        node_upload = "../data/nodes.csv"
    else:
        print("NO NODES TO UPLOAD!")
        # An explicit raise (instead of ``assert False``) still stops the
        # script when Python runs with -O, which strips assert statements and
        # would otherwise fall through to an unbound ``node_upload``.
        raise FileNotFoundError(
            "neither ../data/cmty_nodes.csv nor ../data/nodes.csv exists"
        )
    pd_nodes = pd.read_csv(node_upload, sep='\t', index_col=0)

    # Data in nice form
    headers = list(pd_nodes.columns)
    nodes = np.asarray(pd_nodes)

    # Aggregate file names
    model_names = ["GAT", "GCN", "GraphSage"]
    npy_names = ["../data/" + x + "_node_embeddings.npy" for x in model_names]
    model_cmtys = []
    model_time = []
    for model_name, npy_name in zip(model_names, npy_names):
        # Load embeddings
        embeddings = np.load(npy_name)
        print(embeddings.shape)

        # Generate node_mapping for clusters
        start = timeit.default_timer()
        ##########################################
        # CODE HERE to cluster embeddings and creating node_mapping
        #
        # node_mapping can either be dictionary or array
        #
        ##########################################
        node_mapping = np.zeros(len(nodes)).astype(int)
        ##########################################
        stop = timeit.default_timer()
        model_time.append(stop - start)

        # Convert node_mapping to cmtys and node_to_cmty array
        #num_cmtys = len(set(node_mapping.values()))
        num_cmtys = len(set(node_mapping))
        cmtys = [[] for _ in range(num_cmtys)]
        node_to_cmty = np.zeros(len(node_mapping)).astype(int)
        # Index by position (not by iteration) so that ``node_mapping`` may
        # be either an array or a dict keyed by node id.
        for j in range(len(node_to_cmty)):
            node_to_cmty[j] = node_mapping[j]
            cmtys[node_mapping[j]].append(j)
        model_cmtys.append(cmtys)

        # Add communities to nodes
        pd_nodes[model_name + "_" + col_name] = node_to_cmty
        pd_nodes.to_csv("../data/cmty_nodes.csv", sep='\t')

    print("Creating Graph")
    # Load social network accordingly
    edges = pd.read_csv("../data/edges.csv", sep='\t', index_col=0)
    edges = np.asarray(edges).astype(int)
    G = nx.Graph()
    G.add_nodes_from(range(nodes.shape[0]))
    G.add_edges_from(list(map(tuple, edges)))

    print("Calculating modularity")
    for model_name, cmtys, elapsed in zip(model_names, model_cmtys, model_time):
        assert (is_partition(G, cmtys))
        modul = modularity(G, cmtys)
        print("Results from " + model_name + " ALGORITHM:")
        print("Modularity:", modul)
        print("Number of clusters:", len(cmtys))
        print("Time elapsed:", elapsed)
def kernighan_lin_bisection(G, partition=None, max_iter=10, weight='weight', seed=None):
    """Partition a graph into two blocks using the Kernighan–Lin algorithm.

    This algorithm partitions a network into two sets by iteratively
    swapping pairs of nodes to reduce the edge cut between the two sets.  The
    pairs are chosen according to a modified form of Kernighan-Lin, which
    moves nodes individually, alternating between sides to keep the bisection
    balanced.

    Parameters
    ----------
    G : graph

    partition : tuple
        Pair of iterables containing an initial partition. If not
        specified, a random balanced partition is used.

    max_iter : int
        Maximum number of times to attempt swaps to find an
        improvement before giving up.

    weight : key
        Edge data key to use as weight. If None, the weights are all
        set to one.

    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.
        The node order is shuffled with it on every call; when `partition`
        is None that order also defines the initial split.
        NOTE(review): the body calls ``seed.shuffle`` directly, so the value
        is presumably normalised by a decorator not visible here -- confirm.

    Returns
    -------
    partition : tuple
        A pair of sets of nodes representing the bipartition.

    Raises
    ------
    NetworkXError
        If partition is not a valid partition of the nodes of the graph.

    References
    ----------
    .. [1] Kernighan, B. W.; Lin, Shen (1970).
       "An efficient heuristic procedure for partitioning graphs."
       *Bell Systems Technical Journal* 49: 291--307.

    """
    n = len(G)
    labels = list(G)
    seed.shuffle(labels)
    # Position of every node in the shuffled order.  `side` and `edges` are
    # indexed by these positions, never by the node labels themselves.
    index = {v: i for i, v in enumerate(labels)}

    if partition is None:
        side = [0] * (n // 2) + [1] * ((n + 1) // 2)
    else:
        try:
            A, B = partition
        except (TypeError, ValueError) as err:
            raise nx.NetworkXError('partition must be two sets') from err
        if not is_partition(G, (A, B)):
            raise nx.NetworkXError('partition invalid')
        side = [0] * n
        for a in A:
            # Translate the node label to its shuffled position; indexing
            # `side` with the label itself marks the wrong node (or fails
            # outright for non-integer labels).
            side[index[a]] = 1

    if G.is_multigraph():
        # Parallel edges are merged by summing their weights.
        edges = [[(index[u], sum(e.get(weight, 1) for e in d.values()))
                  for u, d in G[v].items()]
                 for v in labels]
    else:
        edges = [[(index[u], e.get(weight, 1)) for u, e in G[v].items()]
                 for v in labels]

    for _ in range(max_iter):
        costs = list(_kernighan_lin_sweep(edges, side))
        min_cost, min_i, _ = min(costs)
        # No prefix of the sweep lowers the cut: the split is locally optimal.
        if min_cost >= 0:
            break

        for _, _, (u, v) in costs[:min_i + 1]:
            side[u] = 1
            side[v] = 0

    A = {u for u, s in zip(labels, side) if s == 0}
    B = {u for u, s in zip(labels, side) if s == 1}
    return A, B
def new_func(*args, **kw):
    """Call ``func`` after checking that ``args[:2]`` is a valid partition.

    The first two positional arguments are assumed to be ``(G, partition)``.

    Raises
    ------
    NetworkXError
        If ``is_partition`` rejects those two arguments.
    """
    partition_ok = is_partition(*args[:2])
    if not partition_ok:
        message = '`partition` is not a valid partition of the nodes of G'
        raise nx.NetworkXError(message)
    return func(*args, **kw)
def modularity(G, communities, weight='weight'):
    r"""Return the modularity of the partition `communities` of `G`.

    Modularity is defined in [1]_ as

    .. math::

        Q = \frac{1}{2m} \sum_{ij} \left( A_{ij} - \frac{k_ik_j}{2m}\right)
            \delta(c_i,c_j)

    where *m* is the number of edges, *A* is the adjacency matrix of
    `G`, :math:`k_i` is the degree of *i* and :math:`\delta(c_i, c_j)`
    is 1 if *i* and *j* are in the same community and 0 otherwise.

    Parameters
    ----------
    G : NetworkX Graph

    communities : list
        List of sets of nodes of `G` representing a partition of the
        nodes.

    weight : key
        Edge attribute holding the weight; a missing attribute counts as 1.

    Returns
    -------
    Q : float
        The modularity of the partition.

    Raises
    ------
    NotAPartition
        If `communities` is not a partition of the nodes of `G`.

    Examples
    --------
    >>> G = nx.barbell_graph(3, 0)
    >>> nx.algorithms.community.modularity(G, [{0, 1, 2}, {3, 4, 5}])
    0.35714285714285704

    References
    ----------
    .. [1] M. E. J. Newman *Networks: An Introduction*, page 224.
       Oxford University Press, 2011.

    """
    if not is_partition(G, communities):
        raise NotAPartition(G, communities)

    parallel_edges = G.is_multigraph()
    oriented = G.is_directed()
    size = G.size(weight=weight)

    if oriented:
        deg_out = dict(G.out_degree(weight=weight))
        deg_in = dict(G.in_degree(weight=weight))
        factor = 1 / size
    else:
        # Undirected graphs use the same degree table for both endpoints.
        deg_out = deg_in = dict(G.degree(weight=weight))
        factor = 1 / (2 * size)

    def contribution(u, v):
        """Observed weight of (u, v) minus its expected weight."""
        try:
            if parallel_edges:
                w = sum(data.get(weight, 1) for data in G[u][v].values())
            else:
                w = G[u][v].get(weight, 1)
        except KeyError:
            # No edge between u and v.
            w = 0
        # Double count self-loops if the graph is undirected.
        if u == v and not oriented:
            w *= 2
        return w - deg_in[u] * deg_out[v] * factor

    Q = sum(
        contribution(u, v)
        for members in communities
        for u, v in product(members, repeat=2)
    )
    return Q * factor
def kernighan_lin_bisection(G, partition=None, max_iter=10, weight='weight', seed=None):
    """Partition a graph into two blocks using the Kernighan–Lin algorithm.

    This algorithm partitions a network into two sets by iteratively
    swapping pairs of nodes to reduce the edge cut between the two sets.

    Parameters
    ----------
    G : graph

    partition : tuple
        Pair of iterables containing an initial partition. If not
        specified, a random balanced partition is used.

    max_iter : int
        Maximum number of times to attempt swaps to find an
        improvement before giving up.

    weight : key
        Edge data key to use as weight. If None, the weights are all
        set to one.

    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.
        Only used if partition is None.
        NOTE(review): the body calls ``seed.shuffle`` directly, so the value
        is presumably normalised by a decorator not visible here -- confirm.

    Returns
    -------
    partition : tuple
        A pair of sets of nodes representing the bipartition.

    Raises
    ------
    ValueError
        If `partition` cannot be converted into a pair of sets.
    NetworkXError
        If partition is not a valid partition of the nodes of the graph.

    References
    ----------
    .. [1] Kernighan, B. W.; Lin, Shen (1970).
       "An efficient heuristic procedure for partitioning graphs."
       *Bell Systems Technical Journal* 49: 291--307.

    """
    # If no partition is provided, split the nodes randomly into a
    # balanced partition.
    if partition is None:
        nodes = list(G)
        seed.shuffle(nodes)
        h = len(nodes) // 2
        partition = (nodes[:h], nodes[h:])
    # Make a copy of the partition as a pair of sets.
    try:
        A, B = set(partition[0]), set(partition[1])
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are never turned into ValueError.
        raise ValueError('partition must be two sets')
    if not is_partition(G, (A, B)):
        raise nx.NetworkXError('partition invalid')
    for i in range(max_iter):
        # `gains` is a list of triples of the form (g, u, v) for each
        # node pair (u, v), where `g` is the gain of that node pair.
        gains = _kernighan_lin_pass(G, A, B, weight)
        # Nothing to swap (e.g. an empty side): ``max`` below would fail.
        if not gains:
            break
        csum = list(nx.utils.accumulate(g for g, u, v in gains))
        max_cgain = max(csum)
        if max_cgain <= 0:
            break
        # Get the node pairs up to the index of the maximum cumulative
        # gain, and collect each `u` into `anodes` and each `v` into
        # `bnodes`, for each pair `(u, v)`.
        index = csum.index(max_cgain)
        nodesets = islice(zip(*gains[:index + 1]), 1, 3)
        anodes, bnodes = (set(s) for s in nodesets)
        A |= bnodes
        A -= anodes
        B |= anodes
        B -= bnodes
    return A, B
def modularity(G, communities, weight="weight", resolution=1):
    r"""Compute the modularity of a partition of the nodes of `G`.

    Following [1]_, modularity is

    .. math::
        Q = \frac{1}{2m} \sum_{ij} \left( A_{ij} - \gamma\frac{k_ik_j}{2m}\right)
            \delta(c_i,c_j)

    with $m$ the number of edges, $A$ the adjacency matrix of `G`, $k_i$
    the degree of $i$, $\gamma$ the resolution parameter, and
    $\delta(c_i, c_j)$ equal to 1 when $i$ and $j$ share a community and 0
    otherwise.

    According to [2]_ (and verified by some algebra) this reduces to

    .. math::
       Q = \sum_{c=1}^{n}
       \left[ \frac{L_c}{m} - \gamma\left( \frac{k_c}{2m} \right) ^2 \right]

    where the sum runs over all communities $c$, $m$ is the number of edges,
    $L_c$ is the number of intra-community links of community $c$, $k_c$ is
    the sum of degrees of the nodes in community $c$, and $\gamma$ is the
    resolution parameter.

    The resolution parameter sets an arbitrary tradeoff between intra-group
    edges and inter-group edges. More complex grouping patterns can be
    discovered by analyzing the same network with multiple values of gamma
    and then combining the results [3]_. That said, it is very common to
    simply use gamma=1. More on the choice of gamma is in [4]_.

    The second formula is the one actually used to compute the modularity.
    For directed graphs the second formula replaces $k_c$ with
    $k^{in}_c k^{out}_c$.

    Parameters
    ----------
    G : NetworkX Graph

    communities : list or iterable of set of nodes
        These node sets must represent a partition of G's nodes.

    weight : string or None, optional (default="weight")
        The edge attribute that holds the numerical value used
        as a weight. If None or an edge does not have that attribute,
        then that edge has weight 1.

    resolution : float (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    Returns
    -------
    Q : float
        The modularity of the partition.

    Raises
    ------
    NotAPartition
        If `communities` is not a partition of the nodes of `G`.

    Examples
    --------
    >>> import networkx.algorithms.community as nx_comm
    >>> G = nx.barbell_graph(3, 0)
    >>> nx_comm.modularity(G, [{0, 1, 2}, {3, 4, 5}])
    0.35714285714285715
    >>> nx_comm.modularity(G, nx_comm.label_propagation_communities(G))
    0.35714285714285715

    References
    ----------
    .. [1] M. E. J. Newman "Networks: An Introduction", page 224.
       Oxford University Press, 2011.
    .. [2] Clauset, Aaron, Mark EJ Newman, and Cristopher Moore.
       "Finding community structure in very large networks."
       Phys. Rev. E 70.6 (2004). <https://arxiv.org/abs/cond-mat/0408187>
    .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community Detection"
       Phys. Rev. E 74, 016110, 2006. https://doi.org/10.1103/PhysRevE.74.016110
    .. [4] M. E. J. Newman, "Equivalence between modularity optimization and
       maximum likelihood methods for community detection"
       Phys. Rev. E 94, 052315, 2016. https://doi.org/10.1103/PhysRevE.94.052315

    """
    # Materialise one-shot iterables: the partition is traversed twice
    # (validation, then summation).
    if not isinstance(communities, list):
        communities = list(communities)
    if not is_partition(G, communities):
        raise NotAPartition(G, communities)

    directed = G.is_directed()
    if not directed:
        k_out = k_in = dict(G.degree(weight=weight))
        deg_sum = sum(k_out.values())
        m = deg_sum / 2
        norm = 1 / deg_sum**2
    else:
        k_out = dict(G.out_degree(weight=weight))
        k_in = dict(G.in_degree(weight=weight))
        m = sum(k_out.values())
        norm = 1 / m**2

    def community_contribution(community):
        members = set(community)
        # Weight of the edges whose two endpoints both lie in the community.
        intra_weight = sum(
            wt
            for _, nbr, wt in G.edges(members, data=weight, default=1)
            if nbr in members
        )
        out_total = sum(k_out[node] for node in members)
        if directed:
            in_total = sum(k_in[node] for node in members)
        else:
            in_total = out_total
        return intra_weight / m - resolution * out_total * in_total * norm

    return sum(community_contribution(block) for block in communities)
def main(): # Load data if path.exists("../data/cmty_nodes.csv"): node_upload = "../data/cmty_nodes.csv" elif path.exists("../data/nodes.csv"): node_upload = "../data/nodes.csv" else: print("NO NODES TO UPLOAD!") assert(False) pd_nodes = pd.read_csv(node_upload, sep='\t', index_col=0) # Data in nice form headers = list(pd_nodes.columns) nodes = np.asarray(pd_nodes) # Load social network accordingly if path.exists("../data/youtube.graph"): FIn = snap.TFIn("../data/youtube.graph") social_network = snap.TNGraph.Load(FIn) else: edges = pd.read_csv("../data/edges.csv", sep='\t', index_col=0) edges = np.asarray(edges).astype(int) social_network = data2dag(edges, nodes.shape[0]) # Check for self edges for e in social_network.Edges(): if e.GetSrcNId() == e.GetDstNId(): print("Self Loop Found:",e.GetSrcNId()) # CNM Algorithm from snap.py print("Computing CNM") start = timeit.default_timer() CmtyV = snap.TCnComV() undirected = snap.ConvertGraph(snap.PUNGraph, social_network) snap.DelSelfEdges(undirected) the_modularity = snap.CommunityCNM(undirected, CmtyV) stop = timeit.default_timer() node_to_cmty = np.zeros(nodes.shape[0]).astype(int) cmty_sizes = np.zeros(len(CmtyV)) for i in range(len(CmtyV)): for node in CmtyV[i]: node_to_cmty[node] = i cmty_sizes[i] = len(CmtyV[i]) cmtys = [[node for node in cmty] for cmty in CmtyV] ''' m = 0 for i in range(len(CmtyV)): Nodes = snap.TIntV() for elem in CmtyV[i]: Nodes.Add(int(elem)) m += snap.GetModularity(social_network, Nodes, social_network.GetEdges()) ''' edges = pd.read_csv("../data/edges.csv", sep='\t', index_col=0) edges = np.asarray(edges).astype(int) G = nx.Graph() G.add_nodes_from(range(nodes.shape[0])) G.add_edges_from(list(map(tuple, edges))) # Add communities to nodes col_name = "cnm_cmty" pd_nodes[col_name] = node_to_cmty pd_nodes.to_csv("../data/cmty_nodes.csv", sep='\t') assert(is_partition(G, cmtys)) print("Calculating Modularity") modul = modularity(G, cmtys) print("Results from Clauset-Newman-Moore:") 
print("Modularity:",modul) print("Number of clusters:",len(CmtyV)) print("Time elapsed:",stop - start) # Fun category stuff to do ''' upload_col = headers.index('category') categories = set() for i in range(nodes.shape[0]): categories.add(nodes[i][upload_col]) idx_to_categories = list(categories) print("Number of categories:",len(idx_to_categories)) categories_to_idx = dict() for i in range(len(idx_to_categories)): categories_to_idx[idx_to_categories[i]] = i # Communities and categories cmty_category_count = np.zeros((len(CmtyV),len(idx_to_categories))) for i in range(nodes.shape[0]): cmty_category_count[int(node_to_cmty[i]),categories_to_idx[nodes[i][upload_col]]] += 1 cmty_category_count = cmty_category_count/cmty_sizes[:,np.newaxis] ''' # Create graphs per category ''' plt.figure() for i in range(len(idx_to_categories)): if (str(idx_to_categories[i]) != "nan") and (idx_to_categories[i] != " UNA "): plt.plot(sorted(cmty_category_count[:,i], reverse=True), label=idx_to_categories[i]) plt.title("Category Proportions in Clusters") plt.xlabel("Cluster") plt.ylabel("Proportion") plt.legend(bbox_to_anchor=(1.04,1), loc="upper left") plt.savefig("../figures/category_proportions_clusters.png", bbox_inches="tight") ''' ''' for i in range(cmty_category_count.shape[0]): top_category = np.argmax(cmty_category_count[i]) print("Community "+str(i)+": "+str(idx_to_categories[top_category])+",",cmty_category_count[i][top_category]) ''' '''
def kernighan_lin_bisection(G, partition=None, max_iter=10, weight='weight', seed=None):
    """Partition a graph into two blocks using the Kernighan–Lin algorithm.

    This algorithm partitions a network into two sets by iteratively
    swapping pairs of nodes to reduce the edge cut between the two sets.

    Parameters
    ----------
    G : graph

    partition : tuple
        Pair of iterables containing an initial partition. If not
        specified, a random balanced partition is used.

    max_iter : int
        Maximum number of times to attempt swaps to find an
        improvement before giving up.

    weight : key
        Edge data key to use as weight. If None, the weights are all
        set to one.

    seed : integer, random_state, or None (default)
        Indicator of random number generation state.
        See :ref:`Randomness<randomness>`.
        Only used if partition is None.
        NOTE(review): the body calls ``seed.shuffle`` directly, so the value
        is presumably normalised by a decorator not visible here -- confirm.

    Returns
    -------
    partition : tuple
        A pair of sets of nodes representing the bipartition.

    Raises
    ------
    ValueError
        If `partition` cannot be converted into a pair of sets.
    NetworkXError
        If partition is not a valid partition of the nodes of the graph.

    References
    ----------
    .. [1] Kernighan, B. W.; Lin, Shen (1970).
       "An efficient heuristic procedure for partitioning graphs."
       *Bell Systems Technical Journal* 49: 291--307.

    """
    # If no partition is provided, split the nodes randomly into a
    # balanced partition.
    if partition is None:
        nodes = list(G)
        seed.shuffle(nodes)
        h = len(nodes) // 2
        partition = (nodes[:h], nodes[h:])
    # Make a copy of the partition as a pair of sets.
    try:
        A, B = set(partition[0]), set(partition[1])
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit propagate; the original error is
        # chained for easier debugging.
        raise ValueError('partition must be two sets') from err
    if not is_partition(G, (A, B)):
        raise nx.NetworkXError('partition invalid')
    for _ in range(max_iter):
        # `gains` is a list of triples of the form (g, u, v) for each
        # node pair (u, v), where `g` is the gain of that node pair.
        gains = _kernighan_lin_pass(G, A, B, weight)
        # Nothing to swap (e.g. an empty side): ``max`` below would fail.
        if not gains:
            break
        csum = list(accumulate(g for g, u, v in gains))
        max_cgain = max(csum)
        if max_cgain <= 0:
            break
        # Get the node pairs up to the index of the maximum cumulative
        # gain, and collect each `u` into `anodes` and each `v` into
        # `bnodes`, for each pair `(u, v)`.
        index = csum.index(max_cgain)
        nodesets = islice(zip(*gains[:index + 1]), 1, 3)
        anodes, bnodes = (set(s) for s in nodesets)
        A |= bnodes
        A -= anodes
        B |= anodes
        B -= bnodes
    return A, B
def modularity(G, communities, weight="weight"):
    r"""Compute the modularity of a partition of the nodes of `G`.

    Following [1]_, modularity is

    .. math::

        Q = \frac{1}{2m} \sum_{ij} \left( A_{ij} - \frac{k_ik_j}{2m}\right)
            \delta(c_i,c_j)

    with $m$ the number of edges, $A$ the adjacency matrix of `G`, $k_i$
    the degree of $i$, and $\delta(c_i, c_j)$ equal to 1 when $i$ and $j$
    share a community and 0 otherwise.

    According to [2]_ (and verified by some algebra) this reduces to

    .. math::
       Q = \sum_{c=1}^{n}
       \left[ \frac{L_c}{m} - \left( \frac{k_c}{2m} \right) ^2 \right]

    where the sum runs over all communities $c$, $m$ is the number of edges,
    $L_c$ is the number of intra-community links of community $c$, and $k_c$
    is the sum of degrees of the nodes in community $c$.

    The second formula is the one actually used to compute the modularity.

    Parameters
    ----------
    G : NetworkX Graph

    communities : list or iterable of set of nodes
        These node sets must represent a partition of G's nodes.

    weight : string or None, optional (default="weight")
        The edge attribute that holds the numerical value used
        as a weight. If None or an edge does not have that attribute,
        then that edge has weight 1.

    Returns
    -------
    Q : float
        The modularity of the partition.

    Raises
    ------
    NotAPartition
        If `communities` is not a partition of the nodes of `G`.

    Examples
    --------
    >>> import networkx.algorithms.community as nx_comm
    >>> G = nx.barbell_graph(3, 0)
    >>> nx_comm.modularity(G, [{0, 1, 2}, {3, 4, 5}])
    0.35714285714285715
    >>> nx_comm.modularity(G, nx_comm.label_propagation_communities(G))
    0.35714285714285715

    References
    ----------
    .. [1] M. E. J. Newman *Networks: An Introduction*, page 224.
       Oxford University Press, 2011.
    .. [2] Clauset, Aaron, Mark EJ Newman, and Cristopher Moore.
       "Finding community structure in very large networks."
       Physical review E 70.6 (2004). <https://arxiv.org/abs/cond-mat/0408187>
    """
    # Materialise one-shot iterables: the partition is traversed twice
    # (validation, then summation).
    if not isinstance(communities, list):
        communities = list(communities)
    if not is_partition(G, communities):
        raise NotAPartition(G, communities)

    directed = G.is_directed()
    if not directed:
        k_out = k_in = dict(G.degree(weight=weight))
        deg_sum = sum(k_out.values())
        m = deg_sum / 2
        norm = 1 / deg_sum**2
    else:
        k_out = dict(G.out_degree(weight=weight))
        k_in = dict(G.in_degree(weight=weight))
        m = sum(k_out.values())
        norm = 1 / m**2

    def community_contribution(community):
        members = set(community)
        # Weight of the edges whose two endpoints both lie in the community.
        intra_weight = sum(
            wt
            for _, nbr, wt in G.edges(members, data=weight, default=1)
            if nbr in members
        )
        out_total = sum(k_out[node] for node in members)
        if directed:
            in_total = sum(k_in[node] for node in members)
        else:
            in_total = out_total
        return intra_weight / m - out_total * in_total * norm

    return sum(community_contribution(block) for block in communities)
def silhouettes(G, particion, silencioso=False):
    """
    Compute the silhouette value of every node of graph 'G' for the
    partition 'particion', given as a list of lists.

    The value is
        s(i) = (b(i) - a(i)) / max(a(i), b(i))
    where a(i) is the mean distance to all nodes in the same cluster as i
    and b(i) is the smallest of the mean distances to the clusters that i
    does not belong to.

    More precisely, let c_i be the cluster containing i and let
    Q = particion - c_i be the set of clusters that do not contain i.  Then
        b(i) = min{mean{d(i,j) : j in cluster} : cluster in Q}
    b(i) is also known as the "mean distance to the nearest cluster".

    Input
    -----
    G : nx.Graph
    particion : list
        list of lists. Each sublist is a cluster whose elements are the
        names of the nodes that belong to it.
    silencioso : bool
        when True, the explanatory messages printed before returning an
        empty list are suppressed.

    Output
    ------
    output : list
        list of lists. Each sublist is a cluster whose elements are the
        silhouette values of its nodes, in the same order as the input.
        An empty list is returned when a required distance is missing
        (disconnected graph) or when the partition has a single cluster.

    Raises
    ------
    NotAPartition
        If 'particion' is not a partition of the nodes of 'G'.
    """
    if not is_partition(G, particion):
        raise NotAPartition(G, particion)

    # ds[i][1][j] is the distance (shortest path length) between the i-th
    # node of the iteration and the node named j.
    # NOTE(review): assumes this iteration order matches G.nodes() -- confirm.
    ds = list(nx.all_pairs_shortest_path_length(G))

    def d(i, j):
        return ds[i][1][j]

    nc = len(particion)
    # Build a list of lists with the same lengths as 'particion'.  The empty
    # lists are placeholders that the silhouette values will replace.
    s_values = [[[] for _ in range(len(cluster))] for cluster in particion]
    # presumably maps node -> (cluster index, position inside the cluster);
    # verify against crear_nodos_to_indices.
    nodos_to_indices = crear_nodos_to_indices(particion)
    # Walk the nodes in the global order that the distance function 'd'
    # is indexed by.
    for i, nodo in enumerate(G.nodes()):
        m, n = nodos_to_indices[nodo]
        cluster_actual = particion[m]
        otros_clusters = (particion[l] for l in range(nc) if l != m)
        try:
            # NOTE(review): this mean includes d(i, i) = 0, i.e. it divides
            # by the cluster size rather than size - 1 -- confirm intended.
            a = np.average([d(i, j) for j in cluster_actual])
            # The other clusters never contain 'nodo', so no self-exclusion
            # is needed here.  The previous filter compared the node label
            # 'j' with the enumeration index 'i' and could silently drop an
            # unrelated node whose label happened to equal that index.
            dists_interclusters = [np.average([d(i, j) for j in cluster])
                                   for cluster in otros_clusters]
        except KeyError:
            # A missing distance means some node is unreachable from 'nodo'.
            if not silencioso:
                print(
                    'El grafo no es conexo y la distancia entre algunos clusters',
                    'es infinita por lo que no se puede realizar por completo el',
                    'análisis de silhouettes. Devolviendo lista vacía.')
            return []
        try:
            b = min(dists_interclusters)
        except ValueError:
            # No other cluster exists, so b(i) is undefined.
            if not silencioso:
                print(
                    'La partición tiene un solo elemento. Devolviendo lista vacía.'
                )
            return []
        s_values[m][n] = (b - a) / max(a, b)
    return s_values
def main(): # Load data if path.exists("../data/cmty_nodes.csv"): node_upload = "../data/cmty_nodes.csv" elif path.exists("../data/nodes.csv"): node_upload = "../data/cmty_nodes.csv" else: print("NO NODES TO UPLOAD!") assert(False) pd_nodes = pd.read_csv(node_upload, sep='\t', index_col=0) # Data in nice form headers = list(pd_nodes.columns) nodes = np.asarray(pd_nodes) # Load social network accordingly edges = pd.read_csv("../data/edges.csv", sep='\t', index_col=0) edges = np.asarray(edges).astype(int) G = nx.Graph() G.add_nodes_from(range(nodes.shape[0])) G.add_edges_from(list(map(tuple, edges))) #first compute the best partition print("Computing Louvain Algorithm") start = timeit.default_timer() partition = community.best_partition(G) stop = timeit.default_timer() # Computing modularity num_cmtys = len(set(partition.values())) num_edges = edges.shape[0] cmtys = [[] for _ in range(num_cmtys)] node_to_cmty = np.zeros(len(partition)).astype(int) for i in range(len(node_to_cmty)): node_to_cmty[i] = partition[i] cmtys[partition[i]].append(i) # Load social network accordingly if path.exists("../data/youtube.graph"): FIn = snap.TFIn("../data/youtube.graph") social_network = snap.TNGraph.Load(FIn) else: social_network = data2dag(edges, nodes.shape[0]) # Add communities to nodes col_name = "louvain_cmty" pd_nodes[col_name] = node_to_cmty pd_nodes.to_csv("../data/cmty_nodes.csv", sep='\t') ''' modularity = 0 for cmty in cmtys: Nodes = snap.TIntV() for elem in cmty: Nodes.Add(int(elem)) modularity += snap.GetModularity(social_network, Nodes, num_edges) ''' print("Calculating Modularity") assert(is_partition(G, cmtys)) modul = modularity(G, cmtys) print("Results from Louvain:") print("Modularity:",modul) print("Number of clusters:",num_cmtys) print("Time elapsed:",stop - start) #drawing '''