def extract_with_density(self, document: str) -> Sequence[Tuple[str, float]]:
    """Extract keywords from the k-core selected by a density elbow criterion.

    The graph-of-words of ``document`` is decomposed into k-cores. The cores
    are visited going DOWN from the highest k, the density of each core is
    computed, and ``elbow`` picks the position in that descending sequence
    whose core supplies the keywords.

    Args:
        document: Text handed to ``self.builder.compute_gow_from_document``.

    Returns:
        ``(token, core_number)`` pairs, one per node of the selected k-core,
        sorted by decreasing core number. An empty list is returned when the
        graph-of-words has no nodes.
    """
    # Building the graph-of-words
    gow = self.builder.compute_gow_from_document(document)
    if len(gow.nodes) == 0:
        return []

    graph = gow.to_graph()

    # Core number of every node; the weighted variant is used when the
    # builder is configured as weighted.
    if self.builder.weighted:
        kcore_number = core_number_weighted(graph)
    else:
        kcore_number = nx_core_number(graph)

    # Distinct core values, highest first (going DOWN the k-core).
    descending_ks = sorted(set(kcore_number.values()), reverse=True)

    # Position in the descending sequence -> k-core subgraph. A dict (rather
    # than a list) keeps the lookup of the elbow result strict: an index that
    # is not a valid position raises KeyError.
    core_at = {
        i: k_core(graph, k=k, core_number=kcore_number)
        for i, k in enumerate(descending_ks)
    }
    densities = [(i, density(g_k)) for i, g_k in core_at.items()]

    # Retrieving the most appropriate core via the elbow method
    best_graph = core_at[elbow(densities)]

    # Each node carries a token code under 'label'; the builder turns it back
    # into the token itself.
    keywords = [
        (self.builder.get_token_(best_graph.nodes[v]['label']), kcore_number[v])
        for v in best_graph.nodes
    ]
    return sorted(keywords, key=lambda p: p[1], reverse=True)
# [4] Component Size # Def : a connected component of a graph as a subgraph of a simple graph G in # which every vertex is connected to every other vertex in the subgraph by a path. # Def : the number of nodes in the connected component that contains i # Outputs an int. component_size = C.number_of_nodes() number_connected_components = connected_components.number_connected_components( G) # [5] Component Density # Def : the number of edges in the graph divided by the number of total possible # edges the graph might have. # Outputs a float. component_density = component_density.density(C) # [6] Geodesic Distance # Def : the number of edges on the shortest path between two vertices. We want # the average geodesic distance for the component containing i. # geodesic_distance.closeness_centrality(C) # Outputs a dictionary of nodes with closeness_centrality as the value. Reciprocal # of how defined in latex document. # We want : the average geodesic distance in the component where the initial infection # is fist introduced # Outputs a float representing the average geodesic distance in the component where # the initial infection is fist introduced # AGD was computed by summing all geodesic lengths and dividing by the number of geodesics. average_geodesic_dist = geodesic_distance.average_shortest_path_length(C)
# Build one row of summary statistics for every graph file in "output".
for fname in os.listdir("output"):
    print(fname)
    try:
        G = read_dot(os.path.join("output", fname))
        nx.draw(G)
    except Exception:
        # Best effort: a file that cannot be read or drawn is reported and
        # skipped. Catching Exception instead of using a bare except lets
        # KeyboardInterrupt and SystemExit still stop the run.
        print("cannot load graph")
        continue
    if G.number_of_nodes() == 0:
        print("Cannot read binary file")
        continue

    # Row layout: file name, node count, edge count, density, then the
    # values returned by properties_of_array for each centrality measure.
    data = [fname, G.number_of_nodes(), G.number_of_edges(), density(G)]

    deg_centrality = degree_centrality(G)
    data.extend(properties_of_array(deg_centrality))

    cln_centrality = closeness_centrality(G)
    data.extend(properties_of_array(cln_centrality))

    btn_centrality = betweenness_centrality(G)
    data.extend(properties_of_array(btn_centrality))

    # dict() accepts a mapping as well as an iterable of (source, paths)
    # pairs, so this works with either return form of shortest_path.
    st_path = dict(shortest_path(G))
    # For each source node: the number of entries in its paths mapping.
    deg = [len(val) for val in st_path.values()]
    d = np.array(deg)
    data.extend(
        [np.min(d), np.max(d), np.median(d), np.mean(d), np.std(d)])
def _plot_node_values(values, title, ylabel=None):
    # Plot one value per node (in iteration order), label the figure, show it.
    # The values are materialised into a list because a dict view is not a
    # sequence and should not be handed to plt.plot directly.
    plt.plot(list(values))
    plt.title(title)
    plt.xlabel("Nodo")
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.show()


def ver_medidas(G, attribute="crime"):
    """Print and plot a series of structural measures of the graph ``G``.

    Scalar measures are printed; per-node measures are drawn with matplotlib,
    one figure per measure, each shown with ``plt.show()``.

    Args:
        G: Graph passed unchanged to every measure function.
        attribute: Name of the node attribute used for the attribute
            assortativity coefficient. Defaults to ``"crime"``, the value
            that was previously hard-coded.

    Returns:
        None. The function only prints and plots.
    """
    print(function.info(G))

    # Minimum number of nodes that must be removed to disconnect G.
    print("Numero minimo de nodos que deben ser removidos para desconectar G :"+str(approximation.node_connectivity(G)))

    # Average clustering coefficient of G (approximation module).
    print("average clustering coefficient of G: "+str(approximation.average_clustering(G)))

    # Density of the graph.
    print("Densidad de G: "+str(function.density(G)))

    # Assortativity measures the similarity of connections in the graph with
    # respect to the node degree. Positive values of r indicate a correlation
    # between nodes of similar degree, while negative values indicate
    # correlations between nodes of different degree.
    print("degree assortativity:"+str(assortativity.degree_assortativity_coefficient(G)))

    # Assortativity measures the similarity of connections in the graph with
    # respect to the given node attribute.
    print("assortativity for node attributes: "+str(assortativity.attribute_assortativity_coefficient(G, attribute)))

    # Average neighbor degree of each node.
    _plot_node_values(
        assortativity.average_neighbor_degree(G).values(),
        "Grado promedio vecindad",
        ylabel="Grado",
    )

    # Degree centrality of each node.
    _plot_node_values(
        centrality.degree_centrality(G).values(),
        "Grado de centralidad",
        ylabel="Centralidad",
    )

    # Clustering coefficient of each node.
    _plot_node_values(cluster.clustering(G).values(), "coeficiente de agrupamiento")

    # Mean clustering coefficient.
    print("Coeficiente de agrupamiento de G:"+str(cluster.average_clustering(G)))

    # Center of the graph: the subgraph induced by the set of vertices of
    # minimum eccentricity. The eccentricity of v in V is the maximum
    # distance from v to any other vertex of G along shortest paths.
    print("Centro de G:"+ str(distance_measures.center(G)))

    # Diameter of the graph: the maximum eccentricity.
    print("Diametro de G:"+str(distance_measures.diameter(G)))

    # Eccentricity of each node: the maximum distance from the node to all
    # other nodes in G.
    _plot_node_values(
        distance_measures.eccentricity(G).values(),
        "Excentricidad de cada Nodo",
    )

    # Periphery: the set of nodes with eccentricity equal to the diameter.
    print("Periferia de G:")
    print(distance_measures.periphery(G))

    # Radius: the minimum eccentricity.
    print("Radio de G:"+str(distance_measures.radius(G)))

    # PageRank ranks the nodes of G based on the structure of the incoming
    # links. It was originally designed to rank web pages.
    _plot_node_values(
        link_analysis.pagerank_alg.pagerank(G).values(),
        "Puntaje de cada Nodo",
    )

    # Small-world coefficient sigma. A graph is commonly classified as
    # small-world if sigma > 1.
    print("Coeficiente de Small World: " + str(smallworld.sigma(G)))

    # The small-world coefficient omega ranges between -1 and 1. Values close
    # to 0 mean G features small-world characteristics. Values close to -1
    # mean G has a lattice shape, whereas values close to 1 mean G is a
    # random graph.
    print("Omega coeficiente: "+str(smallworld.omega(G)))
def compute_directed_graph_metrics(G):
    """Compute a flat dictionary of summary metrics for a directed graph.

    Args:
        G: The graph to describe. Its type must be exactly ``nx.DiGraph``;
            the check uses ``type(G) is ...``, so subclasses are rejected.

    Returns:
        dict: Edge count and density, reciprocity (``None`` when it is
        undefined), in-/out-degree statistics, the out/in degree correlation,
        the entries of ``compute_dyad_metrics``, the fraction of connected
        node pairs, clustering coefficients, and the entries of
        ``centralization_metrics``.

    Raises:
        AssertionError: If ``G`` is not exactly an ``nx.DiGraph``.
    """
    assert type(G) is nx.DiGraph

    n_edges = len(G.edges)

    # in & out degree stats
    in_degrees = np.array([n for _, n in G.in_degree()])
    out_degrees = np.array([n for _, n in G.out_degree()])
    # Only the counts from np.unique are kept: how many nodes share each
    # distinct degree value.
    in_degrees_k_freq = np.unique(in_degrees, return_counts=True)[1]
    out_degrees_k_freq = np.unique(out_degrees, return_counts=True)[1]
    out_in_degrees_corr = numeric_attribute_correlation(
        G, dict(G.out_degree), dict(G.in_degree))

    # dyad metrics
    dyad_freq = dyadic_census(G)
    dyad_metrics = compute_dyad_metrics(dyad_freq)

    # reciprocity
    reciprocity = None
    if n_edges > 0:
        # based on networkx definition
        mutual_edges = 2 * dyad_freq["2"]
        dyad_edges = dyad_freq["1"] + mutual_edges
        # Edges can exist while the census reports no "1" or "2" dyads
        # (presumably when every edge is a self-loop -- verify against
        # dyadic_census). Reciprocity then stays None instead of dividing
        # by zero.
        if dyad_edges > 0:
            reciprocity = mutual_edges / dyad_edges

    # clustering
    global_clustering = transitivity(G)
    local_clustering_mean = average_clustering(G)

    # fraction of connected node pairs (any path len)
    f_connected_node_pairs = fraction_of_connected_node_pairs(G)

    # centralization
    cent_metrics = centralization_metrics(G, prefix="_di")

    metrics = {
        "n_edges_di": n_edges,
        "density_di": density(G),
        "reciprocity": reciprocity,

        # in_degree
        "in_degrees_mean": safe(np.mean, in_degrees),
        "in_degrees_var": safe(np.var, in_degrees),
        "in_degrees_hidx": safe(h_index, in_degrees),
        "in_degrees_gini": safe(gini, in_degrees + eps),
        "in_degrees_f0": safe(np.mean, (in_degrees == 0)),
        "in_degrees_pk_ent": entropy(in_degrees_k_freq),
        "in_degrees_pk_gini": gini(in_degrees_k_freq),

        # out_degree
        "out_degrees_mean": safe(np.mean, out_degrees),
        "out_degrees_var": safe(np.var, out_degrees),
        "out_degrees_hidx": safe(h_index, out_degrees),
        "out_degrees_gini": safe(gini, out_degrees + eps),
        "out_degrees_f0": safe(np.mean, (out_degrees == 0)),
        "out_degrees_pk_ent": entropy(out_degrees_k_freq),
        "out_degrees_pk_gini": gini(out_degrees_k_freq),

        # degree assortativity
        "out_in_degrees_corr": out_in_degrees_corr,

        # dyad metric
        **dyad_metrics,

        # fraction of connected node pairs with path of any length
        "f_connected_node_pairs_di": f_connected_node_pairs,

        # clustering coefficients
        "global_clustering_di": global_clustering,
        "local_clustering_mean_di": local_clustering_mean,

        # centralization
        **cent_metrics,
    }

    return metrics
def compute_undirected_graph_metrics(G):
    """Compute a flat dictionary of summary metrics for an undirected graph.

    Args:
        G: The graph to describe. Its type must be exactly ``nx.Graph``; the
            check uses ``type(G) is ...``, so subclasses are rejected.

    Returns:
        dict: Edge count and density, degree statistics, the fraction of
        connected node pairs, clustering coefficients, centralization and
        modularity entries, the share of nodes in the largest connected
        component, that component's algebraic connectivity (``None`` when it
        has at most two nodes or the computation fails), and per-threshold
        connected-component, k-core and k-truss metrics.

    Raises:
        AssertionError: If ``G`` is not exactly an ``nx.Graph``.
    """
    assert type(G) is nx.Graph

    # degrees stats
    degrees = np.array([i for _, i in G.degree])
    # Only the counts from np.unique are kept: how many nodes share each
    # distinct degree value.
    degrees_k_freq = np.unique(degrees, return_counts=True)[1]
    degrees_corr = numeric_attribute_correlation(G, dict(G.degree), dict(G.degree))

    # clustering
    global_clustering = transitivity(G)
    local_clustering_mean = average_clustering(G)

    # fraction of connected node pairs (any path len)
    f_connected_node_pairs = fraction_of_connected_node_pairs(G)

    # centralization
    cent_metrics = centralization_metrics(G, prefix="_ud")

    # modularity
    modularity_metrics = compute_modularity_metrics(G)

    # connected components, materialised once so the graph is not traversed
    # again for the size statistics below
    components = list(connected_components(G))

    # largest CC
    # NOTE(review): max() raises ValueError when `components` is empty, which
    # is presumably the case for a graph without nodes -- confirm whether
    # callers can pass one.
    CC1_nodes = max(components, key=len)
    CC1 = G.subgraph(CC1_nodes).copy()
    f_CC1_nodes = len(CC1) / len(G)

    # algebraic_connectivity of the largest CC
    algebraic_connectivity_CC1 = None
    if len(CC1) > 2:
        try:
            algebraic_connectivity_CC1 = algebraic_connectivity(CC1, seed=0)
        except Exception:
            # Best effort: a failed computation is reported as None. Catching
            # Exception instead of using a bare except lets KeyboardInterrupt
            # and SystemExit propagate.
            algebraic_connectivity_CC1 = None

    # number of connected components with at least k nodes, per threshold
    CC_sizes = np.array([len(cc_i) for cc_i in components])
    CC_metrics = {
        f"n_CC_{k}": np.sum(CC_sizes >= k) for k in CC_k_thresholds
    }

    # k-core
    k_core_metrics = {}
    G_core_number = core_number(G)
    for k in k_core_ks:
        k_core_subgraph = k_core(G, k=k, core_number=G_core_number)
        k_core_metrics[f"core_{k}_n_nodes"] = len(k_core_subgraph.nodes)
        k_core_metrics[f"core_{k}_n_edges"] = len(k_core_subgraph.edges)
        k_core_metrics[f"core_{k}_density"] = density(k_core_subgraph)
        k_core_metrics[f"core_{k}_n_CC"] = sum(
            1 for _ in connected_components(k_core_subgraph))

    # k-truss
    k_truss_metrics = {}
    for k in k_truss_ks:
        k_truss_subgraph = k_truss(G, k=k)
        k_truss_metrics[f"truss_{k}_n_nodes"] = len(k_truss_subgraph.nodes)
        k_truss_metrics[f"truss_{k}_n_edges"] = len(k_truss_subgraph.edges)
        k_truss_metrics[f"truss_{k}_density"] = density(k_truss_subgraph)
        k_truss_metrics[f"truss_{k}_n_CC"] = sum(
            1 for _ in connected_components(k_truss_subgraph))

    metrics = {
        "n_edges_ud": len(G.edges()),
        "density_ud": density(G),

        # degree stats
        "degrees_mean": safe(np.mean, degrees),
        "degrees_var": safe(np.var, degrees),
        "degrees_hidx": safe(h_index, degrees),
        "degrees_gini": safe(gini, degrees + eps),
        "degrees_f0": safe(np.mean, (degrees == 0)),
        "degrees_corr": degrees_corr,
        "degrees_pk_ent": entropy(degrees_k_freq),
        "degrees_pk_gini": gini(degrees_k_freq),

        # fraction of connected node pairs with path of any length
        "f_connected_node_pairs_ud": f_connected_node_pairs,

        # clustering coefficients
        "global_clustering_ud": global_clustering,
        "local_clustering_mean_ud": local_clustering_mean,

        # centralization
        **cent_metrics,

        # modularity
        **modularity_metrics,

        # fraction of nodes in the largest CC
        "f_CC1_nodes": f_CC1_nodes,

        # algebraic connectivity of the largest CC
        "algebraic_connectivity_CC1": algebraic_connectivity_CC1,

        # connected components
        **CC_metrics,

        # k-core
        **k_core_metrics,

        # k-truss
        **k_truss_metrics,
    }

    return metrics