import struct

import networkx as nx
from networkx.algorithms import core


def get_egonet(user):
    """Build the ego network of `user` from the binary edge list 'kbin',
    using 'degs' as a (node -> degree, offset) index, then peel its k-cores."""
    f = open('kbin', 'rb')
    g = open('degs', 'r')
    index = {}  # node -> (edge count, byte offset into 'kbin')
    for l in g:
        line = l.split()
        index[int(line[0])] = (int(line[1]), int(line[3]) * 8)
    g.close()
    f.seek(index[user][1], 0)
    G = nx.DiGraph()
    # Edges incident to the user
    for i in range(index[user][0]):
        edge = struct.unpack('I I', f.read(8))
        G.add_edge(edge[0], edge[1])
    # Edges incident to each neighbour (snapshot the neighbours first,
    # since adding edges mutates G during iteration)
    for nbr in list(G.neighbors(user)):
        f.seek(index[nbr][1], 0)
        for i in range(index[nbr][0]):
            edge = struct.unpack('I I', f.read(8))
            G.add_edge(edge[0], edge[1])
    f.close()
    print(len(G.edges()), end=' ')
    # Peel increasingly tight cores out of the ego network
    for i in range(2, 10):
        G.remove_edges_from(nx.selfloop_edges(G))
        G = core.k_core(G, i)
        print(nx.info(G))
        l = len(G.edges())
        print(l, end=' ')
    print()
    return G
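# A hypothetical writer for the on-disk layout get_egonet() assumes, inferred
# from the reader above (not part of the original code): 'kbin' holds
# consecutive (src, dst) pairs as unsigned 32-bit ints, and 'degs' is a
# whitespace-separated index whose columns 0, 1 and 3 are the node id, its
# edge count, and its offset (in 8-byte records) into 'kbin'.
import struct

def write_egonet_index(edges_by_node, kbin_path='kbin', degs_path='degs'):
    offset = 0  # offset counted in 8-byte (src, dst) records
    with open(kbin_path, 'wb') as f, open(degs_path, 'w') as g:
        for node, edges in edges_by_node.items():
            # column 2 is unused by the reader; write a placeholder
            g.write(f'{node} {len(edges)} 0 {offset}\n')
            for src, dst in edges:
                f.write(struct.pack('I I', src, dst))
                offset += 1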
def extract_with_inflexion(self, document: str) -> Sequence[Tuple[str, float]]:
    """Extract keywords corresponding to the nodes of the k-core selected
    from k-shell size differences.

    Go down the k-shells while the k-shell size keeps increasing; stop
    otherwise.
    """
    # Building the graph-of-words
    gow = self.builder.compute_gow_from_document(document)
    if len(gow.nodes) > 0:
        graph = gow.to_graph()
        # Computation of the k-cores
        if self.builder.weighted:
            kcore_number = core_number_weighted(graph)
        else:
            kcore_number = nx_core_number(graph)
        # Sorted sequence of k for each k-core, descending
        ks = sorted({k for _, k in kcore_number.items()}, reverse=True)
        # Going down the k-cores while the k-shell size is increasing
        k_best = None
        previous = None
        for k1, k2 in zip(ks, ks[1:]):
            g_k1 = k_shell(graph, k=k1, core_number=kcore_number)
            g_k2 = k_shell(graph, k=k2, core_number=kcore_number)
            len_k1 = len(g_k1.nodes)
            len_k2 = len(g_k2.nodes)
            current = len_k2 - len_k1
            if previous is not None:
                if (previous < 0) and (current > 0):
                    k_best = k2
                    break
            previous = current
        if k_best is None:
            k_best = ks[0]
        # Retrieving the keywords for the k-core with k=k_best
        keywords = []
        best_graph = k_core(graph, k=k_best, core_number=kcore_number)
        for v in best_graph.nodes:
            token_code = best_graph.nodes[v]['label']
            token = self.builder.get_token_(token_code)
            k = kcore_number[v]
            keywords.append((token, k))
        return sorted(keywords, key=lambda p: p[1], reverse=True)
    else:
        return []
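# A self-contained sketch of the same inflexion criterion on a plain NetworkX
# graph (the graph-of-words builder is not needed to illustrate it): walk the
# k-shells from the densest down and stop once shell sizes switch from
# shrinking to growing; the helper name inflexion_k is hypothetical.
import networkx as nx

def inflexion_k(graph):
    core_num = nx.core_number(graph)
    ks = sorted(set(core_num.values()), reverse=True)
    previous = None
    for k1, k2 in zip(ks, ks[1:]):
        len_k1 = len(nx.k_shell(graph, k=k1, core_number=core_num))
        len_k2 = len(nx.k_shell(graph, k=k2, core_number=core_num))
        current = len_k2 - len_k1
        if previous is not None and previous < 0 and current > 0:
            return k2
        previous = current
    return ks[0]  # fall back to the main core, as above

G = nx.barbell_graph(6, 10)
k_best = inflexion_k(G)
print(k_best, sorted(nx.k_core(G, k=k_best).nodes()))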
def extract_with_density(self, document: str) -> Sequence[Tuple[str, float]]:
    """Extract keywords corresponding to the nodes of the k-core
    satisfying a density criterion.

    The density criterion consists in applying the elbow method while
    going down the k-cores.
    """
    # Building the graph-of-words
    gow = self.builder.compute_gow_from_document(document)
    if len(gow.nodes) > 0:
        graph = gow.to_graph()
        # Computation of the k-cores
        if self.builder.weighted:
            kcore_number = core_number_weighted(graph)
        else:
            kcore_number = nx_core_number(graph)
        # Sorted sequence of k for each k-core
        ks = sorted({k for _, k in kcore_number.items()})
        # Storage for (i, density) pairs
        densities = []
        # Mapping between i and the k-core value
        i_to_k = {}
        # Storage of the k-core graph for each k
        k_graphs = {}
        # Going DOWN the k-cores and computing the k-core densities
        for i, k in enumerate(reversed(ks)):
            g_k = k_core(graph, k=k, core_number=kcore_number)
            k_graphs[k] = g_k
            i_to_k[i] = k
            densities.append((i, density(g_k)))
        # Retrieving the most appropriate density via the elbow method
        i_k_best = elbow(densities)
        # Retrieving the corresponding k
        k_best = i_to_k[i_k_best]
        # Retrieving the keywords for the k-core with k=k_best
        keywords = []
        best_graph = k_graphs[k_best]
        for v in best_graph.nodes:
            token_code = best_graph.nodes[v]['label']
            token = self.builder.get_token_(token_code)
            k = kcore_number[v]
            keywords.append((token, k))
        return sorted(keywords, key=lambda p: p[1], reverse=True)
    else:
        return []
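# extract_with_density() relies on an elbow() helper that is not shown. A
# minimal sketch of one common elbow heuristic, assuming elbow() takes
# (index, density) pairs and returns the index farthest from the chord
# joining the first and last points (the actual implementation may differ):
import math

def elbow(points):
    (x0, y0), (x1, y1) = points[0], points[-1]
    dx, dy = x1 - x0, y1 - y0
    norm = math.hypot(dx, dy) or 1.0
    best_i, best_d = points[0][0], -1.0
    for x, y in points:
        # perpendicular distance from (x, y) to the first-last chord
        d = abs(dy * (x - x0) - dx * (y - y0)) / norm
        if d > best_d:
            best_i, best_d = x, d
    return best_i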
def statistics(self):
    # x: k (core order), y: fraction of the graph retained by the k-core
    sizes = []   # edge fraction per k-core (computed but not plotted)
    Xsize = []   # node fraction per k-core
    ks = range(3, 35, 1)
    G = self.G1
    total = G.size()          # number of edges in the full graph
    Xsize_total = len(G)      # number of nodes in the full graph
    for i in ks:
        Gk = core.k_core(G, k=i)
        s = Gk.size()
        print(len(Gk))
        sizes.append(s / total)
        Xsize.append(len(Gk) / Xsize_total)
    plt.plot(ks, Xsize)
    plt.xlabel('k-core')
    plt.ylabel('(# of subgraph nodes / # of full graph nodes)')
    plt.show()
        G.add_edge(node, adj)

    # Handle cliques
    try:
        signal.signal(signal.SIGALRM, percolate.clique_handler)
        if CCid in ['03ae', '03b0', '03b2', '03b5', '03b7', '0893']:
            # Skip histones and other complex OGs
            raise percolate.CliqueError
        signal.alarm(90)
        cliques = list(find_cliques(G))
        signal.alarm(0)
    except percolate.CliqueError:
        print(f'CliqueError: {CCid}')
        for k in ks:
            subOGs = set()
            core = k_core(G, k)
            for component in connected_components(core):
                subOGs.add(frozenset(
                    [frozenset(edge) for edge in core.edges(component)]))
            OGs_ks[k].append(subOGs)
            classify_CC(CCtypes_ks[k], subOGs)
        continue  # Continue to next OG

    # Handle percolation
    for k in ks:
        try:
            signal.signal(signal.SIGALRM, percolate.percolate_handler)
            signal.alarm(90)
            subOGs = list(
                percolate.k_clique_communities_progressive(G, k, cliques))
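# The percolate module's alarm handlers are not shown in this excerpt. The
# SIGALRM pattern implies each handler raises a module-specific exception
# when the 90-second alarm fires; a minimal sketch of that assumption
# (percolate.percolate_handler presumably follows the same pattern):
class CliqueError(Exception):
    """Raised when clique finding exceeds its time budget."""

def clique_handler(signum, frame):
    # signal delivers (signal number, current stack frame) to the handler
    raise CliqueError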
def Gk_with_max_k(self, n, G, k):
    """Starting from an upper bound k, decrease k until node n is
    contained in the k-core; return that core and its k."""
    Gk = core.k_core(G, k=k)
    while n not in Gk:
        k = k - 1
        Gk = core.k_core(G, k=k)
    return Gk, k
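# Usage sketch for Gk_with_max_k(): starting from an upper bound on k, relax
# until node n survives the peeling, i.e. find the densest core containing n.
# nx.core_number(G)[n] yields the same k in a single pass, which is cheaper
# when many nodes are queried; the toy check below assumes a plain nx.Graph.
import networkx as nx
from networkx.algorithms import core

G = nx.karate_club_graph()
n = 0
k = max(dict(G.degree()).values())  # safe upper bound on any core number
Gk = core.k_core(G, k=k)
while n not in Gk:
    k -= 1
    Gk = core.k_core(G, k=k)
assert k == nx.core_number(G)[n]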
def main():
    csvfile = open(result_file, 'a')
    result_writer = csv.writer(csvfile, delimiter=',', quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
    true_k_core = {}
    for net_name in network_name:
        net = nx.read_edgelist(network_path + net_name + ".txt",
                               create_using=nx.Graph(), nodetype=int)
        net.remove_edges_from(nx.selfloop_edges(net))
        for k in [512, 256, 128, 64, 32, 16, 8, 4]:
            true_k_core[(net_name, k)] = nx_core.k_core(net, k)
            # result_writer.writerow([time.time(),
            #                         "node_privacy",
            #                         "true",
            #                         "k_core",
            #                         net_name,
            #                         0.5,
            #                         0.5,
            #                         k,  # reserve for index
            #                         true_k_core
            #                         ])
    for i in range(int(sys.argv[1])):
        print("Repeat:", i)
        for net_name in network_name:
            print("Net:", net_name)
            net = nx.read_edgelist(network_path + net_name + ".txt",
                                   create_using=nx.Graph(), nodetype=int)
            net.remove_edges_from(nx.selfloop_edges(net))
            for k in [512, 256, 128, 64, 32, 16, 8, 4]:
                for epsilon in [0.5, 0.1, 0.05]:
                    delta = epsilon
                    basic_node_k_core = nx_core.k_core(
                        basic_node.private_basic_node_sample(net, epsilon, delta),
                        k * min(1 - math.exp(-epsilon), delta, 1.0)**2)
                    result_writer.writerow([
                        time.time(), "node_privacy", "basic_node", "k_core",
                        net_name, epsilon, delta, k,  # reserve for index
                        jaccard(true_k_core[(net_name, k)], basic_node_k_core)
                    ])
                    basic_edge_k_core = nx_core.k_core(
                        basic_edge.private_basic_edge_sample(net, epsilon, delta),
                        k * min(1 - math.exp(-epsilon), delta, 1.0))
                    result_writer.writerow([
                        time.time(), "edge_privacy", "basic_edge", "k_core",
                        net_name, epsilon, delta, k,
                        jaccard(true_k_core[(net_name, k)], basic_edge_k_core)
                    ])
                    color_k_core = nx_core.k_core(
                        color.private_color_sample(net, epsilon, delta),
                        k * min(1.0, delta))
                    result_writer.writerow([
                        time.time(), "edge_privacy", "color", "k_core",
                        net_name, epsilon, delta, k,
                        jaccard(true_k_core[(net_name, k)], color_k_core)
                    ])
                    csvfile.flush()
                blocki_edge_100_k_core = nx_core.k_core(
                    blocki_edge.blocki_edge_trim(net, 100), k)
                for epsilon in [0.5, 0.1, 0.05]:
                    result_writer.writerow([
                        time.time(), "edge_privacy", "blocki_edge_100",
                        "k_core", net_name, epsilon, epsilon, k,
                        jaccard(true_k_core[(net_name, k)],
                                blocki_edge_100_k_core)
                    ])
                blocki_edge_1000_k_core = nx_core.k_core(
                    blocki_edge.blocki_edge_trim(net, 1000), k)
                for epsilon in [0.5, 0.1, 0.05]:
                    result_writer.writerow([
                        time.time(), "edge_privacy", "blocki_edge_1000",
                        "k_core", net_name, epsilon, epsilon, k,
                        jaccard(true_k_core[(net_name, k)],
                                blocki_edge_1000_k_core)
                    ])
                ding_100_k_core = nx_core.k_core(ding.ding_trim(net, 100), k)
                for epsilon in [0.5, 0.1, 0.05]:
                    result_writer.writerow([
                        time.time(), "edge_privacy", "ding_100", "k_core",
                        net_name, epsilon, epsilon, k,
                        jaccard(true_k_core[(net_name, k)], ding_100_k_core)
                    ])
                csvfile.flush()
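# The jaccard() helper used above is defined elsewhere. A minimal sketch,
# assuming it compares the node sets of the true and privatised k-cores
# (it could equally well be defined over edge sets):
def jaccard(g1, g2):
    a, b = set(g1.nodes()), set(g2.nodes())
    if not a and not b:
        return 1.0  # two empty cores are considered identical
    return len(a & b) / len(a | b)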
def process_k_core(data_dir, k):
    product_nodes = set()
    user_nodes = set()
    # Compute the k-core products and users
    graph = nx.Graph()
    for category in CATEGORIES:
        print(category)
        token_length = pd.read_csv(
            token_length_path(data_dir, category))['token_counts'].values
        for i, review in enumerate(parse(raw_reviews_path(data_dir, category))):
            if 'reviewText' not in review:
                continue
            if len(review['reviewText'].strip()) == 0:
                continue
            if token_length[i] > 512:
                continue
            product_id = review['asin']
            user_id = review['reviewerID']
            if product_id not in product_nodes:
                graph.add_node(product_id, is_product=True)
                product_nodes.add(product_id)
            if user_id not in user_nodes:
                graph.add_node(user_id, is_product=False)
                user_nodes.add(user_id)
            graph.add_edge(user_id, product_id)
        assert token_length.size == (i + 1), f'{token_length.size}, {i}'
    k_core_graph = k_core(graph, k=k)
    k_core_nodes = set(k_core_graph.nodes)
    with open(user_list_path(data_dir), 'w') as f_user:
        with open(product_list_path(data_dir), 'w') as f_product:
            for node in k_core_graph.nodes:
                assert not (node in product_nodes and node in user_nodes)
                if node in product_nodes:
                    f_product.write(f'{node}\n')
                elif node in user_nodes:
                    f_user.write(f'{node}\n')
    # Load the k-core products and users
    print('loading users and product IDs...')
    user_df = pd.read_csv(user_list_path(data_dir), names=['user_id'])
    user_ids = set(user_df['user_id'])
    product_df = pd.read_csv(product_list_path(data_dir), names=['product_id'])
    product_ids = set(product_df['product_id'])
    # Save reviews in the k-core subset
    with open(reviews_with_duplicates_path(data_dir), 'w') as f:
        field_list = ['reviewerID', 'asin', 'overall', 'reviewTime',
                      'unixReviewTime', 'reviewText', 'summary', 'verified',
                      'category']
        writer = csv.DictWriter(f, field_list, quoting=csv.QUOTE_NONNUMERIC)
        for category in CATEGORIES:
            print(category)
            token_length = pd.read_csv(
                token_length_path(data_dir, category))['token_counts'].values
            for i, review in enumerate(
                    parse(raw_reviews_path(data_dir, category))):
                if 'reviewText' not in review:
                    continue
                if len(review['reviewText'].strip()) == 0:
                    continue
                if token_length[i] > 512:
                    continue
                product_id = review['asin']
                user_id = review['reviewerID']
                if user_id in user_ids and product_id in product_ids:
                    row = {}
                    for field in field_list:
                        if field == 'category':
                            row[field] = category
                        elif field in review:
                            row[field] = review[field]
                        else:
                            print(f'missing {field}')
                            row[field] = ""
                    writer.writerow(row)
    # Remove duplicates
    df = pd.read_csv(reviews_with_duplicates_path(data_dir),
                     names=field_list,
                     dtype={'reviewerID': str, 'asin': str, 'reviewTime': str,
                            'unixReviewTime': int, 'reviewText': str,
                            'summary': str, 'verified': bool, 'category': str},
                     keep_default_na=False, na_values=[])
    df['reviewYear'] = df['reviewTime'].apply(lambda x: int(x.split(',')[-1]))
    df = df.drop_duplicates(['asin', 'reviewerID', 'overall', 'reviewTime'])
    df.to_csv(reviews_path(data_dir), index=False,
              quoting=csv.QUOTE_NONNUMERIC)
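# parse() is defined elsewhere. Amazon review dumps are typically gzipped
# JSON-lines files, so a plausible sketch (an assumption, not the project's
# actual helper):
import gzip
import json

def parse(path):
    with gzip.open(path, 'rt') as f:
        for line in f:
            yield json.loads(line)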
def compute_undirected_graph_metrics(G):
    assert type(G) is nx.Graph
    # Degree stats
    degrees = np.array([i for _, i in G.degree])
    degrees_k_freq = np.unique(degrees, return_counts=True)[1]
    degrees_corr = numeric_attribute_correlation(G, dict(G.degree),
                                                 dict(G.degree))
    # Clustering
    global_clustering = transitivity(G)
    local_clustering_mean = average_clustering(G)
    # Fraction of connected node pairs (any path length)
    f_connected_node_pairs = fraction_of_connected_node_pairs(G)
    # Centralization
    cent_metrics = centralization_metrics(G, prefix="_ud")
    # Modularity
    modularity_metrics = compute_modularity_metrics(G)
    # Largest connected component
    CC1_nodes = max(connected_components(G), key=len)
    CC1 = G.subgraph(CC1_nodes).copy()
    f_CC1_nodes = len(CC1) / len(G)
    # Algebraic connectivity of the largest CC
    algebraic_connectivity_CC1 = None
    if len(CC1) > 2:
        try:
            algebraic_connectivity_CC1 = algebraic_connectivity(CC1, seed=0)
        except Exception:
            algebraic_connectivity_CC1 = None
    # Connected components
    CC = connected_components(G)
    CC_sizes = np.array([len(cc_i) for cc_i in CC])
    CC_metrics = {}
    for k in CC_k_thresholds:
        CC_metrics[f"n_CC_{k}"] = np.sum(CC_sizes >= k)
    # k-core
    k_core_metrics = {}
    G_core_number = core_number(G)
    for k in k_core_ks:
        k_core_subgraph = k_core(G, k=k, core_number=G_core_number)
        k_core_metrics[f"core_{k}_n_nodes"] = len(k_core_subgraph.nodes)
        k_core_metrics[f"core_{k}_n_edges"] = len(k_core_subgraph.edges)
        k_core_metrics[f"core_{k}_density"] = density(k_core_subgraph)
        k_core_metrics[f"core_{k}_n_CC"] = len(
            list(connected_components(k_core_subgraph)))
    # k-truss
    k_truss_metrics = {}
    for k in k_truss_ks:
        k_truss_subgraph = k_truss(G, k=k)
        k_truss_metrics[f"truss_{k}_n_nodes"] = len(k_truss_subgraph.nodes)
        k_truss_metrics[f"truss_{k}_n_edges"] = len(k_truss_subgraph.edges)
        k_truss_metrics[f"truss_{k}_density"] = density(k_truss_subgraph)
        k_truss_metrics[f"truss_{k}_n_CC"] = len(
            list(connected_components(k_truss_subgraph)))
    metrics = {
        "n_edges_ud": len(G.edges()),
        "density_ud": density(G),
        # degree stats
        "degrees_mean": safe(np.mean, degrees),
        "degrees_var": safe(np.var, degrees),
        "degrees_hidx": safe(h_index, degrees),
        "degrees_gini": safe(gini, degrees + eps),
        "degrees_f0": safe(np.mean, (degrees == 0)),
        "degrees_corr": degrees_corr,
        "degrees_pk_ent": entropy(degrees_k_freq),
        "degrees_pk_gini": gini(degrees_k_freq),
        # fraction of connected node pairs with a path of any length
        "f_connected_node_pairs_ud": f_connected_node_pairs,
        # clustering coefficients
        "global_clustering_ud": global_clustering,
        "local_clustering_mean_ud": local_clustering_mean,
        # centralization
        **cent_metrics,
        # modularity
        **modularity_metrics,
        # fraction of nodes in the largest CC
        "f_CC1_nodes": f_CC1_nodes,
        # algebraic connectivity of the largest CC
        "algebraic_connectivity_CC1": algebraic_connectivity_CC1,
        # connected components
        **CC_metrics,
        # k-core
        **k_core_metrics,
        # k-truss
        **k_truss_metrics,
    }
    return metrics
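# compute_undirected_graph_metrics() leans on small helpers (safe, gini,
# h_index, eps, ...) defined elsewhere in the module. A plausible sketch of
# safe(), which guards a statistic against empty input -- an assumption about
# its semantics, not the original implementation:
def safe(fn, values):
    try:
        return fn(values) if len(values) > 0 else None
    except Exception:
        return None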
nx.draw(ind)

import community

G_undirected = G.to_undirected()
bp = community.best_partition(G_undirected)
mod = community.modularity(bp, G_undirected, weight='weight')
print(mod)

from networkx.algorithms.core import k_core

G_undirected = G.to_undirected()
G_undirected.remove_edges_from(nx.selfloop_edges(G_undirected))
max_kcore = k_core(G_undirected)
print("Number of nodes in the max k-core: " + str(max_kcore.number_of_nodes()))
print("Number of edges in the max k-core: " + str(max_kcore.number_of_edges()))
print(max_kcore.degree(list(max_kcore.nodes())[0]))

G_undirected.remove_edges_from(nx.selfloop_edges(G_undirected))
k1 = k_core(G_undirected, k=2)
k2 = k_core(G_undirected, k=5)
k3 = k_core(G_undirected, k=10)
k4 = k_core(G_undirected, k=15)
k5 = k_core(G_undirected, k=20)
k6 = k_core(G_undirected, k=25)
k7 = k_core(G_undirected, k=30)
k8 = k_core(G_undirected, k=35)
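# The eight k_core() calls above each re-peel the graph from scratch.
# Computing the core number once and slicing is equivalent and cheaper;
# a sketch reusing the G_undirected prepared above:
core_num = nx.core_number(G_undirected)
cores = {k: G_undirected.subgraph(
                [v for v, c in core_num.items() if c >= k]).copy()
         for k in (2, 5, 10, 15, 20, 25, 30, 35)}
# cores[5] has the same nodes and edges as k_core(G_undirected, k=5)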