Example #1
import struct

import networkx as nx
from networkx.algorithms import core


def get_egonet(user):
    # Load the index: node id -> (edge count, byte offset into 'kbin').
    offsets = {}
    with open('degs', 'r') as g:
        for line in g:
            fields = line.split()
            offsets[int(fields[0])] = (int(fields[1]), int(fields[3]) * 8)

    G = nx.DiGraph()
    with open('kbin', 'rb') as f:
        # Read the user's own edges: each record is two unsigned 32-bit ints.
        f.seek(offsets[user][1], 0)
        for _ in range(offsets[user][0]):
            src, dst = struct.unpack('I I', f.read(8))
            G.add_edge(src, dst)

        # Read each neighbor's edges; copy the neighbor list first, since
        # adding edges while iterating over G.neighbors() would fail.
        for nbr in list(G.neighbors(user)):
            f.seek(offsets[nbr][1], 0)
            for _ in range(offsets[nbr][0]):
                src, dst = struct.unpack('I I', f.read(8))
                G.add_edge(src, dst)

    print(len(G.edges()), end=' ')
    for i in range(2, 10):
        # k_core() rejects graphs with self-loops, so drop them first.
        G.remove_edges_from(nx.selfloop_edges(G))
        G = core.k_core(G, i)
        print(G)  # node/edge summary (replaces the removed nx.info())
        print(len(G.edges()), end=' ')
    print()
    return G
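The reader above assumes two companion files: a text index ('degs') with one line per node and a packed binary edge file ('kbin'). A hypothetical writer producing that layout, with the field order inferred from the reader (the third 'degs' column is ignored by it, so a placeholder is written):

import struct

def write_egonet_files(adjacency):
    # Hypothetical writer for the 'kbin'/'degs' layout consumed by
    # get_egonet(); adjacency maps node id -> list of (src, dst) edges.
    offset = 0  # running offset, counted in 8-byte edge records
    with open('kbin', 'wb') as f, open('degs', 'w') as g:
        for node, edges in adjacency.items():
            # degs columns: node id, edge count, placeholder, record index
            g.write(f'{node} {len(edges)} 0 {offset}\n')
            for src, dst in edges:
                f.write(struct.pack('I I', src, dst))
            offset += len(edges)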
Example #2
    def extract_with_inflexion(self,
                               document: str) -> Sequence[Tuple[str, float]]:
        """Extraction of keywords corresponding to the nodes of the k-core selected from k-shell size differences

        Going down the k-shell while the size of k-shell keeps increasing, stop otherwise
        """
        # Building the graph-of-words
        gow = self.builder.compute_gow_from_document(document)
        if len(gow.nodes) > 0:
            graph = gow.to_graph()

            # Computation of the k-cores
            if self.builder.weighted:
                kcore_number = core_number_weighted(graph)
            else:
                kcore_number = nx_core_number(graph)

            # Sorted sequence of k for each k-core descending
            ks = sorted({k for _, k in kcore_number.items()}, reverse=True)

            # Going down the k-core while k-shell size is increasing
            k_best = None
            previous = None
            for k1, k2 in zip(ks, ks[1:]):
                g_k1 = k_shell(graph, k=k1, core_number=kcore_number)
                g_k2 = k_shell(graph, k=k2, core_number=kcore_number)
                len_k1 = len(g_k1.nodes)
                len_k2 = len(g_k2.nodes)
                current = len_k2 - len_k1

                if previous is not None:
                    if (previous < 0) and (current > 0):
                        k_best = k2
                        break
                previous = current

            if k_best is None:
                k_best = ks[0]

            # Retrieving the keywords for k-core with k=k_best
            keywords = []
            best_graph = k_core(graph, k=k_best, core_number=kcore_number)
            for v in best_graph.nodes:
                token_code = best_graph.nodes[v]['label']
                token = self.builder.get_token_(token_code)
                k = kcore_number[v]
                keywords.append((token, k))

            return sorted(keywords, key=lambda p: p[1], reverse=True)
        else:
            return []
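The builder and graph-of-words classes are not shown, but the shell-size inflexion heuristic itself needs only networkx. A minimal sketch on a toy graph (the graph and variable names are illustrative assumptions):

import networkx as nx

# Toy graph: a 4-clique (core number 3) with a sparser fringe (core number 2).
G = nx.complete_graph(4)
G.add_edges_from([(0, 4), (4, 5), (1, 5)])

core_num = nx.core_number(G)
ks = sorted(set(core_num.values()), reverse=True)

# Walk down the shells; stop at the first inflexion in shell size.
previous, k_best = None, ks[0]
for k1, k2 in zip(ks, ks[1:]):
    current = (len(nx.k_shell(G, k=k2, core_number=core_num))
               - len(nx.k_shell(G, k=k1, core_number=core_num)))
    if previous is not None and previous < 0 and current > 0:
        k_best = k2
        break
    previous = current

print(k_best, sorted(nx.k_core(G, k=k_best, core_number=core_num).nodes))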
Example #3
    def extract_with_density(self,
                             document: str) -> Sequence[Tuple[str, float]]:
        """Extraction of keywords corresponding to the nodes of the k-core satisfying a density criterion

        Density criterion consists in applying the elbow method when going down the k-core
        """
        # Building the graph-of-words
        gow = self.builder.compute_gow_from_document(document)
        if len(gow.nodes) > 0:
            graph = gow.to_graph()

            # Computation of the k-cores
            if self.builder.weighted:
                kcore_number = core_number_weighted(graph)
            else:
                kcore_number = nx_core_number(graph)

            # Sorted sequence of k for each k-core
            ks = sorted({k for _, k in kcore_number.items()})

            # Storage for (i, density)
            densities = []
            # Mapping between i and the k-core value
            i_to_k = {}
            # Storage of k-core graph for each k
            k_graphs = {}

            # Going DOWN the k-core and computation of the k-core densities
            for i, k in enumerate(reversed(ks)):
                g_k = k_core(graph, k=k, core_number=kcore_number)
                k_graphs[k] = g_k
                i_to_k[i] = k
                densities.append((i, density(g_k)))

            # Retrieving the most appropriate density via the elbow method
            i_k_best = elbow(densities)
            # Retrieving the corresponding k
            k_best = i_to_k[i_k_best]

            # Retrieving the keywords for k-core with k=k_best
            keywords = []
            best_graph = k_graphs[k_best]
            for v in best_graph.nodes:
                token_code = best_graph.nodes[v]['label']
                token = self.builder.get_token_(token_code)
                k = kcore_number[v]
                keywords.append((token, k))

            return sorted(keywords, key=lambda p: p[1], reverse=True)
        else:
            return []
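The elbow helper used above is not shown. A common choice, and a plausible stand-in, picks the point farthest from the chord joining the first and last points of the density curve (a sketch under that assumption, not necessarily the original helper):

import numpy as np

def elbow(points):
    # Hypothetical stand-in for the unshown helper: given (i, value) pairs,
    # return the i whose point lies farthest from the chord between the
    # first and last points of the curve.
    pts = np.asarray(points, dtype=float)
    first, last = pts[0], pts[-1]
    chord = last - first
    norm = np.linalg.norm(chord)
    if norm == 0:
        return int(first[0])
    rel = pts - first
    # Perpendicular distance via the 2D cross product.
    dist = np.abs(rel[:, 0] * chord[1] - rel[:, 1] * chord[0]) / norm
    return int(pts[int(np.argmax(dist))][0])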
Example #4
    def statistics(self):
        # Plot the fraction of surviving nodes (y) against k (x).
        sizes = []
        Xsize = []
        ks = range(3, 35)
        G = self.G1
        total = G.size()       # number of edges in the full graph
        Xsize_total = len(G)   # number of nodes in the full graph
        for i in ks:
            Gk = core.k_core(G, k=i)
            s = Gk.size()
            print(len(Gk))
            sizes.append(s / total)              # edge fraction (collected, not plotted)
            Xsize.append(len(Gk) / Xsize_total)  # node fraction
        plt.plot(ks, Xsize)
        plt.xlabel('k-core')
        plt.ylabel('(# of subgraph nodes / # of full graph nodes)')
        plt.show()
Example #5
            G.add_edge(node, adj)

    # Handle cliques
    try:
        signal.signal(signal.SIGALRM, percolate.clique_handler)
        if CCid in ['03ae', '03b0', '03b2', '03b5', '03b7',
                    '0893']:  # Skip histones and other complex OGs
            raise percolate.CliqueError
        signal.alarm(90)
        cliques = list(find_cliques(G))
        signal.alarm(0)
    except percolate.CliqueError:
        print(f'CliqueError: {CCid}')
        for k in ks:
            subOGs = set()
            core = k_core(G, k)
            for component in connected_components(core):
                subOGs.add(
                    frozenset(
                        [frozenset(edge) for edge in core.edges(component)]))
            OGs_ks[k].append(subOGs)
            classify_CC(CCtypes_ks[k], subOGs)
        continue  # Continue to next OG

    # Handle percolation
    for k in ks:
        try:
            signal.signal(signal.SIGALRM, percolate.percolate_handler)
            signal.alarm(90)
            subOGs = list(
                percolate.k_clique_communities_progressive(G, k, cliques))
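The percolate module is not shown; the pattern above is a SIGALRM-based time budget around NetworkX's exponential-time clique enumeration. A self-contained sketch of that pattern (names are illustrative, and it is Unix-only since it relies on signal.alarm):

import signal
import networkx as nx

class CliqueTimeout(Exception):
    # Raised by the alarm handler when enumeration exceeds the budget.
    pass

def _alarm_handler(signum, frame):
    raise CliqueTimeout

def cliques_with_timeout(G, seconds=90):
    # Enumerate maximal cliques, giving up after `seconds`.
    signal.signal(signal.SIGALRM, _alarm_handler)
    signal.alarm(seconds)
    try:
        return list(nx.find_cliques(G))
    finally:
        signal.alarm(0)  # always cancel the pending alarm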
Example #6
    def Gk_with_max_k(self, n, G, k):
        """Return the k-core with the largest k (at most the given k)
        that still contains node n, together with that k."""
        Gk = core.k_core(G, k=k)
        while n not in Gk:
            k -= 1
            Gk = core.k_core(G, k=k)
        return Gk, k
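Note that the loop terminates only because every node of G belongs to the 0-core; if n is not in G at all, k would decrease forever. A guarded variant (a sketch; the explicit membership check is an addition, not in the original):

from networkx.algorithms import core

def gk_with_max_k_safe(n, G, k):
    # Largest-k core (starting the search at k) that still contains n.
    if n not in G:
        raise ValueError(f'node {n!r} is not in the graph')
    Gk = core.k_core(G, k=k)
    while n not in Gk:
        k -= 1
        Gk = core.k_core(G, k=k)
    return Gk, k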
Example #7
def main():
    csvfile = open(result_file, 'a')
    result_writer = csv.writer(csvfile,
                               delimiter=',',
                               quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)

    true_k_core = {}
    for net_name in network_name:
        net = nx.read_edgelist(network_path + net_name + ".txt",
                               create_using=nx.Graph(),
                               nodetype=int)
        net.remove_edges_from(nx.selfloop_edges(net))
        for k in [512, 256, 128, 64, 32, 16, 8, 4]:
            true_k_core[(net_name, k)] = nx_core.k_core(net, k)
            # result_writer.writerow([time.time(),
            #                         "node_privacy",
            #                         "true",
            #                         "k_core",
            #                         net_name,
            #                         0.5,
            #                         0.5,
            #                         k,  # reserve for index
            #                         true_k_core
            # ])

    for i in range(int(sys.argv[1])):
        print("Repeat:", i)
        for net_name in network_name:
            print("Net:", net_name)
            net = nx.read_edgelist(network_path + net_name + ".txt",
                                   create_using=nx.Graph(),
                                   nodetype=int)

            net.remove_edges_from(nx.selfloop_edges(net))
            for k in [512, 256, 128, 64, 32, 16, 8, 4]:
                for epsilon in [0.5, 0.1, 0.05]:
                    delta = epsilon
                    basic_node_k_core = nx_core.k_core(
                        basic_node.private_basic_node_sample(
                            net, epsilon, delta),
                        k * min(1 - math.exp(-epsilon), delta, 1.0)**2)
                    result_writer.writerow([
                        time.time(),
                        "node_privacy",
                        "basic_node",
                        "k_core",
                        net_name,
                        epsilon,
                        delta,
                        k,  # reserve for index
                        jaccard(true_k_core[(net_name, k)], basic_node_k_core)
                    ])

                    basic_edge_k_core = nx_core.k_core(
                        basic_edge.private_basic_edge_sample(
                            net, epsilon, delta),
                        k * min(1 - math.exp(-epsilon), delta, 1.0))
                    result_writer.writerow([
                        time.time(), "edge_privacy", "basic_edge", "k_core",
                        net_name, epsilon, delta, k,
                        jaccard(true_k_core[(net_name, k)], basic_edge_k_core)
                    ])

                    color_k_core = nx_core.k_core(
                        color.private_color_sample(net, epsilon, delta),
                        k * min(1.0, delta))
                    result_writer.writerow([
                        time.time(), "edge_privacy", "color", "k_core",
                        net_name, epsilon, delta, k,
                        jaccard(true_k_core[(net_name, k)], color_k_core)
                    ])

                    csvfile.flush()

                blocki_edge_100_k_core = nx_core.k_core(
                    blocki_edge.blocki_edge_trim(net, 100), k)
                for epsilon in [0.5, 0.1, 0.05]:
                    result_writer.writerow([
                        time.time(), "edge_privacy", "blocki_edge_100",
                        "k_core", net_name, epsilon, epsilon, k,
                        jaccard(true_k_core[(net_name, k)],
                                blocki_edge_100_k_core)
                    ])

                blocki_edge_1000_k_core = nx_core.k_core(
                    blocki_edge.blocki_edge_trim(net, 1000), k)
                for epsilon in [0.5, 0.1, 0.05]:
                    result_writer.writerow([
                        time.time(), "edge_privacy", "blocki_edge_1000",
                        "k_core", net_name, epsilon, epsilon, k,
                        jaccard(true_k_core[(net_name, k)],
                                blocki_edge_1000_k_core)
                    ])

                ding_100_k_core = nx_core.k_core(ding.ding_trim(net, 100), k)
                for epsilon in [0.5, 0.1, 0.05]:
                    result_writer.writerow([
                        time.time(), "edge_privacy", "ding_100", "k_core",
                        net_name, epsilon, epsilon, k,
                        jaccard(true_k_core[(net_name, k)], ding_100_k_core)
                    ])

                csvfile.flush()

    csvfile.close()
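The jaccard helper that scores each private k-core against the true one is not shown; a plausible sketch comparing node sets (an assumption — the original may compare edge sets instead):

def jaccard(G1, G2):
    # Jaccard similarity of the node sets of two (sub)graphs.
    a, b = set(G1.nodes), set(G2.nodes)
    union = a | b
    return len(a & b) / len(union) if union else 1.0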
Example #8
def process_k_core(data_dir, k):
    product_nodes = set()
    user_nodes = set()
    # compute k core products and users
    graph = nx.Graph()
    for category in CATEGORIES:
        print(category)
        token_length = pd.read_csv(token_length_path(
            data_dir, category))['token_counts'].values
        for i, review in enumerate(parse(raw_reviews_path(data_dir,
                                                          category))):
            if 'reviewText' not in review:
                continue
            if len(review['reviewText'].strip()) == 0:
                continue
            if token_length[i] > 512:
                continue
            product_id = review['asin']
            user_id = review['reviewerID']
            if product_id not in product_nodes:
                graph.add_node(product_id, is_product=True)
                product_nodes.add(product_id)
            if user_id not in user_nodes:
                graph.add_node(user_id, is_product=False)
                user_nodes.add(user_id)
            graph.add_edge(user_id, product_id)
        assert token_length.size == (i + 1), f'{token_length.size}, {i}'

    k_core_graph = k_core(graph, k=k)
    k_core_nodes = set(k_core_graph.nodes)
    with open(user_list_path(data_dir), 'w') as f_user:
        with open(product_list_path(data_dir), 'w') as f_product:
            for node in k_core_graph.nodes:
                assert not (node in product_nodes and node in user_nodes)
                if node in product_nodes:
                    f_product.write(f'{node}\n')
                elif node in user_nodes:
                    f_user.write(f'{node}\n')
    # load k core products and users
    print('loading users and product IDs...')
    user_df = pd.read_csv(user_list_path(data_dir), names=['user_id'])
    user_ids = set(user_df['user_id'])
    product_df = pd.read_csv(product_list_path(data_dir), names=['product_id'])
    product_ids = set(product_df['product_id'])
    # save reviews in k-core subset
    with open(reviews_with_duplicates_path(data_dir), 'w') as f:
        field_list = [
            'reviewerID', 'asin', 'overall', 'reviewTime', 'unixReviewTime',
            'reviewText', 'summary', 'verified', 'category'
        ]
        writer = csv.DictWriter(f, field_list, quoting=csv.QUOTE_NONNUMERIC)
        for category in CATEGORIES:
            print(category)
            token_length = pd.read_csv(token_length_path(
                data_dir, category))['token_counts'].values
            for i, review in enumerate(
                    parse(raw_reviews_path(data_dir, category))):
                if 'reviewText' not in review:
                    continue
                if len(review['reviewText'].strip()) == 0:
                    continue
                if token_length[i] > 512:
                    continue
                product_id = review['asin']
                user_id = review['reviewerID']
                if user_id in user_ids and product_id in product_ids:
                    row = {}
                    for field in field_list:
                        if field == 'category':
                            row[field] = category
                        elif field in review:
                            row[field] = review[field]
                        else:
                            print(f'missing {field}')
                            row[field] = ""
                    writer.writerow(row)
    # remove duplicates
    df = pd.read_csv(reviews_with_duplicates_path(data_dir),
                     names=field_list,
                     dtype={
                         'reviewerID': str,
                         'asin': str,
                         'reviewTime': str,
                         'unixReviewTime': int,
                         'reviewText': str,
                         'summary': str,
                         'verified': bool,
                         'category': str
                     },
                     keep_default_na=False,
                     na_values=[])
    df['reviewYear'] = df['reviewTime'].apply(lambda x: int(x.split(',')[-1]))
    df = df.drop_duplicates(['asin', 'reviewerID', 'overall', 'reviewTime'])
    df.to_csv(reviews_path(data_dir),
              index=False,
              quoting=csv.QUOTE_NONNUMERIC)
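On the bipartite user-product graph built above, the k-core keeps exactly the users and products that still have at least k reviews once all sparser nodes are iteratively pruned, which is the usual dense-subset filter for recommendation data. A toy illustration (graph contents made up):

import networkx as nx
from networkx.algorithms.core import k_core

B = nx.Graph()
# u1 and u2 each review p1 and p2; u3 reviews only p1.
B.add_edges_from([('u1', 'p1'), ('u1', 'p2'),
                  ('u2', 'p1'), ('u2', 'p2'),
                  ('u3', 'p1')])
dense = k_core(B, k=2)
print(sorted(dense.nodes))  # ['p1', 'p2', 'u1', 'u2'] -- u3 is pruned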
Example #9
def compute_undirected_graph_metrics(G):
    assert type(G) is nx.Graph

    # degrees stats
    degrees = np.array([i for _, i in G.degree])
    degrees_k_freq = np.unique(degrees, return_counts=True)[1]
    degrees_corr = numeric_attribute_correlation(G, dict(G.degree),
                                                 dict(G.degree))

    # clustering
    global_clustering = transitivity(G)
    local_clustering_mean = average_clustering(G)

    # fraction of connected node pairs (any path len)
    f_connected_node_pairs = fraction_of_connected_node_pairs(G)

    # centralization
    cent_metrics = centralization_metrics(G, prefix="_ud")

    # modularity
    modularity_metrics = compute_modularity_metrics(G)

    # largest CC
    CC1_nodes = max(connected_components(G), key=len)
    CC1 = G.subgraph(CC1_nodes).copy()
    f_CC1_nodes = len(CC1) / len(G)

    # algebraic_connectivity of the largest CC
    algebraic_connectivity_CC1 = None
    if len(CC1) > 2:
        try:
            algebraic_connectivity_CC1 = algebraic_connectivity(CC1, seed=0)
        except Exception:
            algebraic_connectivity_CC1 = None

    # connected components
    CC = connected_components(G)
    CC_sizes = np.array([len(cc_i) for cc_i in CC])

    CC_metrics = {}
    for k in CC_k_thresholds:
        CC_metrics[f"n_CC_{k}"] = np.sum(CC_sizes >= k)

    # k-core
    k_core_metrics = {}
    G_core_number = core_number(G)

    for k in k_core_ks:
        k_core_subgraph = k_core(G, k=k, core_number=G_core_number)
        k_core_metrics[f"core_{k}_n_nodes"] = len(k_core_subgraph.nodes)
        k_core_metrics[f"core_{k}_n_edges"] = len(k_core_subgraph.edges)
        k_core_metrics[f"core_{k}_density"] = density(k_core_subgraph)
        k_core_metrics[f"core_{k}_n_CC"] = len(
            list(connected_components(k_core_subgraph)))

    # k-truss
    k_truss_metrics = {}

    for k in k_truss_ks:
        k_truss_subgraph = k_truss(G, k=k)
        k_truss_metrics[f"truss_{k}_n_nodes"] = len(k_truss_subgraph.nodes)
        k_truss_metrics[f"truss_{k}_n_edges"] = len(k_truss_subgraph.edges)
        k_truss_metrics[f"truss_{k}_density"] = density(k_truss_subgraph)
        k_truss_metrics[f"truss_{k}_n_CC"] = len(
            list(connected_components(k_truss_subgraph)))

    metrics = {
        "n_edges_ud": len(G.edges()),
        "density_ud": density(G),
        # degree stats
        "degrees_mean": safe(np.mean, degrees),
        "degrees_var": safe(np.var, degrees),
        "degrees_hidx": safe(h_index, degrees),
        "degrees_gini": safe(gini, degrees + eps),
        "degrees_f0": safe(np.mean, (degrees == 0)),
        "degrees_corr": degrees_corr,
        "degrees_pk_ent": entropy(degrees_k_freq),
        "degrees_pk_gini": gini(degrees_k_freq),
        # fraction of connected node pairs with path of any length
        "f_connected_node_pairs_ud": f_connected_node_pairs,
        # clustering coefficients
        "global_clustering_ud": global_clustering,
        "local_clustering_mean_ud": local_clustering_mean,
        # centralization
        **cent_metrics,
        # modularity
        **modularity_metrics,
        # fraction of nodes in the largest CC
        "f_CC1_nodes": f_CC1_nodes,
        # algebraic connectivity of the largest CC
        "algebraic_connectivity_CC1": algebraic_connectivity_CC1,
        # connected components
        **CC_metrics,
        # k-core
        **k_core_metrics,
        # k-truss
        **k_truss_metrics,
    }

    return metrics
Example #10
nx.draw(ind)

import community
G_undirected = G.to_undirected()
bp = community.best_partition(G_undirected)

mod = community.modularity(bp, G_undirected, weight='weight')

mod  # notebook-style display of the modularity score

from networkx.algorithms.core import k_core

G_undirected = G.to_undirected()
G_undirected.remove_edges_from(nx.selfloop_edges(G_undirected))
max_kcore = k_core(G_undirected)

print("Number of nodes in the max k core: " + str(max_kcore.number_of_nodes()))
print("Number of edges in the max k core: " + str(max_kcore.number_of_edges()))
print(max_kcore.degree(list(max_kcore.nodes())[0]))

# Extract progressively deeper k-cores (self-loops were already removed above).
k1 = k_core(G_undirected, k=2)
k2 = k_core(G_undirected, k=5)
k3 = k_core(G_undirected, k=10)
k4 = k_core(G_undirected, k=15)
k5 = k_core(G_undirected, k=20)
k6 = k_core(G_undirected, k=25)
k7 = k_core(G_undirected, k=30)
k8 = k_core(G_undirected, k=35)
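Rather than peeling the graph once per k, the core number of every node can be computed once and reused for each extraction, and the maximum core (the degeneracy) read off directly; a sketch reusing G_undirected from above:

core_num = nx.core_number(G_undirected)
print('degeneracy (max core number):', max(core_num.values()))

# Reusing the precomputed core numbers avoids re-peeling the graph per k.
cores = {k: k_core(G_undirected, k=k, core_number=core_num)
         for k in (2, 5, 10, 15, 20, 25, 30, 35)}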