Example #1
    def test_non_edges(self):
        # All possible edges exist
        graph = nx.complete_graph(5)
        nedges = list(nx.non_edges(graph))
        assert_equal(len(nedges), 0)

        graph = nx.path_graph(4)
        expected = [(0, 2), (0, 3), (1, 3)]
        nedges = list(nx.non_edges(graph))
        for (u, v) in expected:
            assert_true((u, v) in nedges or (v, u) in nedges)

        graph = nx.star_graph(4)
        expected = [(1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
        nedges = list(nx.non_edges(graph))
        for (u, v) in expected:
            assert_true((u, v) in nedges or (v, u) in nedges)

        # Directed graphs
        graph = nx.DiGraph()
        graph.add_edges_from([(0, 2), (2, 0), (2, 1)])
        expected = [(0, 1), (1, 0), (1, 2)]
        nedges = list(nx.non_edges(graph))
        for e in expected:
            assert_true(e in nedges)
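For reference, a minimal sanity check of nx.non_edges outside the test harness (a sketch, assuming networkx >= 2.x is importable as nx):

import networkx as nx

G = nx.path_graph(4)                      # edges: 0-1, 1-2, 2-3
missing = sorted(nx.non_edges(G))
print(missing)                            # [(0, 2), (0, 3), (1, 3)]
assert len(missing) == 4 * 3 // 2 - G.number_of_edges()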
Example #2
 def __init__(self, infile, readFileLite=False):
     # create a new directed graph
     G = nx.DiGraph()

     self.allParts = []
     self.partsManager = partsManager()
     self.infile = infile
     if self.infile:
         self.readFile(self.infile, readFileLite)

     # add all nodes and every possible directed edge (complete digraph)
     G.add_nodes_from(range(self.n + 1))
     G.add_edges_from(nx.non_edges(G))

     # remove incoming edges to the root node 0
     for node in range(1, self.n + 1):
         G.remove_edge(node, 0)

     # remove pruned edges
     G = self.removePrunedEdges(G)

     # save graph
     self.graph = G
Example #3
def jaccard_predictions(G):
    """
    Create a ranked list of possible new links based on the Jaccard similarity,
    defined
     as the intersection of nodes divided by the union of nodes
    
    parameters
    G: Directed or undirected nx graph
    returns
    list of linkbunches with the score as an attribute
    """
    potential_edges = []
    G_undirected = nx.Graph(G)
    for non_edge in nx.non_edges(G_undirected):
        u = set(G.neighbors(non_edge[0]))
        v = set(G.neighbors(non_edge[1]))
        uv_un = len(u.union(v))
        uv_int = len(u.intersection(v))
        if uv_int == 0 or uv_un == 0:
            continue
        else:
            s = (1.0*uv_int)/uv_un
            
        potential_edges.append(non_edge + ({'score': s},))
        
    return potential_edges
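A usage sketch for the function above (hypothetical toy graph; assumes networkx is imported as nx and jaccard_predictions is in scope):

import networkx as nx

G = nx.Graph([(0, 1), (1, 2), (2, 3)])
# rank candidate links by their Jaccard score, highest first
ranked = sorted(jaccard_predictions(G), key=lambda e: e[2]['score'], reverse=True)
for u, v, attrs in ranked:
    print(u, v, attrs['score'])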
Example #4
def fuzz_network(G_orig, threshold, b, edge_frac=1.0, nonedge_mult=5.0):
    G = G_orig.copy()
    n = len(G.nodes())
    H = Graph()
    H.add_nodes_from(range(n))
    pairs = n * (n - 1) // 2  # total number of node pairs
    actual_edges = len(G.edges())
    edges = int(edge_frac * actual_edges)
    nonedges = int(edges * nonedge_mult)

    a = b / nonedge_mult

    # though these distributions are normalized to one, by selecting the appropriate number of edges
    # and nonedges, we make these 'distributions' correct
    edge_probs = np.random.beta(a + 1, b, edges)
    nonedge_probs = np.random.beta(a, b + 1, nonedges)

    # picking the right number of edges from the appropriate list
    edge_list = list(G.edges())  # materialize so shuffle() can permute in place
    nonedge_list = list(non_edges(G))
    shuffle(edge_list)
    shuffle(nonedge_list)
    for i in range(len(edge_probs)):
        G[edge_list[i][0]][edge_list[i][1]]["weight"] = edge_probs[i]
        if edge_probs[i] > threshold:
            H.add_edge(edge_list[i][0], edge_list[i][1])
    for i in range(len(nonedge_probs)):
        G.add_edge(nonedge_list[i][0], nonedge_list[i][1], weight=nonedge_probs[i])
        if nonedge_probs[i] > threshold:
            H.add_edge(nonedge_list[i][0], nonedge_list[i][1])

    return G, H
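fuzz_network above relies on module-level imports that the snippet does not show (numpy as np, random.shuffle, and Graph/non_edges from networkx). A hedged sketch of a call, under those assumptions:

import networkx as nx

G_orig = nx.erdos_renyi_graph(30, 0.2, seed=1)
# fuzz the graph: edges get Beta(a+1, b) weights, sampled non-edges get Beta(a, b+1)
G_fuzzed, H_thresholded = fuzz_network(G_orig, threshold=0.5, b=2.0)
print(G_fuzzed.number_of_edges(), H_thresholded.number_of_edges())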
Example #5
def add_remove_random_edges(G, pct_add, pct_remove):
    """Randomly add edges to and remove edges from G

    Parameters
    ----------
    G : a networkx.Graph
        the network

    pct_add : float
        A percentage (between 0 and 1)

    pct_remove : float
        A percentage (between 0 and 1)
    """
    assert_is_percentage(pct_add)
    assert_is_percentage(pct_remove)
    edges = list(G.edges())
    m = len(edges)
    to_add = int(m * pct_add)
    to_remove = int(m * pct_remove)
    log.debug("Will add %d (%f) edges to and remove %d (%f) edges of %d",
              to_add, pct_add, to_remove, pct_remove, m)

    new_edges = list(nx.non_edges(G))
    G.remove_edges_from(random.sample(edges, to_remove))
    G.add_edges_from(random.sample(new_edges, to_add))
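assert_is_percentage and log are module-level helpers that the snippet does not include; a minimal usage sketch with hypothetical stand-ins for both:

import logging
import random
import networkx as nx

log = logging.getLogger(__name__)

def assert_is_percentage(x):
    # hypothetical stand-in for the helper used above
    assert 0 <= x <= 1

G = nx.karate_club_graph()
add_remove_random_edges(G, pct_add=0.1, pct_remove=0.1)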
Example #6
def show_graph(g, vertex_color='typeof', size=15, vertex_label=None):
    """show_graph."""
    degrees = [len(list(g.neighbors(u))) for u in g.nodes()]  # neighbors() returns an iterator in networkx 2.x

    print('num nodes=%d' % len(g))
    print('num edges=%d' % len(g.edges()))
    print('num non edges=%d' % len(list(nx.non_edges(g))))
    print('max degree=%d' % max(degrees))
    print('median degree=%d' % np.percentile(degrees, 50))

    draw_graph(g, size=size,
               vertex_color=vertex_color, vertex_label=vertex_label,
               vertex_size=200, edge_label=None)

    # display degree distribution
    size = int((max(degrees) - min(degrees)) / 1.5)
    plt.figure(figsize=(size, 3))
    plt.title('Degree distribution')
    _bins = np.arange(min(degrees), max(degrees) + 2) - .5
    n, bins, patches = plt.hist(degrees, _bins,
                                alpha=0.3,
                                facecolor='navy', histtype='bar',
                                rwidth=0.8, edgecolor='k')
    labels = np.array([str(int(i)) for i in n])
    for xi, yi, label in zip(bins, n, labels):
        plt.text(xi + 0.5, yi, label, ha='center', va='bottom')

    plt.xticks(bins + 0.5)
    plt.xlim((min(degrees) - 1, max(degrees) + 1))
    plt.ylim((0, max(n) * 1.1))
    plt.xlabel('Node degree')
    plt.ylabel('Counts')
    plt.grid(linestyle=":")
    plt.show()
Example #7
def adamic_adar_index(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        return sum(1 / math.log(G.degree(w))
                   for w in nx.common_neighbors(G, u, v))
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #8
def common_neighbor(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        return len(cnbors)
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #9
def get_unknown_edges(_G):
    # all non-existent ("unknown") edges, materialized as a list
    return list(nx.non_edges(_G))
Example #10
def make_train_test_set(graph, radius,
                        test_proportion=.3, ratio_neg_to_pos=10):
    """make_train_test_set."""
    pos = [(u, v) for u, v in graph.edges()]
    neg = [(u, v) for u, v in nx.non_edges(graph)]
    random.shuffle(pos)
    random.shuffle(neg)
    pos_dim = len(pos)
    neg_dim = len(neg)
    max_n_neg = min(pos_dim * ratio_neg_to_pos, neg_dim)
    neg = neg[:max_n_neg]
    neg_dim = len(neg)
    tr_pos = pos[:-int(pos_dim * test_proportion)]
    te_pos = pos[-int(pos_dim * test_proportion):]
    tr_neg = neg[:-int(neg_dim * test_proportion)]
    te_neg = neg[-int(neg_dim * test_proportion):]

    # remove edges
    tr_graph = graph.copy()
    tr_graph.remove_edges_from(te_pos)
    tr_pos_graphs = list(_make_subgraph_set(tr_graph, radius, tr_pos))
    tr_neg_graphs = list(_make_subgraph_set(tr_graph, radius, tr_neg))
    te_pos_graphs = list(_make_subgraph_set(tr_graph, radius, te_pos))
    te_neg_graphs = list(_make_subgraph_set(tr_graph, radius, te_neg))

    tr_graphs = tr_pos_graphs + tr_neg_graphs
    te_graphs = te_pos_graphs + te_neg_graphs
    tr_targets = [1] * len(tr_pos_graphs) + [0] * len(tr_neg_graphs)

    te_targets = [1] * len(te_pos_graphs) + [0] * len(te_neg_graphs)
    tr_graphs, tr_targets = paired_shuffle(tr_graphs, tr_targets)
    te_graphs, te_targets = paired_shuffle(te_graphs, te_targets)

    return (tr_graphs, np.array(tr_targets)), (te_graphs, np.array(te_targets))
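paired_shuffle is not shown in this snippet; a plausible (hypothetical) sketch of it, shuffling two aligned lists with one shared permutation:

import random

def paired_shuffle(xs, ys):
    # keep graphs and their targets aligned while shuffling
    pairs = list(zip(xs, ys))
    random.shuffle(pairs)
    xs, ys = zip(*pairs) if pairs else ((), ())
    return list(xs), list(ys)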
Example #11
def common_neighbors(G, fn, t=0.5):
    G = G.to_undirected()
    if os.path.isfile(fn):
        H = G.copy()
        found = nx.read_edgelist(fn, nodetype=int, data=False)
        H.add_edges_from(found.edges())
        jacc_iter = nx.jaccard_coefficient(G, nx.non_edges(H))
        print("Appending to %s" % fn)
        outfile = open(fn, 'a', 1)
        i = found.number_of_nodes()
    else:
        jacc_iter = nx.jaccard_coefficient(G)
        outfile = open(fn, 'w', 1)
        i = 0
    outfile.write("#vertex u; vertex v; their jaccard coef\n")
    cur = -1
    print("Starting jacc loop %s with threshold %s" % (time.strftime("%H:%M:%S"), t))
    for pair in jacc_iter:
        if pair[2] >= t:
            outfile.write("%s %s %f\n" % (pair[0], pair[1], pair[2]))
            if pair[0] != cur:
                cur = pair[0]
                i += 1
                print("%s: %s" % (i, cur))
    outfile.close()
    print("Done writing %s" % fn)
Example #12
def jaccard_coefficient(G, ebunch=None):
    r"""Compute the Jaccard coefficient of all node pairs in ebunch.

    Jaccard coefficient of nodes `u` and `v` is defined as

    .. math::

        \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u) \cup \Gamma(v)|}

    where :math:`\Gamma(u)` denotes the set of neighbors of `u`.

    Parameters
    ----------
    G : graph
        A NetworkX undirected graph.

    ebunch : iterable of node pairs, optional (default = None)
        Jaccard coefficient will be computed for each pair of nodes
        given in the iterable. The pairs must be given as 2-tuples
        (u, v) where u and v are nodes in the graph. If ebunch is None
        then all non-existent edges in the graph will be used.
        Default value: None.

    Returns
    -------
    piter : iterator
        An iterator of 3-tuples in the form (u, v, p) where (u, v) is a
        pair of nodes and p is their Jaccard coefficient.

    Examples
    --------
    >>> import networkx as nx
    >>> G = nx.complete_graph(5)
    >>> preds = nx.jaccard_coefficient(G, [(0, 1), (2, 3)])
    >>> for u, v, p in preds:
    ...     '(%d, %d) -> %.8f' % (u, v, p)
    ...
    '(0, 1) -> 0.60000000'
    '(2, 3) -> 0.60000000'

    References
    ----------
    .. [1] D. Liben-Nowell, J. Kleinberg.
           The Link Prediction Problem for Social Networks (2004).
           http://www.cs.cornell.edu/home/kleinber/link-pred.pdf
    """
    if ebunch is None:
        ebunch = nx.non_edges(G)

    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        union_size = len(set(G[u]) | set(G[v]))
        if union_size == 0:
            return 0
        else:
            return len(cnbors) / union_size

    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #13
def graph_distance(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        if nx.has_path(G, u, v):
            s_path_length = nx.shortest_path_length(G, source=u, target=v)
            return (-1) * s_path_length
        else:
            return -100  # sentinel score for disconnected pairs
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #14
def resource_allocation_index(G, ebunch=None):
    r"""Compute the resource allocation index of all node pairs in ebunch.

    Resource allocation index of `u` and `v` is defined as

    .. math::

        \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{|\Gamma(w)|}

    where :math:`\Gamma(u)` denotes the set of neighbors of `u`.

    Parameters
    ----------
    G : graph
        A NetworkX undirected graph.

    ebunch : iterable of node pairs, optional (default = None)
        Resource allocation index will be computed for each pair of
        nodes given in the iterable. The pairs must be given as
        2-tuples (u, v) where u and v are nodes in the graph. If ebunch
        is None then all non-existent edges in the graph will be used.
        Default value: None.

    Returns
    -------
    piter : iterator
        An iterator of 3-tuples in the form (u, v, p) where (u, v) is a
        pair of nodes and p is their resource allocation index.

    Examples
    --------
    >>> import networkx as nx
    >>> G = nx.complete_graph(5)
    >>> preds = nx.resource_allocation_index(G, [(0, 1), (2, 3)])
    >>> for u, v, p in preds:
    ...     '(%d, %d) -> %.8f' % (u, v, p)
    ...
    '(0, 1) -> 0.75000000'
    '(2, 3) -> 0.75000000'

    References
    ----------
    .. [1] T. Zhou, L. Lu, Y.-C. Zhang.
       Predicting missing links via local information.
       Eur. Phys. J. B 71 (2009) 623.
       http://arxiv.org/pdf/0901.0553.pdf
    """
    if ebunch is None:
        ebunch = nx.non_edges(G)

    def predict(u, v):
        return sum(1 / G.degree(w) for w in nx.common_neighbors(G, u, v))

    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #15
def cosine_similarity(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        cosine_val = math.sqrt(G.degree(u) * G.degree(v))
        if cosine_val == 0:
            return 0
        else:
            return len(cnbors) / cosine_val
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #16
def adamic_adar_index(G, ebunch=None):
    r"""Compute the Adamic-Adar index of all node pairs in ebunch.

    Adamic-Adar index of `u` and `v` is defined as

    .. math::

        \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\log |\Gamma(w)|}

    where :math:`\Gamma(u)` denotes the set of neighbors of `u`.

    Parameters
    ----------
    G : graph
        NetworkX undirected graph.

    ebunch : iterable of node pairs, optional (default = None)
        Adamic-Adar index will be computed for each pair of nodes given
        in the iterable. The pairs must be given as 2-tuples (u, v)
        where u and v are nodes in the graph. If ebunch is None then all
        non-existent edges in the graph will be used.
        Default value: None.

    Returns
    -------
    piter : iterator
        An iterator of 3-tuples in the form (u, v, p) where (u, v) is a
        pair of nodes and p is their Adamic-Adar index.

    Examples
    --------
    >>> import networkx as nx
    >>> G = nx.complete_graph(5)
    >>> preds = nx.adamic_adar_index(G, [(0, 1), (2, 3)])
    >>> for u, v, p in preds:
    ...     '(%d, %d) -> %.8f' % (u, v, p)
    ...
    '(0, 1) -> 2.16404256'
    '(2, 3) -> 2.16404256'

    References
    ----------
    .. [1] D. Liben-Nowell, J. Kleinberg.
           The Link Prediction Problem for Social Networks (2004).
           http://www.cs.cornell.edu/home/kleinber/link-pred.pdf
    """
    if ebunch is None:
        ebunch = nx.non_edges(G)

    def predict(u, v):
        return sum(1 / math.log(G.degree(w))
                   for w in nx.common_neighbors(G, u, v))

    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #17
def lhn(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        mult_val = G.degree(u) * G.degree(v)
        if mult_val == 0:
            return 0
        else:
            return len(cnbors) / mult_val
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #18
def hdi(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        max_val = max(G.degree(u), G.degree(v))
        if max_val == 0:
            return 0
        else:
            return len(cnbors) / max_val
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #19
def sorensen(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors_len = len(list(nx.common_neighbors(G, u, v)))
        denomi = G.degree(u) + G.degree(v)
        if denomi == 0:
            return 0
        else:
            return (2*cnbors_len) / denomi
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #20
def jaccard_coefficient(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        union_size = len(set(G[u]) | set(G[v]))
        if union_size == 0:
            return 0
        else:
            return len(cnbors) / union_size
    return ((u, v, predict(u, v)) for u, v in ebunch)
Example #21
def resource_allocation_index(G, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    def predict(u, v):
        cnbors = list(nx.common_neighbors(G, u, v))
        sum_cn = 0
        for w in cnbors:
            if not G.degree(w) == 0:
                #print("debug")
                sum_cn += 1/math.fabs(G.degree(w))
        return sum_cn
    return ((u, v, predict(u, v)) for u, v in ebunch)
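The local indices above (lhn, hdi, sorensen, jaccard_coefficient, resource_allocation_index) all share the same (u, v, score) generator interface, so they can be compared side by side. A sketch, assuming they are defined in the current module:

import networkx as nx

G = nx.complete_graph(5)
G.remove_edge(0, 1)                      # (0, 1) is now the only non-edge
for name, index in [('lhn', lhn), ('hdi', hdi), ('sorensen', sorensen),
                    ('jaccard', jaccard_coefficient),
                    ('resource_allocation', resource_allocation_index)]:
    u, v, score = next(index(G))
    print('%s(%d, %d) = %.4f' % (name, u, v, score))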
Example #22
 def excluded(self):
     """Get set of links that should not be predicted"""
     exclude = self.config['exclude']
     if not exclude:
         return set()  # No nodes are excluded
     elif exclude == 'old':
         return set(self.training.edges())
     elif exclude == 'new':
         return set(nx.non_edges(self.training))
     raise LinkPredError("Value '{}' for exclude is unexpected. Use either "
                         "'old', 'new' or empty string '' (for no "
                         "exclusions)".format(exclude))
Example #23
def jaccard_mp_predictions(G):
    """
    Create a ranked list of possible new links based on the Jaccard similarity,
    defined as the intersection of nodes divided by the union of nodes
    
    parameters
    G: Directed or undirected nx graph
    returns
    list of linkbunches with the score as an attribute
    """
    pool = mp.Pool(processes=4)
    G_undirected = nx.Graph(G)
    results = pool.map(jaccard_prediction, nx.non_edges(G_undirected))
    return results
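The per-pair worker jaccard_prediction is not shown above. For pool.map it must be a module-level (picklable) function, so a plausible sketch — purely a guess at its shape — would read the graph from module scope:

def jaccard_prediction(non_edge):
    # hypothetical worker; assumes a module-level undirected graph G_undirected
    u = set(G_undirected.neighbors(non_edge[0]))
    v = set(G_undirected.neighbors(non_edge[1]))
    union = len(u | v)
    s = len(u & v) / union if union else 0.0
    return non_edge + ({'score': s},)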
Example #24
def preferential_attachment(G, ebunch=None):
    r"""Compute the preferential attachment score of all node pairs in ebunch.

    Preferential attachment score of `u` and `v` is defined as

    .. math::

        |\Gamma(u)| |\Gamma(v)|

    where :math:`\Gamma(u)` denotes the set of neighbors of `u`.

    Parameters
    ----------
    G : graph
        NetworkX undirected graph.

    ebunch : iterable of node pairs, optional (default = None)
        Preferential attachment score will be computed for each pair of
        nodes given in the iterable. The pairs must be given as
        2-tuples (u, v) where u and v are nodes in the graph. If ebunch
        is None then all non-existent edges in the graph will be used.
        Default value: None.

    Returns
    -------
    piter : iterator
        An iterator of 3-tuples in the form (u, v, p) where (u, v) is a
        pair of nodes and p is their preferential attachment score.

    Examples
    --------
    >>> import networkx as nx
    >>> G = nx.complete_graph(5)
    >>> preds = nx.preferential_attachment(G, [(0, 1), (2, 3)])
    >>> for u, v, p in preds:
    ...     '(%d, %d) -> %d' % (u, v, p)
    ...
    '(0, 1) -> 16'
    '(2, 3) -> 16'

    References
    ----------
    .. [1] D. Liben-Nowell, J. Kleinberg.
           The Link Prediction Problem for Social Networks (2004).
           http://www.cs.cornell.edu/home/kleinber/link-pred.pdf
    """
    if ebunch is None:
        ebunch = nx.non_edges(G)

    return ((u, v, G.degree(u) * G.degree(v)) for u, v in ebunch)
Example #25
def randomAnony(g, k, *li):
    """Randomly delete and add k edges in g"""
    import random

    outStr = ""
    if g.number_of_edges() >= k:
        delEdges = random.sample(list(g.edges()), k)
        outStr = "Randomly delete " + str(k) + " edges:" + "\n" + str(delEdges) + "\n"
        g.remove_edges_from(delEdges)
    noEdges = list(nx.non_edges(g))  # This is an inefficient method!
    if len(noEdges) > k:
        addEdges = random.sample(noEdges, k)
        g.add_edges_from(addEdges)
        outStr = outStr + "Randomly add " + str(k) + " edges:" + "\n" + str(addEdges) + "\n"
    if li:  # Display the deleted/added edges on TxtCtr
        sc = li[0]
        sc.SetValue(outStr)
Example #26
def Prediction_Experiment(G, Predictor, Probe_Set, Top_L, Deleted_Ratio):
    print "Prediction_Experiment!"
    #Get Evaluation Link Set--------
    #Top_L = (G.number_of_edges() - 0) / Top_k #The top proportion 1/Top_k of edges are considered
    #Probe_Set = Probe_Set_Correspond_Training(G, Top_L, fpname)  #****Get the probe set for evaluation*****
    #Get Ranking List with different deleted links ratio----------
    Edge_Num = float(G.number_of_edges())

    '''AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set)'''
    Unobserved_links = nx.non_edges(G)
    Non_existing_links = list(set(Unobserved_links).difference(set(Probe_Set)))
    AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set, Non_existing_links)

    Rank_List_Set = Prediction_LinkScores_Ratio(G, Predictor, Deleted_Ratio, 50, 30) #Prediction_LinkScores_Ratio(G, Predictor, Proportion, Toleration, Predict_Gap)
    #----Performance Evaluation with Precision under different Training Data Ratio----
    Precision_Set = []
    X_Set = []
    Coefficient_Set = []
    Avg_PathLen_Set = []
    for key in sorted(Rank_List_Set.keys()):
        Rank_List_Sorted = sorted(Rank_List_Set[key][0], key=lambda edge: edge[2], reverse=True)
        Top_L_Rank_List = Rank_List_Sorted[0:Top_L]
        Coefficient_Set.append(Rank_List_Set[key][1])
        Avg_PathLen_Set.append(Rank_List_Set[key][2])
        #AUC_Set.append(Rank_List_Set[key][3])
        #print key, Performance_Evaluation_Precision(Top_L_Rank_List, Probe_Set)
        X_Set.append(float(key)/Edge_Num)
        Precision_Set.append(Performance_Evaluation_Precision(Top_L_Rank_List, Probe_Set))
        '''
        #Draw Curve Graph
        if key%100 == 0:
            data = []
            for edge in Rank_List_Sorted:
                data.append(edge[2])
            matploit(data)
        '''
    #end for
    print "*Different deleted links ratio:", X_Set
    print "*Precision_Set with different deleted links ratio:", Precision_Set
    print "*Coefficient_Set:", Coefficient_Set
    print "*Avg_PathLen_Set:", Avg_PathLen_Set
    print "*AUC Value:", AUC


    return 1
Example #27
def add_random_edges(G, pct):
    """Add `n` random edges to G (`n` = fraction of current edge count)

    Parameters
    ----------
    G : a networkx.Graph
        the network

    pct : float
        A percentage (between 0 and 1)
    """
    assert_is_percentage(pct)
    m = G.size()
    to_add = int(m * pct)
    log.debug("Will add %d edges to %d (%f)", to_add, m, pct)

    new_edges = list(nx.non_edges(G))
    G.add_edges_from(random.sample(new_edges, to_add), weight=1)
Example #28
def _apply_prediction(G, func, ebunch=None):
    """Applies the given function to each edge in the specified iterable
    of edges.

    `G` is an instance of :class:`networkx.Graph`.

    `func` is a function on two inputs, each of which is a node in the
    graph. The function can return anything, but it should return a
    value representing a prediction of the likelihood of a "link"
    joining the two nodes.

    `ebunch` is an iterable of pairs of nodes. If not specified, all
    non-edges in the graph `G` will be used.

    """
    if ebunch is None:
        ebunch = nx.non_edges(G)
    return ((u, v, func(u, v)) for u, v in ebunch)
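_apply_prediction is the common core of these link-prediction helpers; defining a new index is just a matter of supplying a scoring function. A sketch of a custom total-degree index built on it:

import networkx as nx

def total_degree_index(G, ebunch=None):
    # score a candidate pair by the sum of its endpoints' degrees
    return _apply_prediction(G, lambda u, v: G.degree(u) + G.degree(v), ebunch)

G = nx.path_graph(4)
print(list(total_degree_index(G)))   # one (u, v, score) triple per non-edge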
Example #29
def Drift_Prediction_Experiment(G, Predictor, Probe_Set, Top_L, Deleted_Ratio):
    print "Drift_Prediction_Experiment!"
    Edge_Num = float(G.number_of_edges())

    #AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set)
    Unobserved_links = nx.non_edges(G)
    #Unobserved_links = list(Unobserved_links)
    #print Unobserved_links
    #print Probe_Set


    Non_existing_links = list(set(Unobserved_links).difference(set(Probe_Set)))
    AUC = Performance_Evaluation_AUC(Predictor, G, Probe_Set, Non_existing_links)

    #***Prediction with different training set proportion***
    t1 = time.time()
    Rank_List_Set = Prediction_LinkScores_Ratio(G, Predictor, Deleted_Ratio, 50, 30) #Prediction_LinkScores_Ratio(G, Predictor, Proportion, Toleration, Predict_Gap)
    t2 = time.time()
    print "Prediction index time",t2-t1

    #----Performance Evaluation with Precision under different Training Data Ratio----
    Precision_Set = []
    X_Set = []
    Coefficient_Set = []
    Avg_PathLen_Set = []
    for key in sorted(Rank_List_Set.keys()):
        Rank_List_Sorted = sorted(Rank_List_Set[key][0], key=lambda edge: edge[2], reverse=True)
        Top_L_Rank_List = Rank_List_Sorted[0:Top_L]
        Coefficient_Set.append(Rank_List_Set[key][1])
        Avg_PathLen_Set.append(Rank_List_Set[key][2])
        X_Set.append(float(key)/Edge_Num)
        Precision_Set.append(Performance_Evaluation_Precision(Top_L_Rank_List, Probe_Set))
    #end for
    print "*Drift_Different deleted links ratio:", X_Set
    print "*Drift_Precision_Set with different deleted links ratio:", Precision_Set
    print "*Drift_Coefficient_Set:", Coefficient_Set
    print "*Drift_Avg_PathLen_Set:", Avg_PathLen_Set
    print "*Drift_AUC Value:", AUC

    return 1
Example #30
def jaccard_predictions(G):
    """
    Create a ranked list of possible new links based on the Jaccard similarity,
    defined as the size of the intersection of the two nodes' neighbor sets
    divided by the size of their union.

    Parameters
    G: directed or undirected nx graph
    Returns
    list of link bunches with the score as an attribute
    """
    potential_edges = []
    for non_edge in nx.non_edges(G):
        u = set(G.neighbors(non_edge[0]))
        v = set(G.neighbors(non_edge[1]))
        if len(u.union(v)) == 0:
            s = 0.0
        else:
            s = (1.0*len(u.intersection(v)))/len(u.union(v))
        non_edge = non_edge + ({'score': s},)
        potential_edges.append(non_edge)
        
    return potential_edges
Example #31
    def prediction(self):

        highest_betweenness = dict()

        if not self.betweeness_value:
            for community in self.communities:
                print("Getting betweenness for community {}".format(community))
                subgraph = nx.subgraph(self.graph, self.communities[community])
                highest_betweenness[
                    community] = self._LinkWithBetweenness__get_betweenness(
                        subgraph)

            print("Betweenness done")
            write_dict_to_json(highest_betweenness, self.filename,
                               "../betweenness/")
            print("Betweenness values written")
        else:
            print("Betweenness provided")
            highest_betweenness = self.betweeness_value

        highest_betweenness_left = highest_betweenness["0"]
        highest_betweenness_right = highest_betweenness["1"]
        non_connected_nodes = list(nx.non_edges(self.graph))
        n_possible_new_connections = len(non_connected_nodes)
        non_connected_nodes = list(
            filter(
                lambda x: (x[0] in highest_betweenness_right and x[1] in
                           highest_betweenness_left) or
                (x[0] in highest_betweenness_left and x[1] in
                 highest_betweenness_right), non_connected_nodes))
        ranked_betweenness_nodes = self._LinkWithBetweenness__get_highest_betweenness(
            non_connected_nodes, highest_betweenness_left,
            highest_betweenness_right)

        if not self.values:
            algorithm = None
            print("Combining betweenness with {}".format(
                self.algorithm.lower()))
            if self.algorithm.upper() == TypeOfAlgorithm.ADAMIC_ADAR.value:
                algorithm = nx.adamic_adar_index
            elif self.algorithm == TypeOfAlgorithm.JACCARD_COEFFICIENT.value:
                algorithm = nx.jaccard_coefficient
            elif self.algorithm == TypeOfAlgorithm.RESOURCE_ALLOCATION.value:
                algorithm = nx.resource_allocation_index
            elif self.algorithm == TypeOfAlgorithm.PREFERENTIAL_ATTACHMENT.value:
                algorithm = nx.preferential_attachment

            ranked_similarity_nodes = list(
                sorted(algorithm(self.graph, non_connected_nodes),
                       key=lambda element: element[2],
                       reverse=True))

            write_dict_to_json({"values": ranked_similarity_nodes},
                               self.filename, f"../{self.algorithm.lower()}/")
            print(f"{self.algorithm} values written")
        else:
            print(f"{self.algorithm.lower()} provided")
            ranked_similarity_nodes = list(self.values.values())[0]
            ranked_similarity_nodes = list(map(tuple, ranked_similarity_nodes))

        scores = self.__combine_scores(ranked_betweenness_nodes,
                                       ranked_similarity_nodes)
        scores = {
            k: v
            for k, v in sorted(
                scores.items(), key=lambda item: item[1], reverse=True)
        }

        max_links = len(scores)
        number_edges = round(self.k * self.n_edges)  # k is a fraction of the edge count
        all_possible_new_edges = scores.keys()
        if number_edges < max_links:
            edges_to_add = islice(all_possible_new_edges, number_edges)
        else:
            edges_to_add = all_possible_new_edges

        self.percentage_edges_added = self.k
        print("% of edges added: {}".format(self.percentage_edges_added))
        print("Adding {} edges".format(number_edges))

        for edge in edges_to_add:
            self.link_nodes(edge[0], edge[1])
Example #32
def MI3(G):
    #G = nx.read_edgelist(graph_file)
    #G = nx.read_edgelist(graph_file, nodetype=int)  # cast node type to int so node labels match indices
    node_num = nx.number_of_nodes(G)
    edge_num = nx.number_of_edges(G)

    nodes = nx.nodes(G)

    beta = -math.log2(0.0001)

    # First compute $P(L^1_{xy})$; probabilities are needed only per distinct degree pair, not per node pair
    nodes_Degree_dict = {}
    degree_list = []

    for v in nodes:
        nodes_Degree_dict[v] = nx.degree(G, v)
        degree_list.append(nx.degree(G, v))

    #degree_list = [nx.degree(G, v) for v in range(node_num)]  # indices correspond one-to-one to nodes

    distinct_degree_list = list(set(degree_list))
    size = len(distinct_degree_list)

    self_Connect_dict = {}

    for x in range(size):
        k_x = distinct_degree_list[x]
        for y in range(x, size):
            k_y = distinct_degree_list[y]

            p0 = 1
            (k_n, k_m) = pair(k_x, k_y)
            a = edge_num + 1
            b = edge_num - k_m + 1
            for i in range(1, k_n + 1):
                p0 *= (b - i) / (a - i)
            # end for
            if p0 == 1:
                self_Connect_dict[(k_n, k_m)] = beta
                self_Connect_dict[(k_m, k_n)] = beta
            else:
                self_Connect_dict[(k_n, k_m)] = -math.log2(1 - p0)
                self_Connect_dict[(k_m, k_n)] = -math.log2(1 - p0)

    # mutual information that a link exists between two nodes having z as a common neighbor
    #mutual_info_list = [0 for z in range(node_num)]

    self_Conditional_dict = {}
    for z in nodes:
        k_z = nodes_Degree_dict[z]
        if k_z > 1:
            alpha = 2 / (k_z * (k_z - 1))
            cc_z = nx.clustering(G, z)
            if cc_z == 0:
                log_c = beta
            else:
                log_c = -math.log2(cc_z)
            # end if
            s = 0
            neighbor_list = list(nx.neighbors(G, z))
            size = len(neighbor_list)
            for i in range(size):
                m = neighbor_list[i]
                for j in range(i + 1, size):
                    n = neighbor_list[j]
                    if i != j:
                        s += (self_Connect_dict[(nodes_Degree_dict[m],
                                                 nodes_Degree_dict[n])] -
                              log_c)
            self_Conditional_dict[z] = alpha * s

    sim_dict = {}  # dictionary holding the similarity scores
    ebunch = nx.non_edges(G)

    i = 0
    for x, y in ebunch:
        s = 0
        #(k_x, k_y) = pair(degree_list[x], degree_list[y])
        for z in nx.common_neighbors(G, x, y):
            s += self_Conditional_dict[z]
        sim_dict[(x, y)] = s - self_Connect_dict[(nodes_Degree_dict[x],
                                                  nodes_Degree_dict[y])]
        #sim_dict[(y, x)] = s - self_Connect_dict[(degree_list[x], degree_list[y])]
        # end if
    # end for
    print(sim_dict)
    return sim_dict
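The pair helper used by MI3 is not included in the snippet; a usage sketch with a hypothetical stand-in that orders a degree pair:

import math
import networkx as nx

def pair(a, b):
    # hypothetical stand-in: MI3 stores both orderings, so (min, max) suffices
    return (min(a, b), max(a, b))

sim = MI3(nx.karate_club_graph())   # prints and returns the similarity dict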
Example #33
def new_connections_predictions():

    # Your Code Here
    #edges metrics: common neighbors,

    #get common neighbors
    n_common_neighbors = [((e[0], e[1]),
                           len(sorted(nx.common_neighbors(G, e[0], e[1]))))
                          for e in nx.non_edges(G)]

    #Jaccard coefficient
    jaccard_coe = [((e[0], e[1]), e[2]) for e in nx.jaccard_coefficient(G)]

    #resource allocation
    resource_allocation = [((e[0], e[1]), e[2])
                           for e in nx.resource_allocation_index(G)]

    #adamic_adar index

    adami_adar = [((e[0], e[1]), e[2]) for e in nx.adamic_adar_index(G)]

    #preferential attachment
    pref_attachement = [((e[0], e[1]), e[2])
                        for e in nx.preferential_attachment(G)]

    def convert_score_to_series(tupples):
        index = [edge[0] for edge in tupples]
        scores = [edge[1] for edge in tupples]
        scores = pd.Series(scores, index=index)
        return scores

    n_common_neighbors = convert_score_to_series(n_common_neighbors)
    jaccard_coe = convert_score_to_series(jaccard_coe)
    resource_allocation = convert_score_to_series(resource_allocation)
    adami_adar = convert_score_to_series(adami_adar)
    pref_attachement = convert_score_to_series(pref_attachement)

    non_edges_df = pd.concat([
        n_common_neighbors, jaccard_coe, resource_allocation, adami_adar,
        pref_attachement
    ],
                             axis=1)
    non_edges_df.columns = [
        'n_common_neighbors', 'jaccard_coe', 'resource_allocation',
        'adami_adar', 'pref_attachement'
    ]
    non_edges_df = non_edges_df.join(future_connections, how='outer')

    validation = non_edges_df[non_edges_df['Future Connection'].isnull()]
    training = non_edges_df[non_edges_df['Future Connection'].notnull()]

    y = training['Future Connection']
    x = training.drop(['Future Connection'], axis=1)
    validation = validation.drop(['Future Connection'], axis=1)

    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import auc

    gbc = GradientBoostingClassifier()
    lr = LogisticRegression()

    # parameters = {'n_estimators' : [100, 200, 300],
    #               'max_depth' : [3,5,10],
    #               'random_state' : [42]
    #              }

    parameters = {'penalty': ['l1', 'l2'], 'C': [1, 2], 'random_state': [42]}

    gs = GridSearchCV(lr, parameters, scoring='roc_auc', cv=10)
    gs.fit(x, y)

    prediction = gs.predict_proba(validation)[:, 1]
    prediction = pd.Series(prediction, index=validation.index)
    return prediction  # Your Answer Here
Example #34
def _bonds_from_names(graph, resname, nodes, force_field):
    """Add edges between `nodes` in `graph` based on atom names.

    Adds edges to `graph`, assuming the nodes in `nodes` constitute a residue
    with residue name `resname`, which can be found among the `force_field`
    blocks. Edges will be added as they are in the reference Block. In addition,
    all non-edges in the Block will be generated and returned.

    Parameters
    ----------
    graph: networkx.Graph
    resname: str
    nodes: collections.abc.Iterable[collections.abc.Hashable]
        Should be node keys in `graph`
    force_field: vermouth.forcefield.ForceField
        Force field in which to look for the block with name `resname`

    Raises
    ------
    KeyError
        If `resname` is not one of the blocks known to `force_field`; or when
        a residue contains duplicate atom names.

    Returns
    -------
    Set[Frozenset[collections.abc.Hashable, collections.abc.Hashable]]
        All non-edges found in the block, with node keys from `graph`.
    """
    block = force_field.blocks.get(resname)
    if not block:
        raise KeyError("Residue {} is not known to force field {}"
                       "".format(resname, force_field.name))

    mol_name_to_idx = defaultdict(set)
    for graph_idx in nodes:
        if 'atomname' in graph.nodes[graph_idx]:
            mol_name_to_idx[graph.nodes[graph_idx]['atomname']].add(graph_idx)
    mol_name_to_idx = dict(mol_name_to_idx)
    for name, graph_idxs in mol_name_to_idx.items():
        if len(graph_idxs) > 1:
            raise KeyError("Residue has multiple atoms with atom name {}"
                           "".format(name))
        mol_name_to_idx[name] = mol_name_to_idx[name].pop()

    for block_idx, block_jdx in block.edges:
        block_idx_name = block.nodes[block_idx]['atomname']
        block_jdx_name = block.nodes[block_jdx]['atomname']
        if block_idx_name in mol_name_to_idx and block_jdx_name in mol_name_to_idx:
            graph_idx = mol_name_to_idx[block_idx_name]
            graph_jdx = mol_name_to_idx[block_jdx_name]
            pos1 = np.array(graph.nodes[graph_idx].get('position', np.full(3, np.nan)))
            pos2 = np.array(graph.nodes[graph_jdx].get('position', np.full(3, np.nan)))
            dist = np.sqrt(np.sum((pos1 - pos2)**2))
            graph.add_edge(graph_idx, graph_jdx, distance=dist)

    non_edges = set()
    for block_idx, block_jdx in nx.non_edges(block):
        block_idx_name = block.nodes[block_idx]['atomname']
        block_jdx_name = block.nodes[block_jdx]['atomname']
        if block_idx_name in mol_name_to_idx and block_jdx_name in mol_name_to_idx:
            non_edges.add(frozenset((mol_name_to_idx[block_idx_name],
                                     mol_name_to_idx[block_jdx_name])))
    return non_edges
Example #35
def devide(category, dataname, ratio):
    print(dataname)
    G = nx.read_weighted_edgelist('./data/' + category + '/' + dataname +
                                  '.txt',
                                  nodetype=int)
    nonit = nx.non_edges(G)
    n = nx.number_of_nodes(G)
    n = n * (n - 1) // 2  # integer pair count
    nonedge = n - nx.number_of_edges(G)
    e = nx.number_of_edges(G)
    e = int(ratio * e)
    count = 0
    nonedgechoose = []
    while (True):
        tmp = np.random.randint(0, nonedge + 1)  # random_integers is removed; randint's upper bound is exclusive
        if tmp not in nonedgechoose:
            nonedgechoose.append(tmp)
            count = count + 1
        if count >= e:
            break
    nonedgechoose.sort()
    count = 0
    G_neg = nx.Graph()
    for i in nonedgechoose:
        while count < i:
            next(nonit)
            count = count + 1
        G_neg.add_edge(*next(nonit))
        count = count + 1
    it = nx.edges(G)
    n = nx.number_of_edges(G)
    n = int(n * ratio)
    count = 0
    edgechoose = []
    while (True):
        tmp = np.random.randint(0, nx.number_of_edges(G) + 1)
        if tmp not in edgechoose:
            edgechoose.append(tmp)
            count = count + 1
        if count >= n:
            break
    edgechoose.sort()
    G_train = nx.Graph()
    G_pos = nx.Graph()
    count = 0
    index = 0
    print(len(edgechoose))
    for edge in it.data(False):
        if index >= len(edgechoose):
            G_train.add_edge(*edge)
            continue
        if count != edgechoose[index]:
            G_train.add_edge(*edge)
            count = count + 1
            continue
        G_pos.add_edge(*edge)
        count = count + 1
        index = index + 1
    G_train = nx.DiGraph(G_train)
    nx.write_edgelist(G_train,
                      'dividedata/' + category + '/' + dataname + '.txt',
                      data=False)
    nx.write_edgelist(G_pos,
                      'dividedata/' + category + '/' + dataname + '_pos.txt',
                      data=False)
    nx.write_edgelist(G_neg,
                      'dividedata/' + category + '/' + dataname + '_neg.txt',
                      data=False)
    print('end')
Example #36
#Graph edges in list form
#Medges = [i for i in M.edges()]

#Layout
#pos=nx.fruchterman_reingold_layout(M, dim=2)
N = len(M.nodes())
labels = [i[1]['name'] for i in M.nodes(data=True)]

# ###################### Evolution #########################

import operator

# Common Neighbors
CN = [(e[0], e[1], len(list(nx.common_neighbors(M, e[0], e[1]))))
      for e in nx.non_edges(M)]
CN.sort(key=operator.itemgetter(2), reverse=True)

# Jaccard coef
jaccard = list(nx.jaccard_coefficient(M))
jaccard.sort(key=operator.itemgetter(2), reverse=True)

# Resource Allocation index
RA = list(nx.resource_allocation_index(M))
RA.sort(key=operator.itemgetter(2), reverse=True)

# Adamic-Adar index
AA = list(nx.adamic_adar_index(M))
AA.sort(key=operator.itemgetter(2), reverse=True)

# Preferential Attachment
Example #37
def main():
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    #specify initial arguments for all functions
    manager = multiprocessing.Manager()
    network = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected()
    nonexist_edges = list(nx.non_edges(network))  # non-existent edges in graph
    iterations = 100
    steps = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95]
    file = 'Results/Trametinib/NETS_Tram_'

    pool = multiprocessing.Pool(processes=4)  # set up pool

    #Degree Product
    func = partial(DPFracAUC, network, nonexist_edges, iterations)
    DPres = pool.map(func, steps)
    print('Finished running Degree Product')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'DP.json', 'w') as fout:
        json.dump(DPres, fout)

    #Shortest Path
    func2 = partial(SPFracAUC, network, nonexist_edges, iterations)
    SPres = pool.map(func2, steps)
    print('Finished running Shortest Path')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'SP.json', 'w') as fout:
        json.dump(SPres, fout)

    #Common Neighbors
    func3 = partial(CNFracAUC, network, nonexist_edges, iterations)
    CNres = pool.map(func3, steps)
    print('Finished running Common Neighbors')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'CN.json', 'w') as fout:
        json.dump(CNres, fout)

    #Jaccard
    func4 = partial(JFracAUC, network, nonexist_edges, iterations)
    Jres = pool.map(func4, steps)
    print('Finished running Jaccard Index')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'J.json', 'w') as fout:
        json.dump(Jres, fout)

    #Sorensen Similarity
    func5 = partial(SSFracAUC, network, nonexist_edges, iterations)
    SSres = pool.map(func5, steps)
    print('Finished running Sorensen Similarity')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'SS.json', 'w') as fout:
        json.dump(SSres, fout)

    #Leicht-Holme-Newman
    func6 = partial(LHNFracAUC, network, nonexist_edges, iterations)
    LHNres = pool.map(func6, steps)
    print('Finished running Leicht-Holme-Newman')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'LHN.json', 'w') as fout:
        json.dump(LHNres, fout)

    #Adamic-Adar
    func7 = partial(AAFracAUC, network, nonexist_edges, iterations)
    AAres = pool.map(func7, steps)
    print('Finished running Adamic-Adar')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'AA.json', 'w') as fout:
        json.dump(AAres, fout)

    #Resource Allocation
    func8 = partial(RAFracAUC, network, nonexist_edges, iterations)
    RAres = pool.map(func8, steps)
    print('Finished running Resource Allocation')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'RA.json', 'w') as fout:
        json.dump(RAres, fout)

    #Katz
    func9 = partial(KFracAUC, network, nonexist_edges, iterations)
    Kres = pool.map(func9, steps)
    print('Finished running Katz')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    #write dictionary to json file
    with open(str(file) + 'K.json', 'w') as fout:
        json.dump(Kres, fout)

    # #Simrank
    # func10 = partial(SFracAUC, network, nonexist_edges, iterations)
    # Sres = pool.map(func10, steps)
    # print 'Finished running SimRank'
    # # write dictionary to json file
    # with open(str(file) + 'SR.json', 'w') as fout:
    #     json.dump(Sres, fout)

    # Rooted Page Rank
    func11 = partial(PRFracAUC, network, nonexist_edges, iterations)
    RPRres = pool.map(func11, steps)
    print('Finished running Rooted Page Rank')
    print('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # write dictionary to json file
    with open(str(file) + 'RPR.json', 'w') as fout:
        json.dump(RPRres, fout)


    pool.close()
    pool.join()

    print('Finished running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
Example #38
def preferential_attachment_score(graph):
    non_edges = nx.non_edges(graph)
    return ((u, v, graph.degree(u) * graph.degree(v)) for u, v in non_edges)
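A usage sketch for preferential_attachment_score (assumes networkx as nx):

import networkx as nx

graph = nx.star_graph(3)                 # hub 0 with leaves 1, 2, 3
ranked = sorted(preferential_attachment_score(graph),
                key=lambda t: t[2], reverse=True)
print(ranked)                            # each leaf-leaf pair scores 1 * 1 = 1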
Example #39
def star_with_extra_edges(N, M):
    g = nx.star_graph(N)
    g.add_edges_from(random.sample(list(nx.non_edges(g)), M - len(g.edges())))
    return g
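A usage sketch for star_with_extra_edges (assumes random and networkx as nx are imported):

import random
import networkx as nx

g = star_with_extra_edges(9, 15)   # star_graph(9): 10 nodes, 9 edges; densified to 15
assert g.number_of_edges() == 15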
Example #40
data1 = data[data[:][6] > 0]
data2 = data1.iloc[:, [0, 1]]
data3 = data2.drop_duplicates()
# data3.to_csv('./edges.csv',index=False)
# In[4]:

G = nx.read_edgelist('./edges.csv', delimiter=',', create_using=nx.Graph())
# nodes=pd.DataFrame(list(G.nodes()))
# # nodes.to_csv('./all_nodes.csv',index=False)
G08 = nx.read_edgelist('./edges08.csv', delimiter=',', create_using=nx.Graph())
G09 = nx.read_edgelist('./edges09.csv', delimiter=',', create_using=nx.Graph())
G08.add_nodes_from(G.nodes(data=True))
G09.add_nodes_from(G.nodes(data=True))
edges08 = pd.DataFrame(list(G08.edges()))
edges09 = pd.DataFrame(list(G09.edges()))
non_edges08 = pd.DataFrame(list(nx.non_edges(G08)))
non_edges09 = pd.DataFrame(list(nx.non_edges(G09)))
edges08.columns = ['Departure', 'locationID']
edges09.columns = ['Departure', 'locationID']
non_edges08.columns = ['Departure', 'locationID']
non_edges09.columns = ['Departure', 'locationID']
edges08['label'] = 1
edges09['label'] = 1
non_edges08['label'] = 0
non_edges09['label'] = 0
train08 = pd.concat([edges08, non_edges08])
test09 = pd.concat([edges09, non_edges09])

# In[5]:

train_data = np.array(train08)
Example #41
    def make_pairs_with_edges(self, label_graph, target_positive_ratio=.5, enforce_non_edge=True, enforce_has_embeddings=False):
        """
        Generate a dataframe with a fixed ratio of positives to negatives by requiring all new edges in
        label_graph to appear in the dataframe.

        :param label_graph: The graph to check for new edges
        :param target_positive_ratio: Ratio of positive to negative (default=.5)
        :return: A list of tuples containing target_positive_ratio edges to non-edges
        """

        pairs = []
        pairs_dict = defaultdict(bool)
        edges = 0

        if target_positive_ratio == 0:
            # We want all the pairs from label_graph
            # todo: do we need pairs_dict for this part
            for u, v in label_graph.nx_graph.edges():
                if enforce_has_embeddings:
                    if u not in self.embeddings or v not in self.embeddings:
                        continue
                edges += 1
                pairs.append((u, v))
            for u, v in nx.non_edges(label_graph.nx_graph):
                if enforce_has_embeddings:
                    if u not in self.embeddings or v not in self.embeddings:
                        continue
                pairs.append((u, v))
            print("\t%d edges out of %d pairs" % (edges, len(pairs)))
            return pairs

        for u, v in label_graph.nx_graph.edges():
            if enforce_has_embeddings and not self.embeddings:
                print("No embeddings found! Error!")
                return
            if enforce_has_embeddings:
                if u not in self.embeddings or v not in self.embeddings:
                    continue
            if (enforce_non_edge and not self.nx_graph.has_edge(u, v)) or not enforce_non_edge:
                u, v = sorted((u, v))
                if not pairs_dict[(u, v)]:
                    pairs_dict[(u, v)] = True
                    pairs.append((u, v))
                    edges += 1

        nodes = list(self.embeddings.keys())  # materialize so nodes can be indexed
        added = 0
        rejected = 0
        while float(edges) / len(pairs) > target_positive_ratio:
            u = nodes[int(random.random() * len(nodes))]
            v = nodes[int(random.random() * len(nodes))]
            if label_graph.nx_graph.has_edge(u, v) or u == v:
                rejected += 1
                continue
            if enforce_has_embeddings:
                if u not in self.embeddings or v not in self.embeddings:
                    rejected += 1
                    continue
            (u, v) = sorted((u, v))
            if not pairs_dict[(u, v)]:
                pairs_dict[(u, v)] = True
                pairs.append((u, v))
                added += 1
        return pairs
Example #42
def LP(graph_file, out_file, sim_method, t, p):

    G = nx.read_edgelist(graph_file, nodetype=int)
    #G = G.to_undirected()
    #G = nx.convert_node_labels_to_integers(G)

    # for debug
    # print(nx.nodes(G))

    node_num = nx.number_of_nodes(G)
    edge_num = nx.number_of_edges(G)

    # enumerate all non-existent links and store them in non_edge_list
    # non_edge_num = (node_num * (node_num - 1)) / 2 - edge_num
    non_edge_list = [pair(u, v) for u, v in nx.non_edges(G)]
    non_edge_num = len(non_edge_list)

    # for debug
    print("V: %d\tE: %d\tNon: %d" % (node_num, edge_num, non_edge_num))

    # for debug
    #    print(len(non_edge_list))
    # print(non_edge_list)

    # run t independent trials; each time p*100% of G's links form the test set and the rest form the training set
    test_num = int(edge_num * p)
    pre_num = 0

    for l in range(2, 101, 2):
        if l < 20:
            pre_num += 1
        else:
            break
        # end if
    # end for
    pre_num += 1

    # for debug
    print('test_edge_num: %d' % test_num)

    # lists holding the performance metrics
    auc_list = []
    rs_list = []
    time_list = []
    pre_matrix = [[0 for it in range(t)] for num in range(pre_num)]

    # run the t test iterations
    for it in range(t):
        if it % 10 == 0:
            print('turn: %d' % it)
        # end if

        # first generate a batch of random indices
        seed = math.sqrt(edge_num * node_num) + math.pow(
            (1 + it) * 10, 3)  # random seed
        random.seed(seed)
        rand_set = set(random.sample(range(edge_num), test_num))

        # rand_set = set()
        # i = 0
        # while (i < test_num):
        # 	r = random.randint(0, edge_num - 1)
        # 	if (r not in rand_set):
        # 		rand_set.add(r)
        # 		i += 1
        # 	# end if
        # # end while

        # for debug
        # print(rand_set)
        # print(len(rand_set))

        # split the links of G into training and test sets according to rand_set
        training_graph = nx.Graph()
        training_graph.add_nodes_from(range(node_num))
        test_edge_list = []

        r = 0
        for u, v in G.edges():
            u, v = pair(u, v)
            # for debug
            # print(u, v)
            if r in rand_set:  # test link
                test_edge_list.append((u, v))
            else:
                training_graph.add_edge(u, v)  # training network
            # end if
            r += 1
        # end for
        training_graph = training_graph.to_undirected()

        # for debug
        # print(len(test_edge_list))
        # print(test_edge_list)
        # print(nx.number_of_edges(training_graph))
        # print(nx.number_of_nodes(training_graph))
        # print(nx.nodes(training_graph))
        # print(nx.edges(training_graph))

        # compute similarities
        # if (it % 10 == 0):
        # print('computing similarities')

        start = datetime.datetime.now()
        sim_dict = similarities(training_graph, sim_method)
        end = datetime.datetime.now()

        # 0. running time
        time_list.append((end - start).microseconds)

        # 1. compute AUC
        auc_value = AUC(sim_dict, test_edge_list, non_edge_list)
        auc_list.append(auc_value)
        # for debug
        # print(auc_value)

        # build a list of ((u, v), similarity) pairs
        sim_list = [((u, v), s) for (u, v), s in sim_dict.items()]

        # sim_dict is no longer needed
        sim_dict.clear()

        # sort sim_list by similarity in descending order
        sim_list.sort(key=lambda x: (x[1], x[0]), reverse=True)

        # 2. compute the ranking score
        rank_score = Ranking_score(sim_list, test_edge_list, non_edge_num)
        rs_list.append(rank_score)
        # for debug
        # print(rank_score)

        # 3. compute the precision list
        pre_list = Precision(sim_list, test_edge_list, test_num)

        for num in range(pre_num):
            pre_matrix[num][it] = pre_list[num]
        # end for
    # end for

    # compute the means and standard deviations and write the results to the output file
    auc_avg, auc_std = stats(auc_list)

    print('AUC: %.4f(%.4f)' % (auc_avg, auc_std))
    out_file.write('%.4f(%.4f)\t' % (auc_avg, auc_std))

    rs_avg, rs_std = stats(rs_list)

    print('Ranking_Score: %.4f(%.4f)' % (rs_avg, rs_std))
    out_file.write('%.4f(%.4f)\t' % (rs_avg, rs_std))

    time_avg, time_std = stats(time_list)

    print('Time: %.4f(%.4f)' % (time_avg, time_std))
    out_file.write('%.4f(%.4f)\t' % (time_avg, time_std))

    pre_avg_list = []
    pre_std_list = []
    for num in range(pre_num):
        pre_avg, pre_std = stats(pre_matrix[num])
        pre_avg_list.append(pre_avg)
        pre_std_list.append(pre_std)
    # end for

    print('Precision: ')
    # out_file.write('\nPrecision: ')
    for num in range(pre_num):
        print('%.4f(%.4f)\t' % (pre_avg_list[num], pre_std_list[num]))
        out_file.write('%.4f(%.4f)\t' % (pre_avg_list[num], pre_std_list[num]))
    # end for

    out_file.write('%d\n' % test_num)
Example #43
        def sample_subgraph(graph,
                            offset=0,
                            use_precomp_sizes=False,
                            filter_negs=False,
                            supersample_small_graphs=False,
                            neg_target=None,
                            hard_neg_idxs=None):
            if neg_target is not None: graph_idx = graph.G.graph["idx"]
            use_hard_neg = (hard_neg_idxs is not None
                            and graph.G.graph["idx"] in hard_neg_idxs)
            done = False
            n_tries = 0
            while not done:
                if use_precomp_sizes:
                    size = graph.G.graph["subgraph_size"]
                else:
                    if train and supersample_small_graphs:
                        sizes = np.arange(self.min_size + offset,
                                          len(graph.G) + offset)
                        ps = (sizes - self.min_size + 2)**(-1.1)
                        ps /= ps.sum()
                        size = stats.rv_discrete(values=(sizes, ps)).rvs()
                    else:
                        d = 1 if train else 0
                        size = random.randint(self.min_size + offset - d,
                                              len(graph.G) - 1 + offset)
                start_node = random.choice(list(graph.G.nodes))
                neigh = [start_node]
                frontier = list(
                    set(graph.G.neighbors(start_node)) - set(neigh))
                visited = set([start_node])
                while len(neigh) < size:
                    new_node = random.choice(list(frontier))
                    assert new_node not in neigh
                    neigh.append(new_node)
                    visited.add(new_node)
                    frontier += list(graph.G.neighbors(new_node))
                    frontier = [x for x in frontier if x not in visited]
                if self.node_anchored:
                    anchor = neigh[0]
                    for v in graph.G.nodes:
                        graph.G.nodes[v]["node_feature"] = (
                            torch.ones(1) if anchor == v else torch.zeros(1))
                        #print(v, graph.G.nodes[v]["node_feature"])
                neigh = graph.G.subgraph(neigh)
                if use_hard_neg and train:
                    neigh = neigh.copy()
                    if random.random(
                    ) < 1.0 or not self.node_anchored:  # add edges
                        non_edges = list(nx.non_edges(neigh))
                        if len(non_edges) > 0:
                            for u, v in random.sample(
                                    non_edges,
                                    random.randint(1, min(len(non_edges), 5))):
                                neigh.add_edge(u, v)
                    else:  # perturb anchor
                        anchor = random.choice(list(neigh.nodes))
                        for v in neigh.nodes:
                            neigh.nodes[v]["node_feature"] = (torch.ones(1) if
                                                              anchor == v else
                                                              torch.zeros(1))

                if (filter_negs and train and len(neigh) <= 6
                        and neg_target is not None):
                    matcher = nx.algorithms.isomorphism.GraphMatcher(
                        neg_target[graph_idx], neigh)
                    if not matcher.subgraph_is_isomorphic(): done = True
                else:
                    done = True

            return graph, DSGraph(neigh)
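# A standalone sketch of the hard-negative corruption used above: a sampled
# subgraph becomes a near-miss negative by adding a few edges drawn from
# nx.non_edges (the toy graph and imports are illustrative):
import random
import networkx as nx

neigh = nx.path_graph(5)                    # stand-in for a sampled subgraph
non_edges = list(nx.non_edges(neigh))
if non_edges:
    k = random.randint(1, min(len(non_edges), 5))
    neigh.add_edges_from(random.sample(non_edges, k))  # corrupt the pattern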
Example #44
0
    def generate_pos_neg_links(self):

        # Select n edges at random (positive samples)
        n_edges = self.G.number_of_edges()
        n_nodes = self.G.number_of_nodes()
        npos = int(self.prop_pos * n_edges)
        nneg = int(self.prop_neg * n_edges)

        n_neighbors = [len(list(self.G.neighbors(v))) for v in self.G.nodes()]
        n_non_edges = n_nodes - 1 - np.array(n_neighbors)

        non_edges = [e for e in nx.non_edges(self.G)]
        if VERBOSE:
            print("\tFinding %d of %d non-edges" % (nneg, len(non_edges)))

        # Select m pairs of non-edges (negative samples)
        rnd_inx = self._rnd.choice(len(non_edges), nneg, replace=False)
        neg_edge_list = [non_edges[ii] for ii in rnd_inx]

        if len(neg_edge_list) < nneg:
            raise RuntimeWarning("\tOnly %d negative edges found" %
                                 (len(neg_edge_list)))

        if VERBOSE:
            print("\tFinding %d positive edges of %d total edges" %
                  (npos, n_edges))

        # Find positive edges, and remove them.
        edges = self.G.edges()

        edges = list(edges)

        pos_edge_list = []
        n_count = 0
        n_ignored_count = 0

        rnd_inx = self._rnd.permutation(n_edges)

        for eii in rnd_inx.tolist():
            edge = edges[eii]

            # Remove edge from graph
            data = self.G[edge[0]][edge[1]]
            self.G.remove_edge(*edge)

            # nx.connected._plain_bfs is a private networkx helper that yields
            # all nodes reachable from edge[0]
            reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0])
            if edge[1] not in reachable_from_v1:
                self.G.add_edge(*edge, **data)
                n_ignored_count += 1
            else:
                pos_edge_list.append(edge)
                if VERBOSE:
                    sys.stdout.write("\r" +
                                     "\tFound: {} edges".format(n_count + 1))
                n_count += 1

            if n_count >= npos:
                break

        if VERBOSE:
            sys.stdout.write("\n")

        edges_num = len(pos_edge_list)
        self._pos_edge_list = pos_edge_list
        self._neg_edge_list = neg_edge_list

        # print('pos_edge_list', len(self._pos_edge_list))
        # print('neg_edge_list', len(self._neg_edge_list))
        if VERBOSE:
            print("\tEdge list lengths: Pos: {} Neg: {}".format(
                len(self._pos_edge_list), len(self._neg_edge_list)))
Example #45
0
def generate_pos_neg_links(nx_graph, merge_network, test_para):
    '''Generate positive and negative sample edges.'''
    Multi_Networks = copy.deepcopy(nx_graph)
    # train_g = copy.deepcopy(merge_network)
    selected_layer = random.randint(0, len(Multi_Networks) - 1)  # randint is inclusive at both ends
    train_g = copy.deepcopy(Multi_Networks[selected_layer])
    # drop the selected layer from the list; list.remove(train_g) would raise
    # ValueError because graphs compare by identity and train_g is a deepcopy
    del Multi_Networks[selected_layer]
    # Edges that exist in the network
    exit_edges = list(train_g.edges())
    num_exit = len(exit_edges)

    # Edges that do not exist in the network
    noexit_edges = list(nx.non_edges(train_g))
    num_noexit = len(noexit_edges)

    # Shuffle both lists
    random.shuffle(exit_edges)
    random.shuffle(noexit_edges)

    # Positive-edge sampling
    pos_edge_list = []
    n_count = 0
    edges = exit_edges
    rnd = np.random.RandomState(seed=None)
    rnd_inx = rnd.permutation(edges)  # permuting a list of pairs shuffles the pairs themselves, not indices
    for eii in rnd_inx:
        edge = eii
        # 删除该边
        data = train_g[edge[0]][edge[1]]
        train_g.remove_edge(*edge)

        # Check whether the whole network stays connected after removing this edge
        if nx.is_connected(train_g):
            flag = True
            for g in Multi_Networks:
                if edge in g.edges():
                    gt = copy.deepcopy(g)
                    gt.remove_edge(*edge)
                    if not nx.is_connected(gt):
                        del gt
                        flag = False
                        break
            if flag:
                for g in Multi_Networks:
                    if edge in g.edges():
                        g.remove_edge(*edge)
                pos_edge_list.append(tuple(edge))
                n_count += 1
            else:
                train_g.add_edge(*edge, **data)
        else:
            train_g.add_edge(*edge, **data)

    # Positive sample edges
    if not len(pos_edge_list):  # if no edge could be safely removed, fall back to a random slice of the edges
        pos_edge_list = exit_edges[:int(len(exit_edges) * test_para)]
        [
            g.remove_edge(*e) for g in Multi_Networks for e in pos_edge_list
            if e in g.edges()
        ]
        [train_g.remove_edge(*e) for e in pos_edge_list]
        nneg = npos = len(pos_edge_list)
    else:
        # Determine the number of test edges
        if len(pos_edge_list) < num_noexit:
            npos = int(test_para * len(pos_edge_list))  # number of positive samples
        else:
            npos = int(test_para * num_noexit)
        nneg = npos  # number of negative samples
        pos_edge_list = pos_edge_list[:nneg]

    # Negative sample edges
    neg_edge_list = noexit_edges[:nneg]
    # Test edge dataset and labels
    test_edges, labels = get_selected_edges(pos_edge_list, neg_edge_list)
    return Multi_Networks, train_g, pos_edge_list, neg_edge_list, test_edges, labels
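# get_selected_edges is defined elsewhere; a plausible minimal sketch of what
# it does, pairing the sampled edges with binary labels (hypothetical helper):
def get_selected_edges(pos_edge_list, neg_edge_list):
    edges = list(pos_edge_list) + list(neg_edge_list)
    labels = [1] * len(pos_edge_list) + [0] * len(neg_edge_list)
    return edges, labels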
Example #46
0
def MI(graph_file):
    G = nx.read_edgelist(graph_file)
    node_num = nx.number_of_nodes(G)
    edge_num = nx.number_of_edges(G)
    print(node_num)
    print(edge_num)
    sim_dict = {}  # dictionary storing the similarity scores

    I_pConnect_dict = {}
    pDisConnect = 1
    edges = nx.edges(G)
    ebunch = nx.non_edges(G)
    nodes = nx.nodes(G)
    nodes_Degree_dict = {}
    for v in nodes:
        nodes_Degree_dict[v] = nx.degree(G, v)

    # Node degrees are needed repeatedly, so cache them in advance
    # degree_list = [nx.degree(G, v) for v in range(G.number_of_nodes())]

    # The two loops below compute $P(L^1_{xy})$; in fact only the distinct degree pairs $P(L^1_{k_x k_y})$ need to be computed
# =============================================================================
#     degree_I_pConnect = {}
#     for u, v in edges:
# =============================================================================

    for u, v in edges:
        uDegree = nodes_Degree_dict[u]
        vDegree = nodes_Degree_dict[v]
        for i in range(1, vDegree + 1):
            pDisConnect = pDisConnect * (((edge_num - uDegree) - i + 1) /
                                         (edge_num - i + 1))
        pConnect = 1 - pDisConnect
        if pConnect == 0:
            I_pConnect = -math.log2(0.0001)
        else:
            I_pConnect = -math.log2(pConnect)
        I_pConnect_dict[(u, v)] = I_pConnect
        I_pConnect_dict[(v, u)] = I_pConnect
        pDisConnect = 1

    for m, n in ebunch:
        # =============================================================================
        #         mDegree = nx.degree(G, m)
        #         nDegree = nx.degree(G, n)
        # =============================================================================
        mDegree = nodes_Degree_dict[m]
        nDegree = nodes_Degree_dict[n]
        for i in range(1, nDegree + 1):
            pDisConnect = pDisConnect * (((edge_num - mDegree) - i + 1) /
                                         (edge_num - i + 1))
        pConnect = 1 - pDisConnect
        if pConnect == 0:
            I_pConnect = -math.log2(0.0001)
        else:
            I_pConnect = -math.log2(pConnect)
        I_pConnect_dict[(m, n)] = I_pConnect
        I_pConnect_dict[(n, m)] = I_pConnect
        pDisConnect = 1

    ebunchs = nx.non_edges(G)
    i = 0

    # $I(L^1_{xy};z) = I(L^1;z)$ is independent of x and y, so it can be precomputed
    for u, v in ebunchs:

        pMutual_Information = 0
        I_pConnect = I_pConnect_dict[(u, v)]
        for z in nx.common_neighbors(G, u, v):
            neighbor_list = list(nx.neighbors(G, z))  # materialize: nx.neighbors returns an iterator
            neighbor_num = len(neighbor_list)
            for m in range(len(neighbor_list)):
                for n in range(m + 1, len(neighbor_list)):
                    if m != n:
                        I_ppConnect = I_pConnect_dict[(neighbor_list[m],
                                                       neighbor_list[n])]
                        if nx.clustering(G, z) == 0:
                            pMutual_Information = pMutual_Information + (
                                2 /
                                (neighbor_num *
                                 (neighbor_num - 1))) * ((I_ppConnect) -
                                                         (-math.log2(0.0001)))
                        else:
                            pMutual_Information = pMutual_Information + (
                                2 / (neighbor_num * (neighbor_num - 1))) * (
                                    (I_ppConnect) -
                                    (-math.log2(nx.clustering(G, z))))
        sim_dict[(u, v)] = -(I_pConnect - pMutual_Information)
        i = i + 1
        #print(i)
        print(str(u) + "," + str(v))
        print(sim_dict[(u, v)])
    return sim_dict
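# As the comment above notes, P(L^1) depends only on the degree pair, so the
# inner loops can be memoized per (degree, degree) key. A sketch of that
# optimization, reusing the formula and the 0.0001 floor from the code above
# (the function and cache names are hypothetical, not part of the original):
import math

_p_cache = {}

def connect_information(edge_num, u_degree, v_degree):
    key = (u_degree, v_degree)
    if key not in _p_cache:
        p_dis = 1.0
        for i in range(1, v_degree + 1):
            p_dis *= (edge_num - u_degree - i + 1) / (edge_num - i + 1)
        p_conn = 1 - p_dis
        _p_cache[key] = -math.log2(p_conn) if p_conn > 0 else -math.log2(0.0001)
    return _p_cache[key]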
Example #47
0
sampleTest = sample(list(g_training.edges()), sizeTestSet)  # sample is presumably random.sample
#Graph for the test sample
g_test=nx.Graph()  
g_test.add_edges_from(sampleTest)

#remove from g_training the edges in sampleTest
g_training.remove_edges_from(sampleTest)
print(g_training.edges())

#Finally, convert the remaining edges as a series.
samplePositiveTraining=pd.Series(data=g_training.edges())

#3/ To balance the training set, we will randomly pick pairs of unconnected vertices (negative class). 
#The number of pairs should be equal to the number of considered connections (positive class) in the training set. Find a way to generate this negative training set and name it sampleNegativeTraining.
import numpy as np
non_edges = list(nx.non_edges(g_training))

sample_num = len(g_training.edges())
negative_sample = sample(non_edges, sample_num)  # avoid rebinding the name `sample`
sampleNegativeTraining = pd.Series(data=negative_sample)

#add new edges in the training graph based on the negative sample
g_training.add_edges_from(sampleNegativeTraining)



#5/ Use the following code (and modify it if necessary) to create 2 empty data frames (one for the training set and the other for the test set).
import numpy as np
sampleTraining = pd.concat([samplePositiveTraining, sampleNegativeTraining], ignore_index=True)
dfTraining_1 = pd.DataFrame((list(sampleTraining)), columns=["target","source"])
dfTraining_2 = pd.DataFrame(np.zeros((sizeTrainingSet, 11)), columns=features)
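# The `features` column names are defined elsewhere; a hedged sketch of
# filling one such feature column for the training pairs (the column name
# "jaccard" is illustrative, not from the source):
jc_scores = list(nx.jaccard_coefficient(g_training, [tuple(p) for p in sampleTraining]))
dfTraining_2["jaccard"] = [s for _, _, s in jc_scores]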
Example #48
0
def read_csv_files():
    """
    Read data and create MultiDiGraph.Each Node has an id and All edges have 2 attributes.
    The first is Timestamp and the second is the type of edge (Attacks, Trades, Messages)
    :return: G, all_dfs, labels
    """
    file_names = glob.glob("../data_users_moves/*.csv")

    all_dfs = pd.DataFrame(columns=['Timestamp', 'id1', 'id2', 'label'])

    for file in file_names:
        print(str(file))
        df = pd.read_csv(file, header=None)
        df.columns = ['Timestamp', 'id1', 'id2']
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
        # df['date'] = [d.date() for d in df['Timestamp']]
        # df['time'] = [d.time() for d in df['Timestamp']]
        if 'attack' in file:
            rel_type = 'attacks'
        elif 'trade' in file:
            rel_type = 'trades'
        else:
            rel_type = 'messages'

        df['type'] = rel_type
        df['weight'] = 1
        df['label'] = 1
        all_dfs = pd.concat([all_dfs, df])

    graph = nx.from_pandas_edgelist(df=all_dfs, source='id1', target='id2', edge_attr=True,
                                    create_using=nx.MultiDiGraph(name='Travian_Graph'))
    g_undirected = nx.from_pandas_edgelist(df=all_dfs, source='id1', target='id2', edge_attr=True,
                                           create_using=nx.Graph(name='Travian_Graph'))
    # Create negative samples ---!
    source = all_dfs['id1'].tolist()
    destination = all_dfs['id2'].tolist()
    # combine all nodes in a list
    node_list = source + destination
    # remove duplicate items from the list
    node_list = list(dict.fromkeys(node_list))
    adj_G = nx.to_numpy_matrix(graph, nodelist=node_list)
    # get unconnected node-pairs
    all_unconnected_pairs = []
    #   print(nx.non_edges(G))
    ommisible_links_data = pd.DataFrame(nx.non_edges(graph)).sample(frac=1).reset_index(drop=True)
    dates = pd.date_range('2009-12-01 00:00:00', '2009-12-31 23:59:59', periods=200000)
    gen_df = ommisible_links_data.iloc[:200000, :]
    gen_df.columns = ['id1', 'id2']
    gen_df[['id1', 'id2']] = gen_df[['id1', 'id2']].applymap(np.int64)
    gen_df['Timestamp'] = dates
    gen_df['label'] = 0
    gen_df['weight'] = 1
    gen_df['type'] = random.choices(['attacks', 'messages', 'trades'], weights=(50, 25, 25), k=200000)
    gen_df['Preferential_Attachment'] = 0
    gen_df['Resource_allocation'] = 0

    # Merge dataset with links that don't exist (step not implemented in this snippet)

    # print(gen_df)

    labels = {e: graph.edges[e]['type'] for e in graph.edges}
    return graph, all_dfs, labels, g_undirected, gen_df
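# The "merge" step flagged inside read_csv_files is not implemented in the
# snippet; presumably it concatenates the real and the generated links, e.g.
# using the returned frames (a sketch):
graph, all_dfs, labels, g_undirected, gen_df = read_csv_files()
all_data = pd.concat([all_dfs, gen_df], ignore_index=True, sort=False)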
Example #49
0
            for link_id in selected_links_id:
                selected_links.append(links[link_id])
            network_train.remove_edges_from(selected_links)
            network_test.add_edges_from(selected_links)

            #####print("network_train.number_of_edges(), network_test.number_of_edges():",network_train.number_of_edges(), network_test.number_of_edges())

            #####print("# ## Sampling negative links")
            k = 2
            n_links_train_pos = network_train.number_of_edges()
            n_links_test_pos = network_test.number_of_edges()
            n_links_train_neg = k * n_links_train_pos
            n_links_test_neg = k * n_links_test_pos

            neg_network = nx.empty_graph(network.number_of_nodes())
            links_neg = list(nx.non_edges(network))
            neg_network.add_edges_from(links_neg)

            n_links_neg = neg_network.number_of_edges()
            ######print("n_links_neg:",n_links_neg)

            selected_links_neg_id = np.random.choice(np.arange(n_links_neg),
                                                     size=n_links_train_neg +
                                                     n_links_test_neg,
                                                     replace=False)

            neg_network_train = nx.empty_graph(network.number_of_nodes())
            neg_network_test = nx.empty_graph(network.number_of_nodes())

            selected_links = []
            for i in range(n_links_train_neg):
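                # The snippet is truncated here; a plausible completion of the
                # negative-link train/test split, mirroring the positive-link
                # handling above (a guess, not from the source):
                selected_links.append(links_neg[selected_links_neg_id[i]])
            neg_network_train.add_edges_from(selected_links)
            neg_network_test.add_edges_from(
                links_neg[selected_links_neg_id[i]]
                for i in range(n_links_train_neg,
                               n_links_train_neg + n_links_test_neg))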
Example #50
0
nx.draw_networkx(g)
d = g.degree()
h = pd.DataFrame(d)[1].hist()
nx.average_clustering(g)
nx.average_shortest_path_length(g)
# Connected Small World Network (rerun Watts-Strogatz up to t times until it returns a connected network)
g = nx.connected_watts_strogatz_graph(100, 6, 0.04, 50)
nx.draw_networkx(g)
# Newman Watts (adding new edges instead of rewiring)
g = nx.newman_watts_strogatz_graph(100, 6, 0.04)
nx.draw_networkx(g)

# Link Prediction
# Common Neighbors
cn = [(x[0], x[1], len(list(nx.common_neighbors(g, x[0], x[1]))))
      for x in nx.non_edges(g)]
# Jaccard Coefficient (number of common neighbors / size of the union of the neighborhoods)
jc = list(nx.jaccard_coefficient(g))
# Resource Allocation (sum over the common neighbors of 1/degree of each common neighbor)
ra = list(nx.resource_allocation_index(g))
# Adamic-Adar Index (Resource Allocation with the log of the degrees)
aa = list(nx.adamic_adar_index(g))
# Preferential Attachment (product of the nodes' degrees)
pa = list(nx.preferential_attachment(g))
# Community Common Neighbors (with a bonus for neighbors in the same community)
g.nodes[0]['community'] = 0
g.nodes[1]['community'] = 1
g.nodes[2]['community'] = 0
g.nodes[3]['community'] = 1
g.nodes[4]['community'] = 1
g.nodes[5]['community'] = 0
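# The snippet stops before invoking the community-aware measure; note that
# nx.cn_soundarajan_hopcroft requires *every* node to carry a 'community'
# attribute, so this self-contained sketch uses a small fully-labelled graph:
g2 = nx.path_graph(6)
for v in g2.nodes:
    g2.nodes[v]['community'] = v % 2
cn_sh = list(nx.cn_soundarajan_hopcroft(g2))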
Example #51
0
# -*- coding: utf-8 -*-

import networkx as nx
import matplotlib.pyplot as plt

G = nx.frucht_graph()

G2 = nx.Graph()
for e in nx.non_edges(G):
    G2.add_edge(*e)

plt.figure(figsize=(8, 8))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos=pos)
# nx.draw_networkx_labels(G, pos, {0: "1", 1: "2", 2: "3"})
nx.draw_networkx_edges(G2, pos=pos, edge_color="red")
nx.draw_networkx_edges(G, pos=pos, edge_color="blue")
plt.tight_layout()
plt.axis("off")
plt.savefig("schema.png")

G = nx.Graph()
G.add_edge(1, 2)
G.add_edge(1, 3)

plt.figure(figsize=(3, 3))
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos=pos)
nx.draw_networkx_labels(G, pos)
nx.draw_networkx_edges(G, pos=pos, edge_color="blue")
plt.tight_layout()
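# Aside: building G2 edge-by-edge from nx.non_edges, as done above, is
# equivalent to taking the graph complement directly:
H = nx.complement(nx.frucht_graph())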
Example #52
0
# G=nx.Graph(sub_edges)
G.remove_edges_from(nx.selfloop_edges(G))  # Graph.selfloop_edges() was removed in newer networkx


# the full fold of YES (existing) edges
total_edges=list(G.edges())
np.random.shuffle(total_edges)
l=int(len(total_edges)*0.7) # keep 70% graph, 30% for growth labels
# edges_0 exist only for common neighbors
edges_0,ETEs = total_edges[:l], total_edges[l:]

"""
 Use all the edges present in the network as "YES", and randomly choose equal number of "Non-existing" edges as "NO". 
"""
# Randomly choose an equally sized fold of NO edges
nonETEs=random.sample(list(nx.non_edges(G)),len(ETEs))
total_edges=ETEs+nonETEs #total in consideration

xe,nxe=len(ETEs),len(nonETEs)
methods=['CN','JC','AA','RA','PA']

## 
## NOTE: HAVEN'T CHECKED THAT CONNECTIVITY IS MAINTAINED.
# len(nx.bfs_tree(G,nodelist[0]).edges())
# # extract matrix in order, and convert to dense representation
# A = nx.adjacency_matrix(G, nodelist=nodelist).todense()
N=G.number_of_nodes()
# store index for node index
nodelist = list(G.nodes())

Example #53
0
    def split_into_train_test_sets(self, ratio, max_trial_limit=10000):

        test_set_size = int(ratio * self.number_of_edges)
        train_set_size = self.number_of_edges - test_set_size

        # Generate the positive test edges
        test_pos_samples = []
        residual_g = self.g.copy()
        num_of_ccs = nx.number_connected_components(residual_g)
        if num_of_ccs != 1:
            raise ValueError(
                "The graph contains more than one connected component!")

        num_of_pos_samples = 0

        edges = list(residual_g.edges())
        perm = np.arange(len(edges))
        np.random.shuffle(perm)
        edges = [edges[inx] for inx in perm]
        for i in range(len(edges)):

            # Remove the chosen edge
            chosen_edge = edges[i]
            residual_g.remove_edge(chosen_edge[0], chosen_edge[1])

            if chosen_edge[1] in nx.connected._plain_bfs(
                    residual_g, chosen_edge[0]):
                num_of_pos_samples += 1
                test_pos_samples.append(chosen_edge)
                print("\r{0} tp edges found out of {1}".format(
                    num_of_pos_samples, test_set_size)),
            else:
                residual_g.add_edge(chosen_edge[0], chosen_edge[1])

            if num_of_pos_samples == test_set_size:
                break

        if num_of_pos_samples != test_set_size:
            raise ValueError("Not pos edges found!")

        # Generate the negative samples
        test_neg_samples = []

        non_edges = list(nx.non_edges(self.g))
        perm = np.arange(len(non_edges))
        np.random.shuffle(perm)
        non_edges = [non_edges[inx] for inx in perm]

        # perm is already a shuffled index array, so indexing perm again below
        # is redundant, though the result is still a uniform sample without
        # replacement because perm is a bijection
        chosen_non_edge_inx = np.random.choice(perm,
                                               size=test_set_size,
                                               replace=False)

        test_neg_samples = [non_edges[perm[p]] for p in chosen_non_edge_inx]
        """
        while num_of_removed_edges < test_set_size:
            # Randomly choose an edge index
            pos_inx = np.arange(residual_g.number_of_edges())
            np.random.shuffle(pos_inx)
            edge_inx = np.random.choice(a=pos_inx)
            # Remove the chosen edge
            chosen_edge = list(residual_g.edges())[edge_inx]
            residual_g.remove_edge(chosen_edge[0], chosen_edge[1])

            #reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0])
            if chosen_edge[1] in nx.connected._plain_bfs(residual_g, chosen_edge[0]):
                num_of_removed_edges += 1
                test_pos_samples.append(chosen_edge)
                trial_counter = 0
            else:
                residual_g.add_edge(chosen_edge[0], chosen_edge[1])
                trial_counter += 1

            if trial_counter == max_trial_limit:
                raise ValueError("In {} trial, any possible edge for removing could not be found!")

            print("\r{0} tp edges found out of {1}".format(num_of_removed_edges, test_set_size)),
        
        # Generate the negative samples
        test_neg_samples = []

        num_of_neg_samples = 0
        while num_of_neg_samples < test_set_size:

            pos_inx = np.arange(self.g.number_of_nodes())
            np.random.shuffle(pos_inx)
            # Self-loops are allowed
            u, v = np.random.choice(a=pos_inx, size=2)

            candiate_edge = (unicode(u), unicode(v))
            if not self.g.has_edge(candiate_edge[0], candiate_edge[1]) and candiate_edge not in self.g.edges():
                test_neg_samples.append(candiate_edge)
                num_of_neg_samples += 1

            print("\r{0} fn edges found out of {1}".format(num_of_neg_samples, test_set_size)),
        """

        return residual_g, test_pos_samples, test_neg_samples
Example #54
0
def MI(graph_file):
    G = nx.read_edgelist(graph_file)
    node_num = nx.number_of_nodes(G)
    edge_num = nx.number_of_edges(G)
    print(node_num)
    print(edge_num)
    sim_dict = {}  # dictionary storing the similarity scores

    I_pConnect_dict = {}
    pDisConnect = 1
    edges = nx.edges(G)
    ebunch = nx.non_edges(G)

    for u, v in edges:
        uDegree = nx.degree(G, u)
        vDegree = nx.degree(G, v)
        for i in range(1, vDegree + 1):
            pDisConnect = pDisConnect * (((edge_num - uDegree) - i + 1) /
                                         (edge_num - i + 1))
        pConnect = 1 - pDisConnect
        if pConnect == 0:
            I_pConnect = -math.log2(0.0001)
        else:
            I_pConnect = -math.log2(pConnect)
        I_pConnect_dict[(u, v)] = I_pConnect
        I_pConnect_dict[(v, u)] = I_pConnect
        pDisConnect = 1

    for m, n in ebunch:
        mDegree = nx.degree(G, m)
        nDegree = nx.degree(G, n)
        for i in range(1, nDegree + 1):
            pDisConnect = pDisConnect * (((edge_num - mDegree) - i + 1) /
                                         (edge_num - i + 1))
        pConnect = 1 - pDisConnect
        if pConnect == 0:
            I_pConnect = -math.log2(0.0001)
        else:
            I_pConnect = -math.log2(pConnect)
        I_pConnect_dict[(m, n)] = I_pConnect
        I_pConnect_dict[(n, m)] = I_pConnect
        pDisConnect = 1

    ebunchs = nx.non_edges(G)
    i = 0
    for u, v in ebunchs:

        pMutual_Information = 0
        I_pConnect = I_pConnect_dict[(u, v)]
        for z in nx.common_neighbors(G, u, v):
            neighbor_list = list(nx.neighbors(G, z))  # materialize: nx.neighbors returns an iterator
            neighbor_num = len(neighbor_list)
            for m in range(len(neighbor_list)):
                for n in range(m + 1, len(neighbor_list)):
                    if m != n:
                        I_ppConnect = I_pConnect_dict[(neighbor_list[m],
                                                       neighbor_list[n])]
                        if nx.clustering(G, z) == 0:
                            pMutual_Information = pMutual_Information + (
                                2 /
                                (neighbor_num *
                                 (neighbor_num - 1))) * ((I_ppConnect) -
                                                         (-math.log2(0.0001)))
                        else:
                            pMutual_Information = pMutual_Information + (
                                2 / (neighbor_num * (neighbor_num - 1))) * (
                                    (I_ppConnect) -
                                    (-math.log2(nx.clustering(G, z))))
        sim_dict[(u, v)] = -(I_pConnect - pMutual_Information)
        i = i + 1
        #print(i)
        print(str(u) + "," + str(v))
        print(sim_dict[(u, v)])
    return sim_dict
Example #55
0
def new_connections_predictions():

    import operator
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, recall_score, auc, roc_curve, precision_score
    from sklearn.ensemble import GradientBoostingClassifier

    df = future_connections

    common_neigh = [(e[0], e[1], len(list(nx.common_neighbors(G, e[0], e[1]))))
                    for e in nx.non_edges(G)]
    common_neigh = sorted(common_neigh, key=operator.itemgetter(0))
    jaccard_coef = list(nx.jaccard_coefficient(G))
    jaccard_coef = sorted(jaccard_coef, key=operator.itemgetter(0))
    resource_alloc = list(nx.resource_allocation_index(G))
    resource_alloc = sorted(resource_alloc, key=operator.itemgetter(0))
    pref_attach = list(nx.preferential_attachment(G))
    pref_attach = sorted(pref_attach, key=operator.itemgetter(0))

    df["edge"] = df.index
    df = df.sort_values(by="edge")
    df = df.drop(
        ["edge"], axis=1
    )  # do not understand why these columns were showing up without being assigned

    df["common neighbors"] = list(common_neigh)
    df["common neighbors"] = df["common neighbors"].apply(lambda x: x[2])
    df["jaccard"] = [x[2] for x in jaccard_coef]
    df["resource allocation"] = [x[2] for x in resource_alloc]
    df["preferential attachment"] = [x[2] for x in pref_attach]

    #Separate the data with future connection reported from the rows where no data is reported
    conn_data = df.dropna()
    no_conn_data = df[df["Future Connection"].isnull()]

    x = conn_data.drop(["Future Connection"], axis=1)
    y = conn_data["Future Connection"]
    test_df = no_conn_data.drop(["Future Connection"], axis=1)

    #print (df)

    #Training the gradient boosting model
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=0.9,
                                                        random_state=0)
    gbm = GradientBoostingClassifier(random_state=0,
                                     learning_rate=0.1,
                                     n_estimators=45,
                                     max_depth=5).fit(X_train, y_train)
    y_score_eval = gbm.decision_function(X_test)
    y_proba_eval = gbm.predict_proba(X_test)
    y_score = gbm.decision_function(test_df)
    y_proba = gbm.predict_proba(test_df)

    fpr, tpr, _ = roc_curve(y_test, y_score_eval)
    roc_auc = auc(fpr, tpr)

    prob_edge = pd.Series(y_proba[:, 1])
    prob_edge.index = test_df.index

    return prob_edge
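# Assumed call pattern (G and future_connections are module-level globals in
# the original assignment; the names come from the snippet, not verified):
predictions = new_connections_predictions()
print(predictions.head())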
Example #56
0
def _apply_prediction(G, func, ebunch=None):
    if ebunch is None:
        ebunch = nx.non_edges(G)
    return ((u, v, func(u, v)) for u, v in ebunch)
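# _apply_prediction is the scaffolding behind networkx's link predictors: each
# public measure just supplies a per-pair scoring callback. A sketch of a
# Jaccard-style predictor built on it (the function name is illustrative):
def jaccard_coefficient_sketch(G, ebunch=None):
    def predict(u, v):
        union_size = len(set(G[u]) | set(G[v]))
        if union_size == 0:
            return 0
        return len(list(nx.common_neighbors(G, u, v))) / union_size
    return _apply_prediction(G, predict, ebunch)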
Example #57
0
def WMI(G):
    #G = nx.read_edgelist(graph_file)

    edges = nx.edges(G)
    nodes = nx.nodes(G)
    beta = -math.log2(0.0001)
    sim_dict = {}

    # Sum of the weights of all edges in the graph
    all_weight = 0
    for u, v in edges:
        all_weight = all_weight + G.get_edge_data(u, v)['weight']
    print(all_weight)

    # Compute the mutual information for links between nodes of different 'node weights'
    nodes_Weight_dict = {}
    weight_list = []

    # Compute each node's 'node weight' (the sum of its incident edge weights)
    for v in nodes:
        node_weight = 0
        v_neighbors = nx.neighbors(G, v)
        for u in v_neighbors:
            node_weight += G.get_edge_data(u, v)['weight']
        weight_list.append(node_weight)
        nodes_Weight_dict[v] = node_weight
    #print(weight_list)
    #print(nodes_Weight_dict)

    distinct_weight_list = list(set(weight_list))
    #print(distinct_weight_list)
    size = len(distinct_weight_list)
    #print(size)

    self_Connect_dict = {}
    # Mutual information of a link for each distinct pair of node weights
    for x in range(size):
        w_x = distinct_weight_list[x]
        for y in range(x, size):
            w_y = distinct_weight_list[y]
            p0 = 1
            (w_n, w_m) = pair(w_x, w_y)
            a = all_weight + 1
            b = all_weight - w_m + 1
            for i in range(1, int(w_n + 1)):
                p0 *= (b - i) / (a - i)
            if p0 == 1:
                self_Connect_dict[(w_n, w_m)] = beta
                #self_Connect_dict[(w_m, w_n)] = beta
            else:
                self_Connect_dict[(w_n, w_m)] = -math.log2(1 - p0)
                #self_Connect_dict[(w_m, w_n)] = -math.log2(1 - p0)
            #print (str(w_n) + "," + str(w_m))
            #print (self_Connect_dict[(w_n, w_m)])
    #print(self_Connect_dict)
    self_Conditional_dict = {}
    for z in nodes:
        w_z = nodes_Weight_dict[z]
        if w_z > 1:
            alpha = 2 / (w_z * (w_z - 1))
            cc_z = wc2.weight_clustering2(G, z)  # changed to the weighted clustering coefficient
            if cc_z == 0:
                log_c = beta
            else:
                log_c = -math.log2(cc_z)
            # end if
            s = 0
            neighbor_list = list(nx.neighbors(G, z))  # materialize: nx.neighbors returns an iterator
            size = len(neighbor_list)
            for i in range(size):
                m = neighbor_list[i]
                for j in range(i + 1, size):
                    n = neighbor_list[j]
                    (k_x, k_y) = pair(nodes_Weight_dict[m],
                                      nodes_Weight_dict[n])
                    if i != j:
                        s += (self_Connect_dict[(k_x, k_y)] - log_c)
            self_Conditional_dict[z] = alpha * s
    #print(self_Conditional_dict)
    sim_dict = {}  # dictionary storing the similarity scores
    ebunch = nx.non_edges(G)

    for x, y in ebunch:
        s = 0
        (k_x, k_y) = pair(nodes_Weight_dict[x], nodes_Weight_dict[y])
        for z in nx.common_neighbors(G, x, y):
            s += self_Conditional_dict[z]
        sim_dict[(x, y)] = s - self_Connect_dict[(k_x, k_y)]
        # end if
    # end for
    print(sim_dict)
    return sim_dict
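# `pair` and `wc2.weight_clustering2` are external helpers not shown here.
# Judging from its use as a dictionary key, `pair` canonicalizes the order of
# a weight pair; a plausible reconstruction:
def pair(a, b):
    # fixed ordering so the (w_n, w_m) keys stay consistent for both argument orders
    return (a, b) if a >= b else (b, a)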
Example #58
0
    def generate_pos_neg_links(self):
        """
        Select random existing edges in the graph to be postive links,
        and random non-edges to be negative links.

        Modify graph by removing the postive links.
        """
        # Select n edges at random (positive samples)
        n_edges = self.G.number_of_edges()
        n_nodes = self.G.number_of_nodes()
        npos = int(self.prop_pos * n_edges)
        nneg = int(self.prop_neg * n_edges)

        if not nx.is_connected(self.G):
            raise RuntimeError("Input graph is not connected")

        n_neighbors = [len(list(self.G.neighbors(v))) for v in list(self.G.nodes())]
        n_non_edges = n_nodes - 1 - np.array(n_neighbors)

        non_edges = [e for e in nx.non_edges(self.G)]
        print("Finding %d of %d non-edges" % (nneg, len(non_edges)))
        
        # Select m pairs of non-edges (negative samples)
        rnd_inx = self._rnd.choice(len(non_edges), nneg, replace=False)
        neg_edge_list = [non_edges[ii] for ii in rnd_inx]

        if len(neg_edge_list) < nneg:
            raise RuntimeWarning(
                "Only %d negative edges found" % (len(neg_edge_list))
            )

        print("Finding %d positive edges of %d total edges" % (npos, n_edges))

        # Find positive edges, and remove them.
        edges = list(self.G.edges())
        pos_edge_list = []
        n_count = 0
        n_ignored_count = 0
        rnd_inx = self._rnd.permutation(n_edges)
        for eii in rnd_inx:
            edge = edges[eii]

            # Remove edge from graph
            data = self.G[edge[0]][edge[1]]
            self.G.remove_edge(*edge)

            # Check if graph is still connected
            #TODO: We shouldn't be using a private function for bfs
            reachable_from_v1 = nx.connected._plain_bfs(self.G, edge[0])
            if edge[1] not in reachable_from_v1:
                self.G.add_edge(*edge, **data)
                n_ignored_count += 1
            else:
                pos_edge_list.append(edge)
                print("Found: %d    " % (n_count), end="\r")
                n_count += 1

            # Exit if we've found npos nodes or we have gone through the whole list
            if n_count >= npos:
                break

        if len(pos_edge_list) < npos:
            raise RuntimeWarning("Only %d positive edges found." % (n_count))

        self._pos_edge_list = pos_edge_list
        self._neg_edge_list = neg_edge_list
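# Addressing the TODO above: the private nx.connected._plain_bfs reachability
# check can be replaced by the public API. A self-contained sketch of the same
# "remove an edge, keep it only if the graph stays connected" test:
import networkx as nx

g = nx.karate_club_graph()
u, v = next(iter(g.edges()))
g.remove_edge(u, v)
if nx.has_path(g, u, v):        # still connected without the edge
    print("edge is usable as a positive sample")
else:
    g.add_edge(u, v)            # removal disconnected the graph; undo it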
Example #59
0
    def to_dataframe(self, pairs=False, sampling=None, label_graph=None, cheat=False, allow_hashtags=False, min_katz=0, verbose=True, katz=None):
        """
        Get a dataframe for pairs of nodes in the graph

        :param pairs: True to consider all pairs, False to consider only non-edges, or a list of tuples to use as pairs
        :param sampling: Amount to sample (default=None, do not sample)
        :param label_graph: Graph to use to generate the true labels.  Usually the next in the time series.
        :param cheat: Do not sample when label_graph has an edge for a given pair (default=False)
        :param allow_hashtags: Also predict links between users and hashtags (default=False)
        :param min_katz: Use a katz threshold to reduce numbers of pairs
        :param verbose: Display updates (default=True)
        :param katz: Precomputed katz centrality dictionary (default=None, compute katz before generating dataframe)
        :return: A pandas dataframe containing pairs and the various calculated metrics
        """
        if not sampling:
            sampling = 2  # any value >= 1 keeps every pair in the random filter below

        u = []
        v = []
        has_links = []
        jac_co = []
        adam = []
        att = []
        nbrs = []
        spl = []
        katz_centralities = []
        count = 0
        labels = []
        katzes = []
        embeddings = []
        if self.embeddings:
            for _ in self.emb_cols:
                embeddings.append([])
        # degree = nx.degree(graph)

        if type(pairs) is bool and pairs:
            iter_set = self.all_pairs()
        elif type(pairs) is bool and not pairs:
            iter_set = nx.non_edges(self.nx_graph)
        else:
            iter_set = pairs

        if verbose and not katz:
            print("Precomputing katzes....")

        if not katz:
            katz = nx.katz_centrality(self.nx_graph, alpha=.005, beta=.1, tol=.00000001, max_iter=5000)

        elim = 0
        for n1, n2 in iter_set:
            if random.random() < sampling or (cheat and label_graph and label_graph.nx_graph.has_edge(n1, n2)):
                count += 1
                if verbose:
                    if count % 10000 == 0:
                        print("%d checked... " % count)
                # k_s = np.mean((katz[n1], katz[n2]))
                #if k_s < min_katz:
                #    elim += 1
                #    continue
                u.append(n1)
                v.append(n2)
                # (jaccard, adamic, n_nbrs, attachment) = self.get_unsupported(n1, n2)
                # jac_co.append(jaccard)
                # adam.append(adamic)
                # nbrs.append(n_nbrs)
                # att.append(attachment)
                # spl.append(self.get_sp(n1, n2))
                # katz_centralities.append(np.mean((katz[n1], katz[n2])))
                labels.append(label_graph.nx_graph.has_edge(n1, n2))  # assumes a label_graph was supplied
                #if self.katz:
                #    katzes.append(self.katz[n1][n2])
                if self.embeddings:
                    for i in range(0, len(self.emb_cols)):
                       embeddings[i].append(np.mean((self.embeddings[n1][i], self.embeddings[n2][i])))
                    # embeddings[i].append((self.embeddings[n1][i] * self.embeddings[n2][i]))


        df = pd.DataFrame()
        df['u'] = u
        df['v'] = v
        # df['jac'] = jac_co
        # df['adam'] = adam
        # df['nbrs'] = nbrs
        # df['att'] = att
        # df['spl'] = spl
        # df['katz_centrality'] = katz_centralities
        # if self.katz:
        #     df['katz'] = katzes
        if self.embeddings:
            for i, col in enumerate(self.emb_cols):
                df[col] = embeddings[i]

        if verbose:
            print("\t%d pairs checked and %d pairs in dataframe" % (count, df.shape[0]))
        df.sample(frac=1)
        return df, labels
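# The katz parameter exists so callers can amortize the centrality computation
# across several calls on the same graph; assumed usage (the object names are
# illustrative, not from the source):
katz = nx.katz_centrality(snapshot.nx_graph, alpha=.005, beta=.1,
                          tol=.00000001, max_iter=5000)
df, labels = snapshot.to_dataframe(label_graph=next_snapshot, katz=katz)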
Example #60
0
# G.add_edge(3,5)
# G.add_edge(3,4)
# G.add_edge(3,6)
# G.add_edge(4,6)

G = nx.Graph()
G.add_edge(1, 2)
G.add_edge(1, 3)
G.add_edge(1, 4)
G.add_edge(2, 4)
G.add_edge(2, 5)
G.add_edge(5, 6)
G.add_edge(5, 7)
G.add_edge(6, 7)
G.add_edge(6, 8)
G.add_edge(7, 8)
ebunch = nx.non_edges(G)

sim_dict = {}
for u, v in ebunch:
    s = len(list(nx.common_neighbors(G, u, v)))
    sim_dict[(u, v)] = s
    print(sim_dict[(u, v)])
#MI(G)

# =============================================================================
# G = nx.read_edgelist('J:\\Python\\LinkPrediction\\Networks\\test\\test.edgelist')
# nx.draw_networkx(G)
# plt.show()
# =============================================================================