Example #1
def get_additional_features(config, edge_index, edge_attr, args):
    data = torch.sparse.FloatTensor(edge_index, edge_attr.squeeze(1))
    features = [[] for i in range(config['n_vertex'])]

    # for i in range(data.size(0)):
    #     features.append(data[i].to_dense().cpu().tolist())

    if args.node2vec:
        cache_path = os.path.join(DATASET_DIR, args.data, 'embedding.pt')
        if not os.path.exists(cache_path) or args.overwrite_cache:
            nx_G = node2vec.read_graph(config, edge_index, edge_attr)
            G = node2vec.Node2Vec(nx_G, True, 1., 1., args.verbose)
            G.preprocess_transition_probs()
            walks = G.simulate_walks(40, 10)
            embedding = node2vec.learn_embeddings(walks)
            embeddings = []
            for i in range(config['n_vertex']):
                embeddings.append(embedding.wv[str(i)].tolist())
            torch.save(embeddings, cache_path)
        else:
            embeddings = torch.load(cache_path)

        for i in range(config['n_vertex']):
            features[i] += embeddings[i]

    return torch.tensor(features).float()
Example #2
    def get_walks(self, walk_length, num_walks_per_node, p, q, workers,
                  precomputed):
        if precomputed:
            self.load_walks()
        else:
            self.walks = node2vec.Node2Vec(self.graph,
                                           walk_length=walk_length,
                                           num_walks=num_walks_per_node,
                                           p=p,
                                           q=q,
                                           workers=workers).walks
            self.save_walks()
Example #3
    def generate_cluster_map(self):
        n2v = node2vec.Node2Vec(self.graph,
                                dimensions=64,
                                walk_length=30,
                                num_walks=200,
                                workers=5)
        model = n2v.fit(window=10, min_count=1, batch_words=4)

        X = []
        for i in range(len(self.graph)):
            X.append(model.wv[str(i)])

        kmeans = KMeans(n_clusters=self.action_space, random_state=0).fit(X)
        self.cluster_map = {}
        for node in range(len(self.graph)):
            self.cluster_map[node] = kmeans.labels_[node]
Example #4
def run_node2vec(graph, save_path):
    """
    Runs the node2vec method from Node2Vec on a given graph, saves it as a pickle

    Parameters:
    graph (Networkx graph): NetworkX graph objects
    save_path (filepath): Filepath for where to save the pickled model
    """

    #Parameter p is the propability of revisitting a node you have just seen,
    #a high value means we are less likely to backtract to it
    #Parameter q makes the random walk more biased towards nodes close to our starting node,
    # a high value makes it stay close to out start node
    graphn2v = n2v.Node2Vec(graph,
                            dimensions=50,
                            walk_length=40,
                            num_walks=50,
                            p=1,
                            q=2,
                            workers=1)

    n2vmodel = graphn2v.fit(window=10, min_count=5)

    pickle.dump(n2vmodel, open(save_path, "wb"))
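
A minimal usage sketch (not from the original repository; the toy graph and the output filename below are assumptions for illustration):

import networkx as nx

# Any NetworkX graph works; karate_club_graph is a small built-in example.
toy_graph = nx.karate_club_graph()
run_node2vec(toy_graph, "n2v_model.pkl")  # fits node2vec and pickles the model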
Example #5
def main():
    nod = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24
    ]
    edg = [(1, 2), (1, 3), (2, 6), (3, 4), (3, 12), (4, 5), (4, 11), (5, 6),
           (5, 9), (6, 8), (7, 8), (7, 18), (8, 9), (8, 16), (9, 10), (10, 11),
           (10, 15), (10, 16), (10, 17), (11, 12), (11, 14), (12, 13),
           (13, 24), (14, 15), (14, 23), (15, 19), (15, 22), (16, 17),
           (16, 18), (17, 19), (18, 20), (19, 20), (20, 21), (20, 22),
           (21, 22), (21, 24), (22, 23), (23, 24)]
    G1 = nx.Graph()
    G1.add_nodes_from(nod)
    G1.add_edges_from(edg)
    for k in range(1000, 2000):
        for m in range(22, 1, -1):
            print(time.time())
            initil = node2vec.Node2Vec(G1,
                                       dimensions=m,
                                       walk_length=50,
                                       num_walks=60,
                                       p=2,
                                       q=0.5)
            print(time.time())
            model = initil.fit()
            print(model.wv.vectors)
            phi = []
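            # Row 0 is left empty so that the 1-based node IDs index phi directly.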
            phi.append([])
            for i in range(24):
                phi_i = model.wv.get_vector(str(i + 1)).tolist()
                phi_i.append(1)
                phi.append(phi_i)
            np.save(file="/Users/pqh/Desktop/route/Sioux/Sioux_d" + str(m) +
                    "_" + str(k) + "_phi.npy",
                    arr=phi)
            print(time.time())
Example #6
def enclosed_subgraph(g,
                      hop=1,
                      max_hop_nodes=None,
                      link_percent=1.,
                      embedding_dim=0,
                      has_feature=False,
                      inject_neg_links=True,
                      multi_process=False):
    """
    抽取封闭子图
    Args:
        g: networkx Graph
        hop: 最大hop
        max_hop_nodes: 每个hop中,最大节点数量
        link_percent: g 中用于预测的边的比例
        has_feature: 是否使用节点特征,如果为True则每个节点有属性"has_feature"
        embedding: 是否使用node2vec生成每个节点的embedding
        multi_process: 是否使用多进程(单进程,速度慢,可调试; 多进程,速度快,不可调试,Windows下不可用)
    Return: 正例子图列表, 负例子图列表
    """
    # Positive links
    pos_links = list(g.edges)
    pos_links = rand.sample(pos_links, int(g.number_of_edges() * link_percent))
    num_pos_links = len(pos_links)

    # Negative links: sample the same number of non-adjacent node pairs
    neg_links = []
    nodes = list(g.nodes)
    i = 0
    while True:
        node1 = rand.choice(nodes)
        node2 = rand.choice(nodes)
        if node1 == node2 or g.has_edge(node1, node2):
            continue
        neg_links.append((node1, node2))
        i += 1
        if i >= num_pos_links:
            break

    # Add node2vec embeddings as node features
    if embedding_dim > 0:
        print("node2vec embedding ... ...")
        if inject_neg_links:  # optionally inject the negative links first
            g.add_edges_from(neg_links)
        n2v_model = nv.Node2Vec(g,
                                dimensions=embedding_dim,
                                walk_length=30,
                                num_walks=10,
                                workers=4)
        n2v_wv = n2v_model.fit().wv
        nv_dict = {
            int(n): v
            for n, v in zip(n2v_wv.index2word, n2v_wv.vectors)
        }
        if not has_feature:
            nx.set_node_attributes(g, nv_dict, 'feature')
        else:
            feat_dict = nx.get_node_attributes(g, 'feature')
            features = {
                n: np.concatenate([feat_dict[n], nv_dict[n]])
                for n in feat_dict.keys()
            }
            nx.set_node_attributes(g, features, 'feature')
        if inject_neg_links:
            g.remove_edges_from(neg_links)

    # Extract the enclosing subgraphs
    pos_sub_gs = extract_subgraph_from_links(g,
                                             pos_links,
                                             hop,
                                             max_hop_nodes,
                                             multi_process,
                                             info='pos')
    neg_sub_gs = extract_subgraph_from_links(g,
                                             neg_links,
                                             hop,
                                             max_hop_nodes,
                                             multi_process,
                                             info='neg')

    # Collect the Double-Radius Node Labels of all subgraphs
    dr_label_set = set()
    for sg in pos_sub_gs:
        sg.label = [1, 0]
        dr_label_set = dr_label_set.union(
            set(nx.get_node_attributes(sg, 'dr_label').values()))
    for sg in neg_sub_gs:
        sg.label = [0, 1]
        dr_label_set = dr_label_set.union(
            set(nx.get_node_attributes(sg, 'dr_label').values()))
    dr_label_dict = {v: i for i, v in enumerate(list(dr_label_set))}
    dr_label_dim = len(dr_label_set)

    # Append the one-hot Double-Radius Node Label to each node's features
    for gs in pos_sub_gs + neg_sub_gs:
        for n in gs.nodes:
            dr_l = gs.nodes[n]['dr_label']
            dr_onehot = np.array(onehot(dr_label_dict[dr_l], dr_label_dim),
                                 dtype=np.float32)
            if has_feature or embedding_dim > 0:
                gs.nodes[n]['feature'] = np.concatenate(
                    [gs.nodes[n]['feature'], dr_onehot]).astype(np.float32)
            else:
                gs.nodes[n]['feature'] = dr_onehot
    return pos_sub_gs, neg_sub_gs
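
A hypothetical usage sketch (not part of the original module; it assumes the helpers extract_subgraph_from_links and onehot used above are importable, and the random graph is just a stand-in):

import networkx as nx

g = nx.erdos_renyi_graph(100, 0.05, seed=0)
pos_gs, neg_gs = enclosed_subgraph(g, hop=1, embedding_dim=16)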
Example #7
def get_feature_extractor(network, features):
    """
    Get function that extracts specified features from a pair of nodes.

    Args:
        network (object): Networkx representation of the network.
        features (list): List of names of features to extract.
    
    Returns:
        (function): Function that takes a network and two nodes (node pair) and
        computes the specified features in the form of a numpy array.
    """
    def get_feature(network, n1, n2, feature):
        """
        Get specified feature for pair of nodes n1 and n2.
        This function is used by the get_feature_extractor function.

        Args:
            network (object): Networkx representation of the network.
            n1 (str): First node in pair.
            n2 (str): Second node in pair.
            feature (str): Name of feature to extract.

        Returns:
            (float): The extracted feature.
        """

        # Extract specified feature.
        if feature == 'common-neighbors':

            # Return number of common neighbors.
            return len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))

        elif feature == 'jaccard-coefficient':

            # Return Jaccard coefficient for the node pair.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            size_un = len(
                set(network.neighbors(n1)).union(network.neighbors(n2)))
            return size_int / size_un if size_un > 0.0 else 0.0

        elif feature == 'hub-promoted':

            # Return Hub-promoted index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = min(len(set(network.neighbors(n1))),
                        len(set(network.neighbors(n2))))
            if denom > 0:
                return size_int / denom
            else:
                return 0

        elif feature == 'adamic-adar':

            # Compute and return Adamic-Adar index.
            return np.sum([
                1 / np.log(len(set(network.neighbors(n)))) for n in set(
                    network.neighbors(n1)).intersection(network.neighbors(n2))
                if len(set(network.neighbors(n))) > 1
            ])

        elif feature == 'resource-allocation':

            # Compute and return resource-allocation index.
            return np.sum([
                1 / len(set(network.neighbors(n))) for n in set(
                    network.neighbors(n1)).intersection(network.neighbors(n2))
                if len(set(network.neighbors(n))) > 0
            ])

        elif feature == 'sorenson':

            # Compute and return Sorenson index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = len(set(network.neighbors(n1))) + len(
                set(network.neighbors(n2)))
            return size_int / denom if denom > 0.0 else 0.0

        elif feature == 'hub-depressed':

            # Return Hub-depressed index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = max(len(set(network.neighbors(n1))),
                        len(set(network.neighbors(n2))))
            if denom > 0:
                return size_int / denom
            else:
                return 0

        elif feature == 'salton':

            # Compute and return Salton index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = np.sqrt(
                len(set(network.neighbors(n1))) *
                len(set(network.neighbors(n2))))
            return size_int / denom if denom > 0.0 else 0.0

        elif feature == 'leicht-holme-nerman':

            # Compute and return Leicht-Holme-Newman index.
            size_int = len(
                set(network.neighbors(n1)).intersection(network.neighbors(n2)))
            denom = len(set(network.neighbors(n1))) * len(
                set(network.neighbors(n2)))
            return size_int / denom if denom > 0.0 else 0.0

        elif feature == 'preferential-attachment':

            # Compute and return preferential-attachment index.
            return len(set(network.neighbors(n1))) * len(
                set(network.neighbors(n2)))

        elif feature == 'local-random-walk':

            # Compute Local random walk score.
            return local_random_walk(network, n1, n2, p_tran)

        elif feature == 'superposed-random-walk':

            # Compute superposed random walk score.
            return sum(
                [local_random_walk(network, n1, n2, p_tran) for _ in range(5)])

        elif feature == 'simrank':

            # Return Simrank score.
            return simrank_scores[n1][n2]

        elif feature == 'same-community':

            # Return flag specifying whether the two nodes are part of
            # the same community or not.
            return int(communities[n1] == communities[n2])

        elif feature == 'community-index':

            # If nodes not part of same community, return 0.
            if communities[n1] != communities[n2]:
                return 0
            else:

                # Get community index of both nodes.
                communitiy_idx = communities[n1]

                # Compute community index.
                return m_counts[communitiy_idx] / comb(
                    n_counts[communitiy_idx], 2)

        elif feature == 'page-rank':

            # Compare PageRank scores of the nodes.
            return abs(page_rank[n1] - page_rank[n2])

        elif feature == 'node2vec':

            # Return concatenated embeddings (alternatively, use the cosine
            # distance between them).
            return np.hstack((n2v_model.wv[str(n1)], n2v_model.wv[str(n2)]))
            # return spatial.distance.cosine(n2v_model.wv[str(n1)], n2v_model.wv[str(n2)])

        elif feature == 'random':

            # Return random value as feature.
            return np.random.rand()
        else:
            raise ValueError('Unknown feature ' + feature)

    def feature_extractor(network, n1, n2, features):
        """
        The feature extractor function. This function is partially applied
        with the list of features and returned by the get_feature_extractor function.

        Args:
            network (object): Networkx representation of the network.
            n1 (str): First node in pair.
            n2 (str): Second node in pair.
            features (list): List of names of features to extract.
        """

        return np.hstack(
            [get_feature(network, n1, n2, feature) for feature in features])

    ### PRECOMPUTED DATA FOR WHOLE NETWORK (NEEDED FOR SOME MEASURES) ###
    if 'simrank' in features:

        # Compute simrank scores.
        simrank_scores = nx.algorithms.similarity.simrank_similarity(network)

    if 'local-random-walk' in features or 'superposed-random-walk' in features:

        # Get adjacency matrix and compute probabilities of transitions.
        adj = nx.to_scipy_sparse_matrix(network)
        p_tran = sklearn.preprocessing.normalize(adj, norm='l1', axis=0)

    if 'same-community' in features or 'community-index' in features:

        # Get communities.
        communities = community.best_partition(network, randomize=True)

        # Initialize dictionary mapping community indices to counts of links contained within them.
        m_counts = dict.fromkeys(set(communities.values()), 0)

        # Count number of nodes in each community.
        n_counts = Counter(communities.values())

        # Go over links in network.
        for edge in network.edges():

            # If link within community, add to accumulator for that community.
            if communities[edge[0]] == communities[edge[1]]:
                m_counts[communities[edge[0]]] += 1

    if 'page-rank' in features:

        # Compute PageRank of nodes
        page_rank = nx.pagerank(network)

    if 'node2vec' in features:
        import node2vec
        n2v = node2vec.Node2Vec(network,
                                dimensions=64,
                                walk_length=30,
                                num_walks=20,
                                workers=8)
        n2v_model = n2v.fit(window=10, min_count=1, batch_words=4)

    #####################################################################

    return (
        lambda network, n1, n2: feature_extractor(network, n1, n2, features))
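
A minimal usage sketch (not from the original code; the graph and node pair are stand-ins). Feature names are the string keys handled above; those that need no precomputation can be used directly:

import networkx as nx

G = nx.karate_club_graph()
extract = get_feature_extractor(G, ['common-neighbors', 'jaccard-coefficient'])
print(extract(G, 0, 33))  # numpy array with one value per requested feature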
Example #8
    parser.add_argument('--min-count', type=int, default=0,
                        help='Minimum word count threshold. Default is 0.')

    parser.add_argument('--sg', type=int, default=1,
                        help='Skip-gram/CBOW: 0 is CBOW and 1 is skip-gram. Default is 1.')

    parser.add_argument('--hs', type=int, default=1,
                        help='Use hierarchical softmax or not: 1 yes, 0 no. Default is 1.')
    parser.set_defaults(directed=False)
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    # Load the Cora citation edges and remap node IDs to consecutive integers.
    cora_edge = pd.read_table(cora_address + cora_cite, sep='\t',
                              names=['src', 'dst'])
    G = nx.Graph()
    node_dict = {node: idx for idx, node in
                 enumerate(set(np.append(cora_edge.src.values,
                                         cora_edge.dst.values)))}
    cora_edge['src'] = cora_edge['src'].apply(lambda x: str(node_dict[x]))
    cora_edge['dst'] = cora_edge['dst'].apply(lambda x: str(node_dict[x]))
    cora_edge_list = cora_edge.values.tolist()

    def map_func(x):
        return (x[0], x[1], {'weight': 1})

    cora_edge_list = list(map(map_func, cora_edge_list))

    G.add_edges_from(cora_edge_list)
    startTime = time.perf_counter()
    n2v = node2vec.Node2Vec(G, args.p, args.q)
    n2v.train(num_walks=args.num_walks, walk_length=args.walk_length,
              embed_size=args.embed_size, window_size=args.window_size,
              workers=args.workers, iter_num=args.iter,
              min_count=args.min_count, sg=args.sg, hs=args.hs)
    endTime = time.perf_counter()
    print('epoch {}: dimension from {} to {} took {} s'.format(
        args.iter, len(node_dict), args.embed_size, endTime - startTime))
Example #9
def node2vec_cora():
    print("NODE2VEC")
    X, A, y = data.load_data(dataset='cora')
    node2vec = nv.Node2Vec(A)
    return node2vec.train(y)