Пример #1
0
def load_graph(data_path):
    """Open the train, test and sampled GraphData sets found under ``data_path``.

    Args:
        data_path: Path prefix; "/train_mr", "/test_mr" and "/sampled<i>_mr"
            (i = 0..4) are appended to it.

    Returns:
        Tuple ``(train_graph, test_graph, sampled_graph_list)``; the last item
        is a list holding the five sampled graphs in index order.
    """
    def _open(suffix):
        # Every graph is opened with the same worker count.
        return ds.GraphData(data_path + suffix, num_parallel_workers=8)

    train_graph = _open("/train_mr")
    test_graph = _open("/test_mr")
    sampled_graph_list = [
        _open("/sampled" + str(idx) + "_mr") for idx in range(5)
    ]
    return train_graph, test_graph, sampled_graph_list
Пример #2
0
def test_graphdata_generatordataset():
    """
    Test generator dataset

    Feeds a GNNGraphDataset through GeneratorDataset with a
    RandomBatchedSampler, repeats it twice and checks the shape of every
    column of every batch plus the total batch count.
    """
    logger.info('test generator dataset.\n')
    graph = ds.GraphData(DATASET_FILE)
    batch_num = 2
    edge_num = graph.graph_info()['edge_num'][0]

    # Column name -> shape expected for that column in each batch.
    expected_shapes = {
        "neighbors": (2, 7),
        "neg_neighbors": (6, 7),
        "neighbors_features": (2, 7),
        "neg_neighbors_features": (6, 7),
    }

    source = GNNGraphDataset(graph, batch_num)
    sampler = RandomBatchedSampler(edge_num, batch_num)
    dataset = ds.GeneratorDataset(source=source,
                                  column_names=list(expected_shapes),
                                  sampler=sampler,
                                  num_parallel_workers=4)
    dataset = dataset.repeat(2)

    batches_seen = 0
    for batch in dataset.create_dict_iterator(num_epochs=1, output_numpy=True):
        for column, shape in expected_shapes.items():
            assert batch[column].shape == shape
        batches_seen += 1
    assert batches_seen == 40
Пример #3
0
def get_adj_features_labels(data_dir):
    """Get adjacency matrix, node features and labels from dataset.

    Args:
        data_dir: Path handed to ``ds.GraphData``.

    Returns:
        Tuple ``(nor_adj, features, labels_onehot, labels)``:
        the dense output of ``normalize_adj`` applied to the symmetrized
        adjacency matrix with the identity added, node feature type 1,
        the float32 one-hot encoding of the labels, and node feature type 2
        (used as the raw labels).
    """
    g = ds.GraphData(data_dir)
    nodes = g.get_all_nodes(0)
    nodes_list = nodes.tolist()
    row_tensor = g.get_node_feature(nodes_list, [1, 2])
    features = row_tensor[0]
    labels = row_tensor[1]

    nodes_num = labels.shape[0]
    class_num = labels.max() + 1
    # The lookup table is indexed by label value, so it needs exactly one row
    # per class. A (nodes_num, class_num) table would raise IndexError for any
    # label >= nodes_num.
    labels_onehot = np.eye(class_num)[labels].astype(np.float32)

    neighbor = g.get_all_neighbors(nodes_list, 0)
    # Node ids are arbitrary, so map each id to its row/column position.
    node_map = {node_id: index for index, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for index, value in np.ndenumerate(neighbor):
        # The first column of neighbor is node_id, second column to last column are neighbors of the first column.
        # So we only care about index[1] > 0.
        # If the node does not have that many neighbors, -1 is padded. So if value < 0, we will not deal with it.
        if value >= 0 and index[1] > 0:
            adj[node_map[neighbor[index[0], 0]], node_map[value]] = 1
    adj = sp.coo_matrix(adj)
    # Add the transposed entry wherever it exceeds the original one (making
    # the matrix symmetric), then add the identity.
    adj = adj + adj.T.multiply(adj.T > adj) + sp.eye(nodes_num)
    nor_adj = normalize_adj(adj)
    nor_adj = np.array(nor_adj.todense())
    return nor_adj, features, labels_onehot, labels
Пример #4
0
def get_biases_features_labels(data_dir):
    """
    Get the adjacency-matrix biases plus the node features and labels.

    Args:
        data_dir: Path handed to ``ds.GraphData``.

    Returns:
        Tuple ``(biases, features, labels_onehot)``: the output of
        ``adj_to_bias`` for the adjacency matrix, node feature type 1 with a
        leading axis added, and the float32 one-hot encoding of node feature
        type 2 (the labels).
    """
    # Load the preprocessed graph data
    g = ds.GraphData(data_dir)
    # Get all nodes of the graph
    nodes = g.get_all_nodes(0)
    nodes_list = nodes.tolist()
    # Fetch feature types 1 and 2 for every node
    row_tensor = g.get_node_feature(nodes_list, [1, 2])
    features = row_tensor[0]
    # Add a leading axis to the features, as the GAT algorithm expects
    features = features[np.newaxis]

    labels = row_tensor[1]

    nodes_num = labels.shape[0]
    class_num = labels.max() + 1
    # Convert the node labels to one-hot form. The lookup table is indexed by
    # label value, so it needs one row per class; a (nodes_num, class_num)
    # table would raise IndexError for any label >= nodes_num.
    labels_onehot = np.eye(class_num)[labels].astype(np.float32)

    # Build the adjacency matrix
    neighbor = g.get_all_neighbors(nodes_list, 0)
    node_map = {node_id: index for index, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for index, value in np.ndenumerate(neighbor):
        # Column 0 holds the node itself; negative values are skipped.
        if value >= 0 and index[1] > 0:
            adj[node_map[neighbor[index[0], 0]], node_map[value]] = 1
    adj = adj[np.newaxis]

    # Compute the adjacency-matrix biases
    biases = adj_to_bias(adj)

    return biases, features, labels_onehot
Пример #5
0
def get_features_labels_mask(data_dir, train_nodes_num, eval_nodes_num, test_nodes_num):
    """
    Get the node features, labels and the train / eval / test masks.

    Args:
        data_dir: Path handed to ``ds.GraphData``.
        train_nodes_num: Size of the range ``[0, train_nodes_num)`` passed to
            ``get_mask`` for the train mask.
        eval_nodes_num: Size of the range that follows the train range,
            passed to ``get_mask`` for the eval mask.
        test_nodes_num: Size of the range at the end of the node list, passed
            to ``get_mask`` for the test mask.

    Returns:
        Tuple ``(features, labels, train_mask, test_mask, eval_mask)``.
        Note that the test mask comes before the eval mask.
    """
    # Load the preprocessed graph data
    g = ds.GraphData(data_dir)
    # Get all nodes of the graph
    nodes = g.get_all_nodes(0)
    # Sample neighbors for every node; multi-level sampling is used here
    nodes_and_neighbors = g.get_sampled_neighbors(nodes.tolist(), [10, 10], [0, 0]).tolist()
    # Fetch the features of all sampled nodes
    row_tensor = g.get_node_feature(nodes_and_neighbors, [1, 2])

    features = row_tensor[0]
    labels = row_tensor[1]

    nodes_num = labels.shape[0]

    train_mask = get_mask(nodes_num, 0, train_nodes_num)
    eval_mask = get_mask(nodes_num, train_nodes_num, train_nodes_num + eval_nodes_num)
    test_mask = get_mask(nodes_num, nodes_num - test_nodes_num, nodes_num)

    return features, labels, train_mask, test_mask, eval_mask
Пример #6
0
def get_biases_features_labels(data_dir):
    """Get biases, features, labels from Dataset

    Args:
        data_dir: Path handed to ``ds.GraphData``.

    Returns:
        Tuple ``(biases, features, labels_onehot)``: the output of
        ``adj_to_bias`` for the adjacency matrix, node feature type 1 with a
        leading axis added, and the float32 one-hot encoding of node feature
        type 2 (the labels).
    """
    g = ds.GraphData(data_dir)
    nodes = g.get_all_nodes(0)
    nodes_list = nodes.tolist()
    row_tensor = g.get_node_feature(nodes_list, [1, 2])
    features = row_tensor[0]
    # Prepend an axis of length 1 to the features.
    features = features[np.newaxis]

    labels = row_tensor[1]

    nodes_num = labels.shape[0]
    class_num = labels.max() + 1
    # The lookup table is indexed by label value, so it needs exactly one row
    # per class. A (nodes_num, class_num) table would raise IndexError for any
    # label >= nodes_num.
    labels_onehot = np.eye(class_num)[labels].astype(np.float32)

    neighbor = g.get_all_neighbors(nodes_list, 0)
    # Node ids are arbitrary, so map each id to its row/column position.
    node_map = {node_id: index for index, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for index, value in np.ndenumerate(neighbor):
        # Column 0 holds the node itself; negative values are skipped.
        if value >= 0 and index[1] > 0:
            adj[node_map[neighbor[index[0], 0]], node_map[value]] = 1
    adj = adj[np.newaxis]
    biases = adj_to_bias(adj)

    return biases, features, labels_onehot
Пример #7
0
def test_graphdata_getsampledneighbors():
    """Check the result shape of get_sampled_neighbors.

    The seed nodes are the unique first-column entries of the first 21 rows
    returned by get_nodes_from_edges.
    """
    graph = ds.GraphData(DATASET_FILE, 1)
    all_edges = graph.get_all_edges(0)
    edge_nodes = graph.get_nodes_from_edges(all_edges)
    assert len(edge_nodes) == 40

    seed_nodes = np.unique(edge_nodes[0:21, 0])
    sampled = graph.get_sampled_neighbors(seed_nodes, [2, 3], [2, 1])
    assert sampled.shape == (10, 9)
Пример #8
0
def test_graphdata_getfullneighbor():
    """Check the shapes returned by get_all_neighbors and a follow-up get_node_feature."""
    graph = ds.GraphData(DATASET_FILE, 2)
    start_nodes = graph.get_all_nodes(1)
    assert len(start_nodes) == 10

    expected_shape = (10, 6)
    neighbors = graph.get_all_neighbors(start_nodes, 2)
    assert neighbors.shape == expected_shape

    # The first returned feature must have the same shape as the id matrix.
    features = graph.get_node_feature(neighbors.tolist(), [2, 3])
    assert features[0].shape == expected_shape
Пример #9
0
def test_graphdata_graphinfo():
    """Compare the listed graph_info() fields with their expected values."""
    graph = ds.GraphData(DATASET_FILE, 2)
    info = graph.graph_info()

    # Field name -> value expected for DATASET_FILE, checked in this order.
    expected = {
        'node_type': [1, 2],
        'edge_type': [0],
        'node_num': {1: 10, 2: 10},
        'edge_num': {0: 40},
        'node_feature_type': [1, 2, 3, 4],
        'edge_feature_type': [],
    }
    for field, value in expected.items():
        assert info[field] == value
Пример #10
0
def test_graphdata_randomwalk():
    """Call random_walk with only nodes and meta path, then check the result shape."""
    graph = ds.GraphData(SOCIAL_DATA_FILE, 1)
    start_nodes = graph.get_all_nodes(1)
    node_count = len(start_nodes)
    print(node_count)
    assert node_count == 33

    # A meta path of 39 entries, all equal to 1.
    meta_path = [1] * 39
    walks = graph.random_walk(start_nodes, meta_path)
    assert walks.shape == (33, 40)
def test_graphdata_distributed():
    """
    Test distributed

    Starts graphdata_startserver in a separate process on a random port,
    connects a 'client' GraphData to that port and checks node features,
    edge ids, edge features and the batches produced through
    GeneratorDataset.
    """
    logger.info('test distributed.\n')

    port = random.randint(10000, 60000)

    server = Process(target=graphdata_startserver, args=(port, ))
    server.start()
    # Fixed pause before the client is created (presumably to let the server
    # come up).
    time.sleep(2)

    client = ds.GraphData(DATASET_FILE, 1, 'client', port=port)

    # Node ids and their features.
    node_ids = client.get_all_nodes(1)
    assert node_ids.tolist() == list(range(101, 111))
    node_features = client.get_node_feature(node_ids.tolist(), [1, 2, 3])
    expected_first_feature = [
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 1],
        [0, 0, 1, 1, 0],
        [0, 0, 0, 0, 0],
        [1, 1, 0, 1, 0],
        [0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 1],
        [0, 1, 1, 0, 0],
        [0, 1, 0, 1, 0],
    ]
    assert node_features[0].tolist() == expected_first_feature
    assert node_features[2].tolist() == [1, 2, 3, 1, 4, 3, 5, 3, 5, 4]

    # Edge ids 1..40 and the first edge feature.
    edge_ids = client.get_all_edges(0)
    assert edge_ids.tolist() == list(range(1, 41))
    edge_features = client.get_edge_feature(edge_ids, [1, 2])
    assert edge_features[0].tolist() == [
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
    ]

    # Batches produced through GeneratorDataset.
    batch_num = 2
    edge_num = client.graph_info()['edge_num'][0]
    # Column name -> shape expected for that column in each batch.
    expected_shapes = {
        "neighbors": (2, 7),
        "neg_neighbors": (6, 7),
        "neighbors_features": (2, 7),
        "neg_neighbors_features": (6, 7),
    }
    source = GNNGraphDataset(client, batch_num)
    sampler = RandomBatchedSampler(edge_num, batch_num)
    dataset = ds.GeneratorDataset(source=source,
                                  column_names=list(expected_shapes),
                                  sampler=sampler,
                                  num_parallel_workers=4,
                                  python_multiprocessing=False)
    dataset = dataset.repeat(2)

    batches_seen = 0
    for batch in dataset.create_dict_iterator():
        for column, shape in expected_shapes.items():
            assert batch[column].shape == shape
        batches_seen += 1
    assert batches_seen == 40
Пример #12
0
def test_graphdata_getedgefeature():
    """
    Test get edge feature

    Requests feature types 1 and 2 for all edges returned by
    get_all_edges(0) and checks the shape of both results.
    """
    logger.info('test get_edge_feature.\n')
    graph = ds.GraphData(DATASET_FILE)
    all_edges = graph.get_all_edges(0)
    edge_features = graph.get_edge_feature(all_edges, [1, 2])
    for position in (0, 1):
        assert edge_features[position].shape == (40, )
Пример #13
0
def test_graphdata_getnegsampledneighbors():
    """
    Test neg sampled neighbors

    Calls get_neg_sampled_neighbors for the nodes returned by
    get_all_nodes(1) and checks the result shape.
    """
    logger.info('test get negative sampled neighbors.\n')
    graph = ds.GraphData(DATASET_FILE, 2)
    start_nodes = graph.get_all_nodes(1)
    assert len(start_nodes) == 10

    neg_neighbors = graph.get_neg_sampled_neighbors(start_nodes, 5, 2)
    assert neg_neighbors.shape == (10, 6)
Пример #14
0
def test_graphdata_getedgesfromnodes():
    """
    Test get edges from nodes

    Looks up the edge id for each node pair and compares with the expected
    ids.
    """
    logger.info('test get_edges_from_nodes\n')
    graph = ds.GraphData(DATASET_FILE)

    # (node pair, expected edge id), in query order.
    cases = [
        ((101, 201), 1),
        ((103, 207), 9),
        ((204, 105), 31),
        ((108, 208), 17),
        ((110, 210), 20),
        ((210, 110), 40),
    ]
    node_pairs = [pair for pair, _ in cases]
    expected_edges = [edge_id for _, edge_id in cases]

    found_edges = graph.get_edges_from_nodes(node_pairs)
    assert found_edges.tolist() == expected_edges
Пример #15
0
def test_graphdata_randomwalk():
    """
    Test random walk

    Calls random_walk with three extra positional arguments (2.0, 0.5, -1)
    and checks the result shape.
    """
    logger.info('test random walk with given parameters.\n')
    graph = ds.GraphData(SOCIAL_DATA_FILE, 1)
    start_nodes = graph.get_all_nodes(1)
    assert len(start_nodes) == 33

    # A meta path of 39 entries, all equal to 1.
    meta_path = [1] * 39
    walks = graph.random_walk(start_nodes, meta_path, 2.0, 0.5, -1)
    assert walks.shape == (33, 40)
Пример #16
0
def test_graphdata_randomwalkdefault():
    """
    Test random walk defaults

    Calls random_walk with only the nodes and the meta path, then checks the
    result shape.
    """
    logger.info('test randomwalk with default parameters.\n')
    graph = ds.GraphData(SOCIAL_DATA_FILE, 1)
    start_nodes = graph.get_all_nodes(1)
    node_count = len(start_nodes)
    print(node_count)
    assert node_count == 33

    # A meta path of 39 entries, all equal to 1.
    meta_path = [1] * 39
    walks = graph.random_walk(start_nodes, meta_path)
    assert walks.shape == (33, 40)
Пример #17
0
def test_graphdata_getnodefeature_input_check():
    """
    Test get node feature input check

    Each (node_list, feature_types) pair below must make get_node_feature
    raise TypeError.
    """
    logger.info('test getnodefeature input check.\n')
    graph = ds.GraphData(DATASET_FILE)

    rejected_calls = (
        # Node lists mixing scalars and lists, or with rows of unequal length.
        ([1, [1, 1]], [1]),
        ([[1, 1], 1], [1]),
        ([[1, 1], [1, 1, 1]], [1]),
        ([[1, 1, 1], [1, 1]], [1]),
        ([[1, 1], [1, [1, 1]]], [1]),
        ([[1, 1], [[1, 1], 1]], [1]),
        # Feature types given as a bare int instead of a list.
        ([[1, 1], [1, 1]], 1),
        ([[1, 0.1], [1, 1]], 1),
        (np.array([[1, 0.1], [1, 1]]), 1),
        # Feature type lists containing a string.
        ([[1, 1], [1, 1]], ["a"]),
        ([[1, 1], [1, 1]], [1, "a"]),
    )
    for node_list, feature_types in rejected_calls:
        with pytest.raises(TypeError):
            graph.get_node_feature(node_list, feature_types)
Пример #18
0
def test_graphdata_getsampledneighbors():
    """
    Test sampled neighbors

    Runs get_sampled_neighbors once with SamplingStrategy.RANDOM and once
    with SamplingStrategy.EDGE_WEIGHT, checking the result shape each time.
    """
    logger.info('test get sampled neighbors.\n')
    graph = ds.GraphData(DATASET_FILE, 1)
    all_edges = graph.get_all_edges(0)
    edge_nodes = graph.get_nodes_from_edges(all_edges)
    assert len(edge_nodes) == 40

    for strategy in (SamplingStrategy.RANDOM, SamplingStrategy.EDGE_WEIGHT):
        # Seeds: unique first-column entries of the first 21 rows.
        seed_nodes = np.unique(edge_nodes[0:21, 0])
        sampled = graph.get_sampled_neighbors(seed_nodes, [2, 3], [2, 1],
                                              strategy)
        assert sampled.shape == (10, 9)
Пример #19
0
def get_adj_features_labels_mask(data_dir, train_nodes_num, eval_nodes_num,
                                 test_nodes_num):
    """
    Get the adjacency matrix, node features, labels and dataset masks.

    Args:
        data_dir: Path handed to ``ds.GraphData``.
        train_nodes_num: Size of the range ``[0, train_nodes_num)`` passed to
            ``get_mask`` for the train mask.
        eval_nodes_num: Size of the range that follows the train range,
            passed to ``get_mask`` for the eval mask.
        test_nodes_num: Size of the range at the end of the node list, passed
            to ``get_mask`` for the test mask.

    Returns:
        Tuple ``(nor_adj, features, labels_onehot, labels, train_mask,
        test_mask, eval_mask)``. Note that the test mask comes before the
        eval mask.
    """
    # Load the preprocessed graph data
    g = ds.GraphData(data_dir)
    # Get all nodes of the graph
    nodes = g.get_all_nodes(0)
    nodes_list = nodes.tolist()
    # Fetch feature types 1 and 2 for every node
    row_tensor = g.get_node_feature(nodes_list, [1, 2])
    features = row_tensor[0]
    labels = row_tensor[1]

    nodes_num = labels.shape[0]

    train_mask = get_mask(nodes_num, 0, train_nodes_num)
    eval_mask = get_mask(nodes_num, train_nodes_num,
                         train_nodes_num + eval_nodes_num)
    test_mask = get_mask(nodes_num, nodes_num - test_nodes_num, nodes_num)

    class_num = labels.max() + 1
    # Convert the node labels to one-hot form. The lookup table is indexed by
    # label value, so it needs one row per class; a (nodes_num, class_num)
    # table would raise IndexError for any label >= nodes_num.
    labels_onehot = np.eye(class_num)[labels].astype(np.float32)

    # Build the adjacency matrix
    neighbor = g.get_all_neighbors(nodes_list, 0)
    node_map = {node_id: index for index, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for index, value in np.ndenumerate(neighbor):
        # Column 0 holds the node itself; negative values are skipped.
        if value >= 0 and index[1] > 0:
            adj[node_map[neighbor[index[0], 0]], node_map[value]] = 1
    adj = sp.coo_matrix(adj)
    # Add the transposed entry wherever it exceeds the original one (making
    # the matrix symmetric), then add the identity.
    adj = adj + adj.T.multiply(adj.T > adj) + sp.eye(nodes_num)
    # Normalize the adjacency matrix (symmetric normalization, per the
    # original note; the details live in normalize_adj)
    nor_adj = normalize_adj(adj)
    nor_adj = np.array(nor_adj.todense())
    return nor_adj, features, labels_onehot, labels, train_mask, test_mask, eval_mask
Пример #20
0
def test_graphdata_getnegsampledneighbors():
    """Check the result shape of get_neg_sampled_neighbors for the get_all_nodes(1) nodes."""
    graph = ds.GraphData(DATASET_FILE, 2)
    start_nodes = graph.get_all_nodes(1)
    assert len(start_nodes) == 10

    neg_neighbors = graph.get_neg_sampled_neighbors(start_nodes, 5, 2)
    assert neg_neighbors.shape == (10, 6)
def graphdata_startserver(server_port):
    """
    start graphdata server

    Creates a GraphData over DATASET_FILE in 'server' mode on the given port.
    The instance is not kept or returned.

    Args:
        server_port: Port number forwarded to ``ds.GraphData``.
    """
    logger.info('test start server.\n')
    mode = 'server'
    ds.GraphData(DATASET_FILE, 1, mode, port=server_port)