def load_graph(data_path):
    """Open the train graph, the test graph and the five sampled graphs.

    Args:
        data_path: Path prefix (string). The suffixes "/train_mr",
            "/test_mr" and "/sampled<i>_mr" for i in 0..4 are appended to it.

    Returns:
        Tuple ``(train_graph, test_graph, sampled_graph_list)`` where the
        last item is a list of five graphs ordered by their index.
    """
    def _open(suffix):
        # Every graph is opened with the same worker count.
        return ds.GraphData(data_path + suffix, num_parallel_workers=8)

    train_graph = _open("/train_mr")
    test_graph = _open("/test_mr")
    sampled_graph_list = [
        _open("/sampled" + str(idx) + "_mr") for idx in range(5)
    ]
    return train_graph, test_graph, sampled_graph_list
def test_graphdata_generatordataset():
    """
    Feed a GraphData-backed source into GeneratorDataset and check the
    shape of every output column as well as the total number of batches.
    """
    logger.info('test generator dataset.\n')
    graph = ds.GraphData(DATASET_FILE)
    batch_size = 2
    total_edges = graph.graph_info()['edge_num'][0]

    # Column name -> shape expected for every batch.
    expected_shapes = {
        "neighbors": (2, 7),
        "neg_neighbors": (6, 7),
        "neighbors_features": (2, 7),
        "neg_neighbors_features": (6, 7),
    }

    dataset = ds.GeneratorDataset(
        source=GNNGraphDataset(graph, batch_size),
        column_names=list(expected_shapes),
        sampler=RandomBatchedSampler(total_edges, batch_size),
        num_parallel_workers=4,
    )
    dataset = dataset.repeat(2)

    batches_seen = 0
    iterator = dataset.create_dict_iterator(num_epochs=1, output_numpy=True)
    for batches_seen, batch in enumerate(iterator, start=1):
        for column, shape in expected_shapes.items():
            assert batch[column].shape == shape
    assert batches_seen == 40
def get_adj_features_labels(data_dir):
    """Build the normalized adjacency matrix, features and labels of a graph.

    Args:
        data_dir: Location handed to ``ds.GraphData``.

    Returns:
        Tuple ``(nor_adj, features, labels_onehot, labels)``: the dense
        output of ``normalize_adj``, the node features (feature type 1), the
        one-hot float32 labels and the raw labels (feature type 2).
    """
    graph = ds.GraphData(data_dir)
    nodes_list = graph.get_all_nodes(0).tolist()

    features, labels = graph.get_node_feature(nodes_list, [1, 2])[:2]
    nodes_num = labels.shape[0]
    class_num = labels.max() + 1
    labels_onehot = np.eye(nodes_num, class_num)[labels].astype(np.float32)

    neighbor = graph.get_all_neighbors(nodes_list, 0)
    # Map each node id to its position in nodes_list (row/column of adj).
    node_map = {node_id: position for position, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for row in neighbor:
        # Column 0 of each row holds the node id; the remaining columns hold
        # its neighbors, so only columns with index > 0 are edges.
        # Rows with fewer neighbors are padded with -1, which is skipped.
        for neighbor_id in row[1:]:
            if neighbor_id >= 0:
                adj[node_map[row[0]], node_map[neighbor_id]] = 1

    adj = sp.coo_matrix(adj)
    # Add the transposed entries that exceed the original ones, then add
    # the identity matrix.
    adj = adj + adj.T.multiply(adj.T > adj) + sp.eye(nodes_num)
    nor_adj = np.array(normalize_adj(adj).todense())
    return nor_adj, features, labels_onehot, labels
def get_biases_features_labels(data_dir):
    """
    Return the adjacency biases together with node features and labels.

    Args:
        data_dir: Location of the preprocessed graph data for ``ds.GraphData``.

    Returns:
        Tuple ``(biases, features, labels_onehot)``; ``features`` and the
        adjacency matrix passed to ``adj_to_bias`` both carry an extra
        leading axis.
    """
    # Load the preprocessed graph data.
    graph = ds.GraphData(data_dir)
    # Collect every node of type 0.
    nodes_list = graph.get_all_nodes(0).tolist()

    # Fetch feature types 1 and 2 for all nodes.
    node_rows = graph.get_node_feature(nodes_list, [1, 2])
    # Add a leading axis to the feature array, as the GAT model expects.
    features = node_rows[0][np.newaxis]
    labels = node_rows[1]
    nodes_num = labels.shape[0]
    class_num = labels.max() + 1
    # Convert the labels to one-hot form.
    labels_onehot = np.eye(nodes_num, class_num)[labels].astype(np.float32)

    # Build the adjacency matrix.
    neighbor = graph.get_all_neighbors(nodes_list, 0)
    node_map = {node_id: position for position, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for row in neighbor:
        # Column 0 is the node itself; negative entries are skipped.
        for neighbor_id in row[1:]:
            if neighbor_id >= 0:
                adj[node_map[row[0]], node_map[neighbor_id]] = 1
    adj = adj[np.newaxis]

    # Derive the biases from the adjacency matrix.
    biases = adj_to_bias(adj)
    return biases, features, labels_onehot
def get_features_labels_mask(data_dir, train_nodes_num, eval_nodes_num, test_nodes_num):
    """
    Return node features, labels and the train / eval / test masks.

    Args:
        data_dir: Location of the preprocessed graph data for ``ds.GraphData``.
        train_nodes_num: End index of the train range, which starts at 0.
        eval_nodes_num: Length of the eval range, which starts right after
            the train range.
        test_nodes_num: Length of the test range, which ends at the last row.

    Returns:
        Tuple ``(features, labels, train_mask, test_mask, eval_mask)``.
        Note the order: the test mask comes before the eval mask.
    """
    # Load the preprocessed graph data.
    g = ds.GraphData(data_dir)
    # Collect every node of type 0.
    nodes = g.get_all_nodes(0)
    # Sample neighbors for all nodes over two hops (10 per hop, type 0).
    nodes_and_neighbors = g.get_sampled_neighbors(nodes.tolist(), [10, 10], [0, 0]).tolist()
    # Fetch feature types 1 and 2 for the nodes and their sampled neighbors.
    row_tensor = g.get_node_feature(nodes_and_neighbors, [1, 2])
    features = row_tensor[0]
    labels = row_tensor[1]

    nodes_num = labels.shape[0]
    # The unused ``class_num = labels.max() + 1`` local was dropped: nothing
    # read it and it raised on an empty label array.
    train_mask = get_mask(nodes_num, 0, train_nodes_num)
    eval_mask = get_mask(nodes_num, train_nodes_num, train_nodes_num + eval_nodes_num)
    test_mask = get_mask(nodes_num, nodes_num - test_nodes_num, nodes_num)
    return features, labels, train_mask, test_mask, eval_mask
def get_biases_features_labels(data_dir):
    """Read a graph dataset and return its biases, features and labels.

    Args:
        data_dir: Location handed to ``ds.GraphData``.

    Returns:
        Tuple ``(biases, features, labels_onehot)``; ``features`` carries an
        extra leading axis and ``biases`` is the output of ``adj_to_bias``.
    """
    graph = ds.GraphData(data_dir)
    nodes_list = graph.get_all_nodes(0).tolist()

    node_rows = graph.get_node_feature(nodes_list, [1, 2])
    features = node_rows[0][np.newaxis]
    labels = node_rows[1]

    nodes_num = labels.shape[0]
    class_num = labels.max() + 1
    labels_onehot = np.eye(nodes_num, class_num)[labels].astype(np.float32)

    neighbor = graph.get_all_neighbors(nodes_list, 0)
    node_map = {node_id: position for position, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for (row_idx, col_idx), neighbor_id in np.ndenumerate(neighbor):
        # Column 0 is the node itself and negative ids are skipped.
        if col_idx > 0 and neighbor_id >= 0:
            source_id = neighbor[row_idx, 0]
            adj[node_map[source_id], node_map[neighbor_id]] = 1

    biases = adj_to_bias(adj[np.newaxis])
    return biases, features, labels_onehot
def test_graphdata_getsampledneighbors():
    """Check the shape returned by get_sampled_neighbors with two hops."""
    graph = ds.GraphData(DATASET_FILE, 1)
    all_edges = graph.get_all_edges(0)
    edge_nodes = graph.get_nodes_from_edges(all_edges)
    assert len(edge_nodes) == 40

    # Distinct values from the first column of the first 21 rows.
    seed_nodes = np.unique(edge_nodes[:21, 0])
    sampled = graph.get_sampled_neighbors(seed_nodes, [2, 3], [2, 1])
    assert sampled.shape == (10, 9)
def test_graphdata_getfullneighbor():
    """Check get_all_neighbors and the features fetched for its result."""
    graph = ds.GraphData(DATASET_FILE, 2)
    type1_nodes = graph.get_all_nodes(1)
    assert len(type1_nodes) == 10

    full_neighbors = graph.get_all_neighbors(type1_nodes, 2)
    assert full_neighbors.shape == (10, 6)

    feature_rows = graph.get_node_feature(full_neighbors.tolist(), [2, 3])
    first_feature = feature_rows[0]
    assert first_feature.shape == (10, 6)
def test_graphdata_graphinfo():
    """Check every field reported by graph_info for the test dataset."""
    info = ds.GraphData(DATASET_FILE, 2).graph_info()
    expected = (
        ('node_type', [1, 2]),
        ('edge_type', [0]),
        ('node_num', {1: 10, 2: 10}),
        ('edge_num', {0: 40}),
        ('node_feature_type', [1, 2, 3, 4]),
        ('edge_feature_type', []),
    )
    for field, value in expected:
        assert info[field] == value
def test_graphdata_randomwalk():
    """Check the shape of random walks started from every type-1 node."""
    graph = ds.GraphData(SOCIAL_DATA_FILE, 1)
    start_nodes = graph.get_all_nodes(1)
    node_count = len(start_nodes)
    print(node_count)
    assert node_count == 33

    # A meta path of 39 steps, each of type 1.
    meta_path = [1] * 39
    walks = graph.random_walk(start_nodes, meta_path)
    assert walks.shape == (33, 40)
def test_graphdata_distributed():
    """
    Start a GraphData server in a child process, connect to it as a client
    and check node/edge queries plus a GeneratorDataset built on the client.
    """
    logger.info('test distributed.\n')
    server_port = random.randint(10000, 60000)
    server = Process(target=graphdata_startserver, args=(server_port,))
    server.start()
    # Pause before connecting; presumably gives the server time to come up.
    time.sleep(2)

    client = ds.GraphData(DATASET_FILE, 1, 'client', port=server_port)

    # Node ids and node features.
    node_ids = client.get_all_nodes(1).tolist()
    assert node_ids == list(range(101, 111))
    node_rows = client.get_node_feature(node_ids, [1, 2, 3])
    assert node_rows[0].tolist() == [
        [0, 1, 0, 0, 0],
        [1, 0, 0, 0, 1],
        [0, 0, 1, 1, 0],
        [0, 0, 0, 0, 0],
        [1, 1, 0, 1, 0],
        [0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 1],
        [0, 1, 1, 0, 0],
        [0, 1, 0, 1, 0],
    ]
    assert node_rows[2].tolist() == [1, 2, 3, 1, 4, 3, 5, 3, 5, 4]

    # Edge ids and edge features.
    edges = client.get_all_edges(0)
    assert edges.tolist() == list(range(1, 41))
    edge_rows = client.get_edge_feature(edges, [1, 2])
    # The expected 40 values are the same 10-value pattern four times over.
    assert edge_rows[0].tolist() == [0, 1, 0, 0, 1, 0, 0, 0, 0, 0] * 4

    # Dataset pipeline driven by the client graph.
    batch_size = 2
    total_edges = client.graph_info()['edge_num'][0]
    expected_shapes = {
        "neighbors": (2, 7),
        "neg_neighbors": (6, 7),
        "neighbors_features": (2, 7),
        "neg_neighbors_features": (6, 7),
    }
    dataset = ds.GeneratorDataset(
        source=GNNGraphDataset(client, batch_size),
        column_names=list(expected_shapes),
        sampler=RandomBatchedSampler(total_edges, batch_size),
        num_parallel_workers=4,
        python_multiprocessing=False,
    )
    dataset = dataset.repeat(2)

    batches_seen = 0
    for batches_seen, batch in enumerate(dataset.create_dict_iterator(), start=1):
        for column, shape in expected_shapes.items():
            assert batch[column].shape == shape
    assert batches_seen == 40
def test_graphdata_getedgefeature():
    """
    Fetch two edge feature types for all edges and check their shapes.
    """
    logger.info('test get_edge_feature.\n')
    graph = ds.GraphData(DATASET_FILE)
    all_edges = graph.get_all_edges(0)
    edge_rows = graph.get_edge_feature(all_edges, [1, 2])
    # Both requested feature types must come back with shape (40,).
    for position in (0, 1):
        assert edge_rows[position].shape == (40,)
def test_graphdata_getnegsampledneighbors():
    """
    Check the shape returned by get_neg_sampled_neighbors.
    """
    logger.info('test get negative sampled neighbors.\n')
    graph = ds.GraphData(DATASET_FILE, 2)
    type1_nodes = graph.get_all_nodes(1)
    assert len(type1_nodes) == 10

    negatives = graph.get_neg_sampled_neighbors(type1_nodes, 5, 2)
    assert negatives.shape == (10, 6)
def test_graphdata_getedgesfromnodes():
    """
    Look up edge ids for a list of node pairs and check the result.
    """
    logger.info('test get_edges_from_nodes\n')
    graph = ds.GraphData(DATASET_FILE)
    # Node pair -> edge id expected for it, in query order.
    expected_edge_by_pair = [
        ((101, 201), 1),
        ((103, 207), 9),
        ((204, 105), 31),
        ((108, 208), 17),
        ((110, 210), 20),
        ((210, 110), 40),
    ]
    node_pairs = [pair for pair, _ in expected_edge_by_pair]
    expected_edges = [edge_id for _, edge_id in expected_edge_by_pair]

    found = graph.get_edges_from_nodes(node_pairs)
    assert found.tolist() == expected_edges
def test_graphdata_randomwalk():
    """
    Check the shape of random walks run with explicit extra arguments.
    """
    logger.info('test random walk with given parameters.\n')
    graph = ds.GraphData(SOCIAL_DATA_FILE, 1)
    start_nodes = graph.get_all_nodes(1)
    assert len(start_nodes) == 33

    # A meta path of 39 steps, each of type 1.
    meta_path = [1] * 39
    walks = graph.random_walk(start_nodes, meta_path, 2.0, 0.5, -1)
    assert walks.shape == (33, 40)
def test_graphdata_randomwalkdefault():
    """
    Check the shape of random walks run with only the required arguments.
    """
    logger.info('test randomwalk with default parameters.\n')
    graph = ds.GraphData(SOCIAL_DATA_FILE, 1)
    start_nodes = graph.get_all_nodes(1)
    node_count = len(start_nodes)
    print(node_count)
    assert node_count == 33

    # A meta path of 39 steps, each of type 1.
    meta_path = [1] * 39
    walks = graph.random_walk(start_nodes, meta_path)
    assert walks.shape == (33, 40)
def test_graphdata_getnodefeature_input_check():
    """
    Check that get_node_feature raises TypeError for each malformed call.
    """
    logger.info('test getnodefeature input check.\n')
    graph = ds.GraphData(DATASET_FILE)

    # Each entry is (node_list, feature_types) for one rejected call.
    rejected_calls = (
        ([1, [1, 1]], [1]),
        ([[1, 1], 1], [1]),
        ([[1, 1], [1, 1, 1]], [1]),
        ([[1, 1, 1], [1, 1]], [1]),
        ([[1, 1], [1, [1, 1]]], [1]),
        ([[1, 1], [[1, 1], 1]], [1]),
        ([[1, 1], [1, 1]], 1),
        ([[1, 0.1], [1, 1]], 1),
        (np.array([[1, 0.1], [1, 1]]), 1),
        ([[1, 1], [1, 1]], ["a"]),
        ([[1, 1], [1, 1]], [1, "a"]),
    )
    for node_list, feature_types in rejected_calls:
        with pytest.raises(TypeError):
            graph.get_node_feature(node_list, feature_types)
def test_graphdata_getsampledneighbors():
    """
    Check the shape returned by get_sampled_neighbors for both the RANDOM
    and the EDGE_WEIGHT sampling strategies.
    """
    logger.info('test get sampled neighbors.\n')
    graph = ds.GraphData(DATASET_FILE, 1)
    all_edges = graph.get_all_edges(0)
    edge_nodes = graph.get_nodes_from_edges(all_edges)
    assert len(edge_nodes) == 40

    for strategy in (SamplingStrategy.RANDOM, SamplingStrategy.EDGE_WEIGHT):
        # Distinct values from the first column of the first 21 rows.
        seed_nodes = np.unique(edge_nodes[:21, 0])
        sampled = graph.get_sampled_neighbors(seed_nodes, [2, 3], [2, 1], strategy)
        assert sampled.shape == (10, 9)
def get_adj_features_labels_mask(data_dir, train_nodes_num, eval_nodes_num, test_nodes_num):
    """
    Return the normalized adjacency matrix, node features, labels and masks.

    Args:
        data_dir: Location of the preprocessed graph data for ``ds.GraphData``.
        train_nodes_num: End index of the train range, which starts at 0.
        eval_nodes_num: Length of the eval range, which starts right after
            the train range.
        test_nodes_num: Length of the test range, which ends at the last row.

    Returns:
        Tuple ``(nor_adj, features, labels_onehot, labels, train_mask,
        test_mask, eval_mask)``. Note that the test mask precedes the eval
        mask.
    """
    # Load the preprocessed graph data.
    graph = ds.GraphData(data_dir)
    # Collect every node of type 0.
    nodes_list = graph.get_all_nodes(0).tolist()

    # Fetch feature types 1 and 2 for all nodes.
    node_rows = graph.get_node_feature(nodes_list, [1, 2])
    features = node_rows[0]
    labels = node_rows[1]
    nodes_num = labels.shape[0]

    eval_start = train_nodes_num
    eval_stop = train_nodes_num + eval_nodes_num
    train_mask = get_mask(nodes_num, 0, train_nodes_num)
    eval_mask = get_mask(nodes_num, eval_start, eval_stop)
    test_mask = get_mask(nodes_num, nodes_num - test_nodes_num, nodes_num)

    class_num = labels.max() + 1
    # Convert the labels to one-hot form.
    labels_onehot = np.eye(nodes_num, class_num)[labels].astype(np.float32)

    # Build the adjacency matrix.
    neighbor = graph.get_all_neighbors(nodes_list, 0)
    node_map = {node_id: position for position, node_id in enumerate(nodes_list)}
    adj = np.zeros([nodes_num, nodes_num], dtype=np.float32)
    for row in neighbor:
        # Column 0 is the node itself; negative entries are skipped.
        for neighbor_id in row[1:]:
            if neighbor_id >= 0:
                adj[node_map[row[0]], node_map[neighbor_id]] = 1
    adj = sp.coo_matrix(adj)
    adj = adj + adj.T.multiply(adj.T > adj) + sp.eye(nodes_num)

    # Normalize the adjacency matrix and convert it to a dense array.
    nor_adj = np.array(normalize_adj(adj).todense())
    return nor_adj, features, labels_onehot, labels, train_mask, test_mask, eval_mask
def test_graphdata_getnegsampledneighbors():
    """Check the shape returned by get_neg_sampled_neighbors."""
    graph = ds.GraphData(DATASET_FILE, 2)
    type1_nodes = graph.get_all_nodes(1)
    assert len(type1_nodes) == 10

    negatives = graph.get_neg_sampled_neighbors(type1_nodes, 5, 2)
    assert negatives.shape == (10, 6)
def graphdata_startserver(server_port):
    """
    Construct a GraphData object in 'server' mode on the given port.

    Args:
        server_port: Port number passed through to ``ds.GraphData``.
    """
    logger.info('test start server.\n')
    worker_count = 1
    ds.GraphData(DATASET_FILE, worker_count, 'server', port=server_port)