def _test_construct_graphs_hetero():
    from dgl.data.csv_dataset_base import NodeData, EdgeData, DGLGraphConstructor
    # node_ids could be non-sorted, duplicated, not labeled from 0 to num_nodes-1
    num_nodes = 100
    num_edges = 1000
    num_dims = 3
    num_dup_nodes = int(num_nodes*0.2)
    ntypes = ['user', 'item']
    node_data = []
    node_ids_dict = {}
    ndata_dict = {}
    for ntype in ntypes:
        node_ids = np.random.choice(
            np.arange(num_nodes*2), size=num_nodes, replace=False)
        assert len(node_ids) == num_nodes
        np.random.shuffle(node_ids)
        node_ids = np.hstack((node_ids, node_ids[:num_dup_nodes]))
        t_ndata = {'feat': np.random.rand(num_nodes+num_dup_nodes, num_dims),
                   'label': np.random.randint(2, size=num_nodes+num_dup_nodes)}
        _, u_indices = np.unique(node_ids, return_index=True)
        ndata = {'feat': t_ndata['feat'][u_indices],
                 'label': t_ndata['label'][u_indices]}
        node_data.append(NodeData(node_ids, t_ndata, type=ntype))
        node_ids_dict[ntype] = node_ids
        ndata_dict[ntype] = ndata
    etypes = [('user', 'follow', 'user'), ('user', 'like', 'item')]
    edge_data = []
    edata_dict = {}
    for src_type, e_type, dst_type in etypes:
        src_ids = np.random.choice(node_ids_dict[src_type], size=num_edges)
        dst_ids = np.random.choice(node_ids_dict[dst_type], size=num_edges)
        edata = {'feat': np.random.rand(num_edges, num_dims),
                 'label': np.random.randint(2, size=num_edges)}
        edge_data.append(EdgeData(src_ids, dst_ids, edata,
                                  type=(src_type, e_type, dst_type)))
        edata_dict[(src_type, e_type, dst_type)] = edata
    graphs, data_dict = DGLGraphConstructor.construct_graphs(
        node_data, edge_data)
    assert len(graphs) == 1
    assert len(data_dict) == 0
    g = graphs[0]
    assert not g.is_homogeneous
    assert g.num_nodes() == num_nodes*len(ntypes)
    assert g.num_edges() == num_edges*len(etypes)

    def assert_data(lhs, rhs):
        for key, value in lhs.items():
            assert key in rhs
            assert F.array_equal(F.tensor(value), rhs[key])

    for ntype in g.ntypes:
        assert g.num_nodes(ntype) == num_nodes
        assert_data(ndata_dict[ntype], g.nodes[ntype].data)
    for etype in g.canonical_etypes:
        assert g.num_edges(etype) == num_edges
        assert_data(edata_dict[etype], g.edges[etype].data)
def _test_construct_graphs_homo():
    from dgl.data.csv_dataset_base import NodeData, EdgeData, DGLGraphConstructor
    # node_ids could be non-sorted, duplicated, not labeled from 0 to num_nodes-1
    num_nodes = 100
    num_edges = 1000
    num_dims = 3
    num_dup_nodes = int(num_nodes * 0.2)
    node_ids = np.random.choice(
        np.arange(num_nodes * 2), size=num_nodes, replace=False)
    assert len(node_ids) == num_nodes
    np.random.shuffle(node_ids)
    node_ids = np.hstack((node_ids, node_ids[:num_dup_nodes]))
    t_ndata = {
        'feat': np.random.rand(num_nodes + num_dup_nodes, num_dims),
        'label': np.random.randint(2, size=num_nodes + num_dup_nodes)
    }
    _, u_indices = np.unique(node_ids, return_index=True)
    ndata = {
        'feat': t_ndata['feat'][u_indices],
        'label': t_ndata['label'][u_indices]
    }
    node_data = NodeData(node_ids, t_ndata)
    src_ids = np.random.choice(node_ids, size=num_edges)
    dst_ids = np.random.choice(node_ids, size=num_edges)
    edata = {
        'feat': np.random.rand(num_edges, num_dims),
        'label': np.random.randint(2, size=num_edges)
    }
    edge_data = EdgeData(src_ids, dst_ids, edata)
    graphs, data_dict = DGLGraphConstructor.construct_graphs(
        node_data, edge_data)
    assert len(graphs) == 1
    assert len(data_dict) == 0
    g = graphs[0]
    assert g.is_homogeneous
    assert g.num_nodes() == num_nodes
    assert g.num_edges() == num_edges

    def assert_data(lhs, rhs):
        for key, value in lhs.items():
            assert key in rhs
            assert F.array_equal(F.tensor(value), rhs[key])

    assert_data(ndata, g.ndata)
    assert_data(edata, g.edata)
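
# Both construction tests above feed DGLGraphConstructor raw node IDs that are
# unsorted, duplicated, and not labeled 0..num_nodes-1, then check that the
# resulting graph uses a compacted ID space. The sketch below illustrates that
# remapping idea only -- it is not the DGLGraphConstructor implementation, and
# the helper name is made up for illustration.
def _sketch_compact_ids_and_build_graph(node_ids, src_ids, dst_ids):
    import dgl
    # np.unique returns the sorted distinct IDs; searchsorted then maps every
    # raw ID (duplicates included) to its position in that 0..N-1 range.
    unique_ids = np.unique(node_ids)
    src = np.searchsorted(unique_ids, src_ids)
    dst = np.searchsorted(unique_ids, dst_ids)
    return dgl.graph((src.tolist(), dst.tolist()), num_nodes=len(unique_ids))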
def _test_NodeEdgeGraphData():
    from dgl.data.csv_dataset_base import NodeData, EdgeData, GraphData
    # NodeData basics
    num_nodes = 100
    node_ids = np.arange(num_nodes, dtype=np.float64)
    ndata = NodeData(node_ids, {})
    assert ndata.id.dtype == np.int64
    assert np.array_equal(ndata.id, node_ids.astype(np.int64))
    assert len(ndata.data) == 0
    assert ndata.type == '_V'
    assert np.array_equal(ndata.graph_id, np.full(num_nodes, 0))
    # NodeData more
    data = {'feat': np.random.rand(num_nodes, 3)}
    graph_id = np.arange(num_nodes)
    ndata = NodeData(node_ids, data, type='user', graph_id=graph_id)
    assert ndata.type == 'user'
    assert np.array_equal(ndata.graph_id, graph_id)
    assert len(ndata.data) == len(data)
    for k, v in data.items():
        assert k in ndata.data
        assert np.array_equal(ndata.data[k], v)
    # NodeData except
    expect_except = False
    try:
        NodeData(np.arange(num_nodes), {'feat': np.random.rand(
            num_nodes+1, 3)}, graph_id=np.arange(num_nodes-1))
    except:
        expect_except = True
    assert expect_except
    # EdgeData basics
    num_nodes = 100
    num_edges = 1000
    src_ids = np.random.randint(num_nodes, size=num_edges)
    dst_ids = np.random.randint(num_nodes, size=num_edges)
    edata = EdgeData(src_ids, dst_ids, {})
    assert np.array_equal(edata.src, src_ids)
    assert np.array_equal(edata.dst, dst_ids)
    assert edata.type == ('_V', '_E', '_V')
    assert len(edata.data) == 0
    assert np.array_equal(edata.graph_id, np.full(num_edges, 0))
    # EdgeData more
    src_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
    dst_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
    data = {'feat': np.random.rand(num_edges, 3)}
    etype = ('user', 'like', 'item')
    graph_ids = np.arange(num_edges)
    edata = EdgeData(src_ids, dst_ids, data, type=etype, graph_id=graph_ids)
    assert edata.src.dtype == np.int64
    assert edata.dst.dtype == np.int64
    assert np.array_equal(edata.src, src_ids)
    assert np.array_equal(edata.dst, dst_ids)
    assert edata.type == etype
    assert len(edata.data) == len(data)
    for k, v in data.items():
        assert k in edata.data
        assert np.array_equal(edata.data[k], v)
    assert np.array_equal(edata.graph_id, graph_ids)
    # EdgeData except
    expect_except = False
    try:
        EdgeData(np.arange(num_edges), np.arange(
            num_edges+1), {'feat': np.random.rand(num_edges-1, 3)},
            graph_id=np.arange(num_edges+2))
    except:
        expect_except = True
    assert expect_except
    # GraphData basics
    num_graphs = 10
    graph_ids = np.arange(num_graphs)
    gdata = GraphData(graph_ids, {})
    assert np.array_equal(gdata.graph_id, graph_ids)
    assert len(gdata.data) == 0
    # GraphData more
    graph_ids = np.arange(num_graphs).astype(np.float64)
    data = {'feat': np.random.rand(num_graphs, 3)}
    gdata = GraphData(graph_ids, data)
    assert gdata.graph_id.dtype == np.int64
    assert np.array_equal(gdata.graph_id, graph_ids)
    assert len(gdata.data) == len(data)
    for k, v in data.items():
        assert k in gdata.data
        assert np.array_equal(gdata.data[k], v)
def _test_load_edge_data_from_csv():
    from dgl.data.csv_dataset_base import MetaEdge, EdgeData, DefaultDataParser
    with tempfile.TemporaryDirectory() as test_dir:
        num_nodes = 100
        num_edges = 1000
        # minimum
        df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
                           'dst_id': np.random.randint(num_nodes, size=num_edges),
                           })
        csv_path = os.path.join(test_dir, 'edges.csv')
        df.to_csv(csv_path, index=False)
        meta_edge = MetaEdge(file_name=csv_path)
        edge_data = EdgeData.load_from_csv(
            meta_edge, DefaultDataParser())
        assert np.array_equal(df['src_id'], edge_data.src)
        assert np.array_equal(df['dst_id'], edge_data.dst)
        assert len(edge_data.data) == 0
        # common case
        df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
                           'dst_id': np.random.randint(num_nodes, size=num_edges),
                           'label': np.random.randint(3, size=num_edges)})
        csv_path = os.path.join(test_dir, 'edges.csv')
        df.to_csv(csv_path, index=False)
        meta_edge = MetaEdge(file_name=csv_path)
        edge_data = EdgeData.load_from_csv(
            meta_edge, DefaultDataParser())
        assert np.array_equal(df['src_id'], edge_data.src)
        assert np.array_equal(df['dst_id'], edge_data.dst)
        assert len(edge_data.data) == 1
        assert np.array_equal(df['label'], edge_data.data['label'])
        assert np.array_equal(np.full(num_edges, 0), edge_data.graph_id)
        assert edge_data.type == ('_V', '_E', '_V')
        # add more fields into edges.csv
        df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
                           'dst_id': np.random.randint(num_nodes, size=num_edges),
                           'graph_id': np.arange(num_edges),
                           'feat': np.random.randint(3, size=num_edges),
                           'label': np.random.randint(3, size=num_edges)})
        csv_path = os.path.join(test_dir, 'edges.csv')
        df.to_csv(csv_path, index=False)
        meta_edge = MetaEdge(file_name=csv_path)
        edge_data = EdgeData.load_from_csv(
            meta_edge, DefaultDataParser())
        assert np.array_equal(df['src_id'], edge_data.src)
        assert np.array_equal(df['dst_id'], edge_data.dst)
        assert len(edge_data.data) == 2
        assert np.array_equal(df['feat'], edge_data.data['feat'])
        assert np.array_equal(df['label'], edge_data.data['label'])
        assert np.array_equal(df['graph_id'], edge_data.graph_id)
        assert edge_data.type == ('_V', '_E', '_V')
        # required headers are missing
        df = pd.DataFrame({'src_id': np.random.randint(num_nodes, size=num_edges),
                           })
        csv_path = os.path.join(test_dir, 'edges.csv')
        df.to_csv(csv_path, index=False)
        meta_edge = MetaEdge(file_name=csv_path)
        expect_except = False
        try:
            EdgeData.load_from_csv(
                meta_edge, DefaultDataParser())
        except DGLError:
            expect_except = True
        assert expect_except
        df = pd.DataFrame({'dst_id': np.random.randint(num_nodes, size=num_edges),
                           })
        csv_path = os.path.join(test_dir, 'edges.csv')
        df.to_csv(csv_path, index=False)
        meta_edge = MetaEdge(file_name=csv_path)
        expect_except = False
        try:
            EdgeData.load_from_csv(
                meta_edge, DefaultDataParser())
        except DGLError:
            expect_except = True
        assert expect_except
def _test_construct_graphs_multiple():
    from dgl.data.csv_dataset_base import NodeData, EdgeData, GraphData, DGLGraphConstructor
    num_nodes = 100
    num_edges = 1000
    num_graphs = 10
    num_dims = 3
    node_ids = np.array([], dtype=np.int64)
    src_ids = np.array([], dtype=np.int64)
    dst_ids = np.array([], dtype=np.int64)
    ngraph_ids = np.array([], dtype=np.int64)
    egraph_ids = np.array([], dtype=np.int64)
    u_indices = np.array([], dtype=np.int64)
    for i in range(num_graphs):
        l_node_ids = np.random.choice(
            np.arange(num_nodes*2), size=num_nodes, replace=False)
        node_ids = np.append(node_ids, l_node_ids)
        _, l_u_indices = np.unique(l_node_ids, return_index=True)
        u_indices = np.append(u_indices, l_u_indices)
        ngraph_ids = np.append(ngraph_ids, np.full(num_nodes, i))
        src_ids = np.append(src_ids, np.random.choice(
            l_node_ids, size=num_edges))
        dst_ids = np.append(dst_ids, np.random.choice(
            l_node_ids, size=num_edges))
        egraph_ids = np.append(egraph_ids, np.full(num_edges, i))
    ndata = {'feat': np.random.rand(num_nodes*num_graphs, num_dims),
             'label': np.random.randint(2, size=num_nodes*num_graphs)}
    node_data = NodeData(node_ids, ndata, graph_id=ngraph_ids)
    edata = {'feat': np.random.rand(num_edges*num_graphs, num_dims),
             'label': np.random.randint(2, size=num_edges*num_graphs)}
    edge_data = EdgeData(src_ids, dst_ids, edata, graph_id=egraph_ids)
    gdata = {'feat': np.random.rand(num_graphs, num_dims),
             'label': np.random.randint(2, size=num_graphs)}
    graph_data = GraphData(np.arange(num_graphs), gdata)
    graphs, data_dict = DGLGraphConstructor.construct_graphs(
        node_data, edge_data, graph_data)
    assert len(graphs) == num_graphs
    assert len(data_dict) == len(gdata)
    for k, v in data_dict.items():
        assert F.array_equal(F.tensor(gdata[k]), v)
    for i, g in enumerate(graphs):
        assert g.is_homogeneous
        assert g.num_nodes() == num_nodes
        assert g.num_edges() == num_edges

        def assert_data(lhs, rhs, size, node=False):
            for key, value in lhs.items():
                assert key in rhs
                value = value[i*size:(i+1)*size]
                if node:
                    indices = u_indices[i*size:(i+1)*size]
                    value = value[indices]
                assert F.array_equal(F.tensor(value), rhs[key])

        assert_data(ndata, g.ndata, num_nodes, node=True)
        assert_data(edata, g.edata, num_edges)
    # Graph IDs found in node/edge CSV but not in graph CSV
    graph_data = GraphData(np.arange(num_graphs-2), {})
    expect_except = False
    try:
        _, _ = DGLGraphConstructor.construct_graphs(
            node_data, edge_data, graph_data)
    except:
        expect_except = True
    assert expect_except
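
# The multi-graph test above stores the rows of all graphs in flat arrays and
# relies on the graph_id column to assign each row to its graph. A minimal
# sketch of that grouping step (illustrative only, not the internals of
# DGLGraphConstructor; the helper name is made up):
def _sketch_split_rows_by_graph_id(graph_ids, *columns):
    # Map each distinct graph ID to the per-graph slice of every column.
    out = {}
    for gid in np.unique(graph_ids):
        mask = graph_ids == gid
        out[int(gid)] = tuple(col[mask] for col in columns)
    return out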