Пример #1
0
def _test_construct_graphs_hetero():
    """Build a single heterograph from messy node IDs and verify its contents.

    For every node type, ``num_nodes`` distinct IDs are drawn from
    ``[0, 2 * num_nodes)``, shuffled, and the first 20% are appended again as
    duplicates. The test then asserts that
    ``DGLGraphConstructor.construct_graphs`` yields exactly one
    non-homogeneous graph, no graph-level data, the expected node/edge
    counts, node data equal to the rows selected by ``np.unique``'s
    first-occurrence indices, and edge data equal to the input edge data.
    """
    from dgl.data.csv_dataset_base import NodeData, EdgeData, DGLGraphConstructor

    # Node IDs may be unsorted, duplicated and not labelled 0..num_nodes-1.
    num_nodes, num_edges, num_dims = 100, 1000, 3
    num_dup_nodes = int(num_nodes*0.2)
    total_rows = num_nodes + num_dup_nodes
    ntypes = ['user', 'item']
    etypes = [('user', 'follow', 'user'), ('user', 'like', 'item')]

    def check_features(expected, actual):
        # Every expected feature must exist in the graph and match exactly.
        for name in expected:
            assert name in actual
            assert F.array_equal(F.tensor(expected[name]), actual[name])

    node_data = []
    ids_by_ntype = {}
    expected_ndata = {}
    for ntype in ntypes:
        unique_ids = np.random.choice(
            np.arange(num_nodes*2), size=num_nodes, replace=False)
        assert len(unique_ids) == num_nodes
        np.random.shuffle(unique_ids)
        all_ids = np.hstack((unique_ids, unique_ids[:num_dup_nodes]))
        raw = {
            'feat': np.random.rand(total_rows, num_dims),
            'label': np.random.randint(2, size=total_rows),
        }
        # Row index of the first occurrence of each unique ID, in sorted-ID
        # order; these are the rows the graph is expected to keep.
        first_seen = np.unique(all_ids, return_index=True)[1]
        expected_ndata[ntype] = {
            name: values[first_seen] for name, values in raw.items()}
        node_data.append(NodeData(all_ids, raw, type=ntype))
        ids_by_ntype[ntype] = all_ids

    edge_data = []
    expected_edata = {}
    for canonical in etypes:
        src_type, _, dst_type = canonical
        src_ids = np.random.choice(ids_by_ntype[src_type], size=num_edges)
        dst_ids = np.random.choice(ids_by_ntype[dst_type], size=num_edges)
        feats = {
            'feat': np.random.rand(num_edges, num_dims),
            'label': np.random.randint(2, size=num_edges),
        }
        edge_data.append(EdgeData(src_ids, dst_ids, feats, type=canonical))
        expected_edata[canonical] = feats

    graphs, data_dict = DGLGraphConstructor.construct_graphs(
        node_data, edge_data)
    assert len(graphs) == 1
    assert len(data_dict) == 0
    graph = graphs[0]
    assert not graph.is_homogeneous
    assert graph.num_nodes() == num_nodes*len(ntypes)
    assert graph.num_edges() == num_edges*len(etypes)

    for ntype in graph.ntypes:
        assert graph.num_nodes(ntype) == num_nodes
        check_features(expected_ndata[ntype], graph.nodes[ntype].data)
    for etype in graph.canonical_etypes:
        assert graph.num_edges(etype) == num_edges
        check_features(expected_edata[etype], graph.edges[etype].data)
Пример #2
0
def _test_construct_graphs_homo():
    """Build a single homogeneous graph from messy node IDs and verify it.

    ``num_nodes`` distinct IDs are drawn from ``[0, 2 * num_nodes)``,
    shuffled, and the first 20% are appended again as duplicates. The test
    asserts that ``DGLGraphConstructor.construct_graphs`` yields exactly one
    homogeneous graph, no graph-level data, ``num_nodes`` nodes and
    ``num_edges`` edges, node data equal to the rows selected by
    ``np.unique``'s first-occurrence indices, and edge data equal to the
    input edge data.
    """
    from dgl.data.csv_dataset_base import NodeData, EdgeData, DGLGraphConstructor

    def check_features(expected, actual):
        # Every expected feature must exist in the graph and match exactly.
        for name in expected:
            assert name in actual
            assert F.array_equal(F.tensor(expected[name]), actual[name])

    # Node IDs may be unsorted, duplicated and not labelled 0..num_nodes-1.
    num_nodes, num_edges, num_dims = 100, 1000, 3
    num_dup_nodes = int(num_nodes * 0.2)
    total_rows = num_nodes + num_dup_nodes

    unique_ids = np.random.choice(
        np.arange(num_nodes * 2), size=num_nodes, replace=False)
    assert len(unique_ids) == num_nodes
    np.random.shuffle(unique_ids)
    node_ids = np.hstack((unique_ids, unique_ids[:num_dup_nodes]))

    raw_ndata = {
        'feat': np.random.rand(total_rows, num_dims),
        'label': np.random.randint(2, size=total_rows),
    }
    # Row index of the first occurrence of each unique ID, in sorted-ID order;
    # these are the rows the graph is expected to keep.
    first_seen = np.unique(node_ids, return_index=True)[1]
    expected_ndata = {
        name: values[first_seen] for name, values in raw_ndata.items()}
    node_data = NodeData(node_ids, raw_ndata)

    src_ids = np.random.choice(node_ids, size=num_edges)
    dst_ids = np.random.choice(node_ids, size=num_edges)
    edata = {
        'feat': np.random.rand(num_edges, num_dims),
        'label': np.random.randint(2, size=num_edges),
    }
    edge_data = EdgeData(src_ids, dst_ids, edata)

    graphs, data_dict = DGLGraphConstructor.construct_graphs(
        node_data, edge_data)
    assert len(graphs) == 1
    assert len(data_dict) == 0
    graph = graphs[0]
    assert graph.is_homogeneous
    assert graph.num_nodes() == num_nodes
    assert graph.num_edges() == num_edges

    check_features(expected_ndata, graph.ndata)
    check_features(edata, graph.edata)
Пример #3
0
def _test_NodeEdgeGraphData():
    """Check defaults, ID casting and error handling of the CSV data holders.

    Covers ``NodeData``, ``EdgeData`` and ``GraphData``. The assertions expect
    that float IDs come back as ``int64``, that omitted ``type`` /
    ``graph_id`` arguments fall back to ``'_V'`` / ``('_V', '_E', '_V')`` /
    all-zero graph IDs, that feature dicts are kept as given, and that inputs
    whose lengths disagree raise an exception.
    """
    from dgl.data.csv_dataset_base import NodeData, EdgeData, GraphData
    # NodeData basics
    num_nodes = 100
    # ``np.float64`` replaces the ``np.float`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (it raised AttributeError there).
    node_ids = np.arange(num_nodes, dtype=np.float64)
    ndata = NodeData(node_ids, {})
    assert ndata.id.dtype == np.int64
    assert np.array_equal(ndata.id, node_ids.astype(np.int64))
    assert len(ndata.data) == 0
    assert ndata.type == '_V'
    assert np.array_equal(ndata.graph_id, np.full(num_nodes, 0))
    # NodeData more
    data = {'feat': np.random.rand(num_nodes, 3)}
    graph_id = np.arange(num_nodes)
    ndata = NodeData(node_ids, data, type='user', graph_id=graph_id)
    assert ndata.type == 'user'
    assert np.array_equal(ndata.graph_id, graph_id)
    assert len(ndata.data) == len(data)
    for k, v in data.items():
        assert k in ndata.data
        assert np.array_equal(ndata.data[k], v)
    # NodeData except: IDs, features and graph IDs all have different lengths.
    expect_except = False
    try:
        NodeData(np.arange(num_nodes), {'feat': np.random.rand(
            num_nodes+1, 3)}, graph_id=np.arange(num_nodes-1))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not counted as a passing result.
        expect_except = True
    assert expect_except

    # EdgeData basics
    num_nodes = 100
    num_edges = 1000
    src_ids = np.random.randint(num_nodes, size=num_edges)
    dst_ids = np.random.randint(num_nodes, size=num_edges)
    edata = EdgeData(src_ids, dst_ids, {})
    assert np.array_equal(edata.src, src_ids)
    assert np.array_equal(edata.dst, dst_ids)
    assert edata.type == ('_V', '_E', '_V')
    assert len(edata.data) == 0
    assert np.array_equal(edata.graph_id, np.full(num_edges, 0))
    # EdgeData more
    src_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
    dst_ids = np.random.randint(num_nodes, size=num_edges).astype(np.float64)
    data = {'feat': np.random.rand(num_edges, 3)}
    etype = ('user', 'like', 'item')
    graph_ids = np.arange(num_edges)
    edata = EdgeData(src_ids, dst_ids, data,
                     type=etype, graph_id=graph_ids)
    assert edata.src.dtype == np.int64
    assert edata.dst.dtype == np.int64
    assert np.array_equal(edata.src, src_ids)
    assert np.array_equal(edata.dst, dst_ids)
    assert edata.type == etype
    assert len(edata.data) == len(data)
    for k, v in data.items():
        assert k in edata.data
        assert np.array_equal(edata.data[k], v)
    assert np.array_equal(edata.graph_id, graph_ids)
    # EdgeData except: src, dst, features and graph IDs all differ in length.
    expect_except = False
    try:
        EdgeData(np.arange(num_edges), np.arange(num_edges+1),
                 {'feat': np.random.rand(num_edges-1, 3)},
                 graph_id=np.arange(num_edges+2))
    except Exception:
        expect_except = True
    assert expect_except

    # GraphData basics
    num_graphs = 10
    graph_ids = np.arange(num_graphs)
    gdata = GraphData(graph_ids, {})
    assert np.array_equal(gdata.graph_id, graph_ids)
    assert len(gdata.data) == 0
    # GraphData more
    graph_ids = np.arange(num_graphs).astype(np.float64)
    data = {'feat': np.random.rand(num_graphs, 3)}
    gdata = GraphData(graph_ids, data)
    assert gdata.graph_id.dtype == np.int64
    assert np.array_equal(gdata.graph_id, graph_ids)
    assert len(gdata.data) == len(data)
    for k, v in data.items():
        assert k in gdata.data
        assert np.array_equal(gdata.data[k], v)
Пример #4
0
def _test_load_edge_data_from_csv():
    """Round-trip edge tables through a CSV file and ``EdgeData.load_from_csv``.

    Four scenarios are written to ``edges.csv`` in a temporary directory:
    only ``src_id``/``dst_id``; those plus a ``label`` column; those plus
    ``graph_id``, ``feat`` and ``label``; and files missing one of the two
    endpoint columns, which are expected to raise ``DGLError``.
    """
    from dgl.data.csv_dataset_base import MetaEdge, EdgeData, DefaultDataParser
    with tempfile.TemporaryDirectory() as test_dir:
        num_nodes = 100
        num_edges = 1000
        # Every scenario overwrites the same file.
        csv_path = os.path.join(test_dir, 'edges.csv')

        def load(frame):
            # Dump the frame to disk and parse it back with the default parser.
            frame.to_csv(csv_path, index=False)
            return EdgeData.load_from_csv(
                MetaEdge(file_name=csv_path), DefaultDataParser())

        def fails_with_dgl_error(frame):
            # True only when loading raises DGLError; other errors propagate.
            try:
                load(frame)
            except DGLError:
                return True
            return False

        def random_endpoints():
            return np.random.randint(num_nodes, size=num_edges)

        def random_classes():
            return np.random.randint(3, size=num_edges)

        # minimum
        frame = pd.DataFrame({'src_id': random_endpoints(),
                              'dst_id': random_endpoints()})
        loaded = load(frame)
        assert np.array_equal(frame['src_id'], loaded.src)
        assert np.array_equal(frame['dst_id'], loaded.dst)
        assert len(loaded.data) == 0

        # common case
        frame = pd.DataFrame({'src_id': random_endpoints(),
                              'dst_id': random_endpoints(),
                              'label': random_classes()})
        loaded = load(frame)
        assert np.array_equal(frame['src_id'], loaded.src)
        assert np.array_equal(frame['dst_id'], loaded.dst)
        assert len(loaded.data) == 1
        assert np.array_equal(frame['label'], loaded.data['label'])
        assert np.array_equal(np.full(num_edges, 0), loaded.graph_id)
        assert loaded.type == ('_V', '_E', '_V')

        # add more fields into edges.csv
        frame = pd.DataFrame({'src_id': random_endpoints(),
                              'dst_id': random_endpoints(),
                              'graph_id': np.arange(num_edges),
                              'feat': random_classes(),
                              'label': random_classes()})
        loaded = load(frame)
        assert np.array_equal(frame['src_id'], loaded.src)
        assert np.array_equal(frame['dst_id'], loaded.dst)
        assert len(loaded.data) == 2
        assert np.array_equal(frame['feat'], loaded.data['feat'])
        assert np.array_equal(frame['label'], loaded.data['label'])
        assert np.array_equal(frame['graph_id'], loaded.graph_id)
        assert loaded.type == ('_V', '_E', '_V')

        # required headers are missing
        assert fails_with_dgl_error(
            pd.DataFrame({'src_id': random_endpoints()}))
        assert fails_with_dgl_error(
            pd.DataFrame({'dst_id': random_endpoints()}))
Пример #5
0
def _test_construct_graphs_multiple():
    """Build several homogeneous graphs from one node/edge table and verify them.

    Nodes and edges of ``num_graphs`` graphs are concatenated into flat
    arrays, with ``graph_id`` arrays recording which graph each row belongs
    to. The test asserts that ``DGLGraphConstructor.construct_graphs``
    returns one graph per ID, passes graph-level data through, and gives each
    graph its own slice of the node/edge data. It finally expects an
    exception when node/edge data reference graph IDs that are absent from
    the graph data.
    """
    from dgl.data.csv_dataset_base import NodeData, EdgeData, GraphData, DGLGraphConstructor
    num_nodes = 100
    num_edges = 1000
    num_graphs = 10
    num_dims = 3
    # ``np.int64`` replaces the ``np.int`` alias, which was deprecated in
    # NumPy 1.20 and removed in 1.24 (it raised AttributeError there).
    node_ids = np.array([], dtype=np.int64)
    src_ids = np.array([], dtype=np.int64)
    dst_ids = np.array([], dtype=np.int64)
    ngraph_ids = np.array([], dtype=np.int64)
    egraph_ids = np.array([], dtype=np.int64)
    u_indices = np.array([], dtype=np.int64)
    for i in range(num_graphs):
        l_node_ids = np.random.choice(
            np.arange(num_nodes*2), size=num_nodes, replace=False)
        node_ids = np.append(node_ids, l_node_ids)
        # Per-graph (local) row indices that sort this graph's nodes by ID.
        _, l_u_indices = np.unique(l_node_ids, return_index=True)
        u_indices = np.append(u_indices, l_u_indices)
        ngraph_ids = np.append(ngraph_ids, np.full(num_nodes, i))
        src_ids = np.append(src_ids, np.random.choice(
            l_node_ids, size=num_edges))
        dst_ids = np.append(dst_ids, np.random.choice(
            l_node_ids, size=num_edges))
        egraph_ids = np.append(egraph_ids, np.full(num_edges, i))
    ndata = {'feat': np.random.rand(num_nodes*num_graphs, num_dims),
             'label': np.random.randint(2, size=num_nodes*num_graphs)}
    node_data = NodeData(node_ids, ndata, graph_id=ngraph_ids)
    edata = {'feat': np.random.rand(num_edges*num_graphs, num_dims),
             'label': np.random.randint(2, size=num_edges*num_graphs)}
    edge_data = EdgeData(src_ids, dst_ids, edata, graph_id=egraph_ids)
    gdata = {'feat': np.random.rand(num_graphs, num_dims),
             'label': np.random.randint(2, size=num_graphs)}
    graph_data = GraphData(np.arange(num_graphs), gdata)
    graphs, data_dict = DGLGraphConstructor.construct_graphs(
        node_data, edge_data, graph_data)
    assert len(graphs) == num_graphs
    assert len(data_dict) == len(gdata)
    for k, v in data_dict.items():
        assert F.array_equal(F.tensor(gdata[k]), v)

    def assert_data(lhs, rhs, size, graph_idx, node=False):
        # Compare graph ``graph_idx``'s slice of the flat input data ``lhs``
        # with the data stored on the constructed graph (``rhs``).
        start, stop = graph_idx*size, (graph_idx+1)*size
        for key, value in lhs.items():
            assert key in rhs
            value = value[start:stop]
            if node:
                # Node rows are compared in the order given by np.unique's
                # first-occurrence indices, i.e. sorted by node ID.
                value = value[u_indices[start:stop]]
            assert F.array_equal(F.tensor(value), rhs[key])

    for i, g in enumerate(graphs):
        assert g.is_homogeneous
        assert g.num_nodes() == num_nodes
        assert g.num_edges() == num_edges
        assert_data(ndata, g.ndata, num_nodes, i, node=True)
        assert_data(edata, g.edata, num_edges, i)

    # Graph IDs found in node/edge CSV but not in graph CSV
    graph_data = GraphData(np.arange(num_graphs-2), {})
    expect_except = False
    try:
        _, _ = DGLGraphConstructor.construct_graphs(
            node_data, edge_data, graph_data)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not counted as a passing result.
        expect_except = True
    assert expect_except