Example #1
def test_serialize_tensors():
    # create a temporary file and immediately release it so DGL can open it.
    f = tempfile.NamedTemporaryFile(delete=False)
    path = f.name
    f.close()

    tensor_dict = {
        "a": F.tensor([1, 3, -1, 0], dtype=F.int64),
        # deliberately non-identifier key: dict keys need not be valid Python names
        "1@1": F.tensor([1.5, 2], dtype=F.float32)
    }

    save_tensors(path, tensor_dict)

    load_tensor_dict = load_tensors(path)

    for key in tensor_dict:
        assert key in load_tensor_dict
        assert np.array_equal(F.asnumpy(load_tensor_dict[key]),
                              F.asnumpy(tensor_dict[key]))

    load_nd_dict = load_tensors(path, return_dgl_ndarray=True)

    for key in tensor_dict:
        assert key in load_nd_dict
        assert isinstance(load_nd_dict[key], nd.NDArray)
        assert np.array_equal(load_nd_dict[key].asnumpy(),
                              F.asnumpy(tensor_dict[key]))

    os.unlink(path)
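For context, a minimal standalone round trip might look like the sketch below. It assumes save_tensors/load_tensors are importable from dgl.data.utils and that PyTorch is the active DGL backend; neither import is shown in the listing above, so treat both as assumptions.

# Minimal round-trip sketch. Assumptions: save_tensors/load_tensors are
# exported by dgl.data.utils, and tensors are plain PyTorch tensors.
import os
import tempfile

import torch as th
from dgl.data.utils import save_tensors, load_tensors

f = tempfile.NamedTemporaryFile(delete=False)  # release the handle so DGL can open it
path = f.name
f.close()

save_tensors(path, {"feat": th.arange(4)})
loaded = load_tensors(path)
assert th.equal(loaded["feat"], th.arange(4))
os.unlink(path)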
Example #2
def save(g, dataset):
    print("Saving dataset...")
    part_dir = os.path.join(".", dataset)
    node_feat_file = os.path.join(part_dir, "node_feat.dgl")
    part_graph_file = os.path.join(part_dir, "graph.dgl")
    os.makedirs(part_dir, mode=0o775, exist_ok=True)
    save_tensors(node_feat_file, g.ndata)
    save_graphs(part_graph_file, [g])
    print("Graph saved successfully!")
Example #3
def test_serialize_empty_dict():
    # create a temporary file and immediately release it so DGL can open it.
    f = tempfile.NamedTemporaryFile(delete=False)
    path = f.name
    f.close()

    tensor_dict = {}

    save_tensors(path, tensor_dict)

    load_tensor_dict = load_tensors(path)
    assert isinstance(load_tensor_dict, dict)
    assert len(load_tensor_dict) == 0

    os.unlink(path)
Example #4
File: tools.py  Project: yifeim/dgl
def save(g, dataset):
    """
    Saves the input dataset in DGL format.

    Parameters
    ----------
    g : graph to be saved
    dataset : output folder name
    """
    print("Saving dataset...")
    part_dir = os.path.join(".", dataset)
    node_feat_file = os.path.join(part_dir, "node_feat.dgl")
    part_graph_file = os.path.join(part_dir, "graph.dgl")
    os.makedirs(part_dir, mode=0o775, exist_ok=True)
    save_tensors(node_feat_file, g.ndata)
    save_graphs(part_graph_file, [g])
    print("Graph saved successfully!")
Example #5
def main_libra2dgl(resultdir, dataset, nc):
    """
    Converts the output from Libra partitioning to DGL/DistGNN graph input.
    It builds dictionaries to assign local IDs to nodes in the partitions as well
    as it build a database to keep track of the location of clone nodes in the remote
    partitions.

    Parameters
    ----------
    resultdir : Location where partitions in dgl format are stored
    dataset : Dataset name
    nc : Number of partitions

    Output
    ------
    Creates partX folder in resultdir location for each partition X

    Notes
    -----
    This output is directly used as input to DistGNN
    
    """
    tedges = 1615685872  ## total edges (hard-coded edge count of ogbn-papers100M, used as an upper bound)
    max_c = 1024  ## max partitions supported
    factor = 1.2

    ## for pre-allocated tensor size
    hash_edges = [int((tedges / i) * factor) for i in range(1, max_c + 1)]

    ## load graph for the feature gather
    args = Args(dataset)

    print("Loading data...", flush=True)
    if args.dataset == 'ogbn-products':
        print("Loading ogbn-products")
        g_orig, _ = load_ogb('ogbn-products')
    elif args.dataset == 'ogbn-papers100M':
        print("Loading ogbn-papers100M")
        g_orig, _ = load_ogb('ogbn-papers100M')
    elif args.dataset == 'proteins':
        print("Loading proteins")
        g_orig = load_proteins('proteins')
    elif args.dataset == 'ogbn-arxiv':
        print("Loading ogbn-arxiv")
        g_orig, _ = load_ogb('ogbn-arxiv')
    else:
        g_orig = load_data(args)[0]

    print("Done loading data.", flush=True)
    a, b = g_orig.edges()

    N_n = g_orig.number_of_nodes()
    print("Number of nodes in the graph: ", N_n)
    node_map = th.zeros(nc, dtype=th.int32)
    indices = th.zeros(N_n, dtype=th.int32)
    lftensor = th.zeros(N_n, dtype=th.int32)
    gdt_key = th.zeros(N_n, dtype=th.int32)
    gdt_value = th.zeros([N_n, nc], dtype=th.int32)
    offset = th.zeros(1, dtype=th.int32)
    ldt_ar = []

    gg = [DGLGraph() for i in range(nc)]
    part_nodes = []

    ## Iterate over the partitions
    for i in range(nc):
        g = gg[i]
        fsize = hash_edges[nc]

        hash_nodes = th.zeros(2, dtype=th.int32)
        a = th.zeros(fsize, dtype=th.int64)
        b = th.zeros(fsize, dtype=th.int64)
        ldt_key = th.zeros(fsize, dtype=th.int64)
        ldt_ar.append(ldt_key)

        ## build the node-to-partition dictionary:
        ## assign local node IDs and map them to global node IDs
        libra2dgl_build_dict(a, b, indices, ldt_key, gdt_key, gdt_value,
                             node_map, offset, nc, i, fsize, hash_nodes,
                             resultdir)

        num_nodes = int(hash_nodes[0])
        num_edges = int(hash_nodes[1])
        part_nodes.append(num_nodes)

        g.add_edges(a[0:num_edges], b[0:num_edges])

    ########################################################
    ## fixing lf - 1-level tree for the split-nodes
    libra2dgl_set_lf(gdt_key, gdt_value, lftensor, nc, N_n)
    ########################################################
    graph_name = dataset
    part_method = 'Libra'
    num_parts = nc  ## number of partitions/communities
    num_hops = 0
    node_map_val = node_map.tolist()
    edge_map_val = 0
    out_path = resultdir

    part_metadata = {
        'graph_name': graph_name,
        'num_nodes': g_orig.number_of_nodes(),
        'num_edges': g_orig.number_of_edges(),
        'part_method': part_method,
        'num_parts': num_parts,
        'halo_hops': num_hops,
        'node_map': node_map_val,
        'edge_map': edge_map_val
    }
    ############################################################

    for i in range(nc):
        ## gg and ldt_ar are consumed from the front (see the del statements
        ## at the end of this loop), so index 0 is always partition i.
        g = gg[0]
        num_nodes = part_nodes[i]
        adj = th.zeros([num_nodes, nc - 1], dtype=th.int32)
        inner_node = th.zeros(num_nodes, dtype=th.int32)
        lf = th.zeros(num_nodes, dtype=th.int32)
        ldt = ldt_ar[0]

        try:
            feat = g_orig.ndata['feat']
        except KeyError:
            feat = g_orig.ndata['features']

        try:
            labels = g_orig.ndata['label']
        except KeyError:
            labels = g_orig.ndata['labels']

        trainm = g_orig.ndata['train_mask']
        testm = g_orig.ndata['test_mask']
        valm = g_orig.ndata['val_mask']

        feat_size = feat.shape[1]
        gfeat = th.zeros([num_nodes, feat_size], dtype=feat.dtype)

        glabels = th.zeros(num_nodes, dtype=labels.dtype)
        gtrainm = th.zeros(num_nodes, dtype=trainm.dtype)
        gtestm = th.zeros(num_nodes, dtype=testm.dtype)
        gvalm = th.zeros(num_nodes, dtype=valm.dtype)

        ## build the remote-node database per local node;
        ## gather feats, labels, and train/test/val masks for each partition
        libra2dgl_build_adjlist(feat, gfeat, adj, inner_node, ldt, gdt_key,
                                gdt_value, node_map, lf, lftensor, num_nodes,
                                nc, i, feat_size, labels, trainm, testm, valm,
                                glabels, gtrainm, gtestm, gvalm, feat.shape[0])

        g.ndata['adj'] = adj  ## database of remote clones
        g.ndata['inner_node'] = inner_node  ## 0 for split nodes, 1 otherwise
        g.ndata['feat'] = gfeat  ## gathered features
        g.ndata['lf'] = lf  ## 1-level tree among split nodes

        g.ndata['label'] = glabels
        g.ndata['train_mask'] = gtrainm
        g.ndata['test_mask'] = gtestm
        g.ndata['val_mask'] = gvalm

        print("Writing partition {} to file".format(i), flush=True)

        part = g
        part_id = i
        part_dir = os.path.join(out_path, "part" + str(part_id))
        node_feat_file = os.path.join(part_dir, "node_feat.dgl")
        edge_feat_file = os.path.join(part_dir, "edge_feat.dgl")
        part_graph_file = os.path.join(part_dir, "graph.dgl")
        part_metadata['part-{}'.format(part_id)] = {
            'node_feats': node_feat_file,
            'edge_feats': edge_feat_file,
            'part_graph': part_graph_file
        }
        os.makedirs(part_dir, mode=0o775, exist_ok=True)
        save_tensors(node_feat_file, part.ndata)
        save_graphs(part_graph_file, [part])

        del g
        del gg[0]
        del ldt
        del ldt_ar[0]

    with open('{}/{}.json'.format(out_path, graph_name), 'w') as outfile:
        json.dump(part_metadata, outfile, sort_keys=True, indent=4)

    return gg, node_map
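A hypothetical call, assuming the Libra partitioner has already written its per-partition edge files under resultdir (all names below are illustrative):

# Hypothetical driver; assumes Libra output for 4 partitions already exists
# under resultdir and that the dataset loaders used above are importable.
gg, node_map = main_libra2dgl("./Libra_result_ogbn-arxiv/", "ogbn-arxiv", 4)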
Example #6
def libra_partition(num_community, G, resultdir):
    """
    Performs vertex-cut based graph partitioning and converts the partitioning
    output to DGL input format.

    Parameters
    ----------
    num_community : Number of partitions to create
    G : Input graph to be partitioned
    resultdir : Output location for storing the partitioned graphs

    Output
    ------
    1. Creates X partition folder as XCommunities (say, X=2, so, 2Communities)
       XCommunities contains file name communityZ.txt per partition Z (Z <- 0 .. X-1);
       each such file contains a list of edges assigned to that partition.
       These files constitute the output of Libra graph partitioner
       (An intermediate result of this function).
    2. The folder also contains partZ folders, each of these folders stores
       DGL/DistGNN graphs for the Z partitions;
       these graph files are used as input to DistGNN.
    3. The folder also contains a json file which contains partitions' information.
    """

    num_nodes = G.number_of_nodes()  # number of nodes
    num_edges = G.number_of_edges()  # number of edges
    print("Number of nodes in the graph: ", num_nodes)
    print("Number of edges in the graph: ", num_edges)

    in_d = G.in_degrees()
    out_d = G.out_degrees()
    node_degree = in_d + out_d
    edgenum_unassigned = node_degree.clone()

    u_t, v_t = G.edges()
    weight_ = th.ones(u_t.shape[0], dtype=th.int64)
    community_weights = th.zeros(num_community, dtype=th.int64)

    # self_loop = 0
    # for p, q in zip(u_t, v_t):
    #     if p == q:
    #         self_loop += 1
    # print("#self loops in the dataset: ", self_loop)

    # del G

    ## call to C/C++ code
    out = th.zeros(u_t.shape[0], dtype=th.int32)
    libra_vertex_cut(num_community, node_degree, edgenum_unassigned,
                     community_weights, u_t, v_t, weight_, out, num_nodes,
                     num_edges, resultdir)

    print("Max partition size: ", int(community_weights.max()))
    print(" ** Converting libra partitions to dgl graphs **")
    fsize = int(community_weights.max()) + 1024  ## max edges in partition
    # print("fsize: ", fsize, flush=True)

    node_map = th.zeros(num_community, dtype=th.int64)
    indices = th.zeros(num_nodes, dtype=th.int64)
    lrtensor = th.zeros(num_nodes, dtype=th.int64)
    gdt_key = th.zeros(num_nodes, dtype=th.int64)
    gdt_value = th.zeros([num_nodes, num_community], dtype=th.int64)
    offset = th.zeros(1, dtype=th.int64)
    ldt_ar = []

    gg_ar = [DGLGraph() for i in range(num_community)]
    part_nodes = []

    print(">>> ", "num_nodes   ", " ", "num_edges")
    ## Iterate over the partitions
    for i in range(num_community):
        g = gg_ar[i]

        a_t = th.zeros(fsize, dtype=th.int64)
        b_t = th.zeros(fsize, dtype=th.int64)
        ldt_key = th.zeros(fsize, dtype=th.int64)
        ldt_ar.append(ldt_key)

        ## build the node-to-partition dictionary:
        ## assign local node IDs and map them to global node IDs
        ret = libra2dgl_build_dict(a_t, b_t, indices, ldt_key, gdt_key,
                                   gdt_value, node_map, offset, num_community,
                                   i, fsize, resultdir)

        num_nodes_partition = int(ret[0])
        num_edges_partition = int(ret[1])
        part_nodes.append(num_nodes_partition)
        print(">>> ", num_nodes_partition, " ", num_edges_partition)
        g.add_edges(a_t[0:num_edges_partition], b_t[0:num_edges_partition])

    ########################################################
    ## fixing lr - 1-level tree for the split-nodes
    libra2dgl_set_lr(gdt_key, gdt_value, lrtensor, num_community, num_nodes)
    ########################################################
    ## derive the graph name from resultdir, e.g. "Libra_result_ogbn-arxiv/" -> "ogbn-arxiv"
    graph_name = resultdir.split("_")[-1].split("/")[0]
    part_method = 'Libra'
    num_parts = num_community  ## number of partitions/communities
    num_hops = 0
    node_map_val = node_map.tolist()
    edge_map_val = 0
    out_path = resultdir

    part_metadata = {
        'graph_name': graph_name,
        'num_nodes': G.number_of_nodes(),
        'num_edges': G.number_of_edges(),
        'part_method': part_method,
        'num_parts': num_parts,
        'halo_hops': num_hops,
        'node_map': node_map_val,
        'edge_map': edge_map_val
    }
    ############################################################

    for i in range(num_community):
        ## gg_ar and ldt_ar are consumed from the front (see the del
        ## statements at the end of this loop), so index 0 is partition i.
        g = gg_ar[0]
        num_nodes_partition = part_nodes[i]
        adj = th.zeros([num_nodes_partition, num_community - 1],
                       dtype=th.int64)
        inner_node = th.zeros(num_nodes_partition, dtype=th.int32)
        lr_t = th.zeros(num_nodes_partition, dtype=th.int64)
        ldt = ldt_ar[0]

        try:
            feat = G.ndata['feat']
        except KeyError:
            feat = G.ndata['features']

        try:
            labels = G.ndata['label']
        except KeyError:
            labels = G.ndata['labels']

        trainm = G.ndata['train_mask'].int()
        testm = G.ndata['test_mask'].int()
        valm = G.ndata['val_mask'].int()

        feat_size = feat.shape[1]
        gfeat = th.zeros([num_nodes_partition, feat_size], dtype=feat.dtype)

        glabels = th.zeros(num_nodes_partition, dtype=labels.dtype)
        gtrainm = th.zeros(num_nodes_partition, dtype=trainm.dtype)
        gtestm = th.zeros(num_nodes_partition, dtype=testm.dtype)
        gvalm = th.zeros(num_nodes_partition, dtype=valm.dtype)

        ## build the remote-node database per local node;
        ## gather feats, labels, and train/test/val masks for each partition
        libra2dgl_build_adjlist(feat, gfeat, adj, inner_node, ldt, gdt_key,
                                gdt_value, node_map, lr_t, lrtensor,
                                num_nodes_partition, num_community, i,
                                feat_size, labels, trainm, testm, valm,
                                glabels, gtrainm, gtestm, gvalm, feat.shape[0])

        g.ndata['adj'] = adj  ## database of remote clones
        g.ndata['inner_node'] = inner_node  ## 0 for split nodes, 1 otherwise
        g.ndata['feat'] = gfeat  ## gathered features
        g.ndata['lf'] = lr_t  ## 1-level tree among split nodes

        g.ndata['label'] = glabels
        g.ndata['train_mask'] = gtrainm
        g.ndata['test_mask'] = gtestm
        g.ndata['val_mask'] = gvalm

        # Validation code; run only on small graphs.
        # for l in range(num_nodes_partition):
        #     index = int(ldt[l])
        #     assert glabels[l] == labels[index]
        #     assert gtrainm[l] == trainm[index]
        #     assert gtestm[l] == testm[index]
        #     for j in range(feat_size):
        #         assert gfeat[l][j] == feat[index][j]

        print("Writing partition {} to file".format(i), flush=True)

        part = g
        part_id = i
        part_dir = os.path.join(out_path, "part" + str(part_id))
        node_feat_file = os.path.join(part_dir, "node_feat.dgl")
        edge_feat_file = os.path.join(part_dir, "edge_feat.dgl")
        part_graph_file = os.path.join(part_dir, "graph.dgl")
        part_metadata['part-{}'.format(part_id)] = {
            'node_feats': node_feat_file,
            'edge_feats': edge_feat_file,
            'part_graph': part_graph_file
        }
        os.makedirs(part_dir, mode=0o775, exist_ok=True)
        save_tensors(node_feat_file, part.ndata)
        save_graphs(part_graph_file, [part])

        del g
        del gg_ar[0]
        del ldt
        del ldt_ar[0]

    with open('{}/{}.json'.format(out_path, graph_name), 'w') as outfile:
        json.dump(part_metadata, outfile, sort_keys=True, indent=4)

    print("Conversion libra2dgl completed !!!")