def test_serialize_tensors():
    """Check that a dict of tensors survives a save_tensors/load_tensors round trip.

    The dict is loaded back twice: once with the default return type and once
    with ``return_dgl_ndarray=True``, where every value must be an
    ``nd.NDArray``. In both cases the loaded values are compared element-wise
    against the tensors that were saved.
    """
    # create a temporary file and immediately release it so DGL can open it.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        path = f.name

    # The file was created with delete=False, so it must be removed by hand;
    # do it in ``finally`` so a failing assertion does not leave it behind.
    try:
        tensor_dict = {
            "a": F.tensor([1, 3, -1, 0], dtype=F.int64),
            "1@1": F.tensor([1.5, 2], dtype=F.float32),
        }

        save_tensors(path, tensor_dict)

        load_tensor_dict = load_tensors(path)
        for key in tensor_dict:
            assert key in load_tensor_dict
            assert np.array_equal(F.asnumpy(load_tensor_dict[key]),
                                  F.asnumpy(tensor_dict[key]))

        load_nd_dict = load_tensors(path, return_dgl_ndarray=True)
        for key in tensor_dict:
            assert key in load_nd_dict
            assert isinstance(load_nd_dict[key], nd.NDArray)
            assert np.array_equal(load_nd_dict[key].asnumpy(),
                                  F.asnumpy(tensor_dict[key]))
    finally:
        os.unlink(path)
def save(g, dataset):
    """
    Write graph ``g`` and its node data into the folder ``./<dataset>``.

    Parameters
    ----------
    g : graph to be saved; ``g.ndata`` goes to ``node_feat.dgl`` and the
        graph itself to ``graph.dgl``
    dataset : name of the output folder, created under the current directory
    """
    print("Saving dataset..")

    # Single-argument os.path.join returns its argument unchanged, so the
    # target folder is simply the "./"-prefixed dataset name.
    target_dir = "./" + dataset
    os.makedirs(target_dir, mode=0o775, exist_ok=True)

    save_tensors(os.path.join(target_dir, "node_feat.dgl"), g.ndata)
    save_graphs(os.path.join(target_dir, "graph.dgl"), [g])

    print("Graph saved successfully !!")
def test_serialize_empty_dict():
    """Check that an empty dict round-trips through save_tensors/load_tensors.

    The loaded object must be a ``dict`` with no entries.
    """
    # create a temporary file and immediately release it so DGL can open it.
    with tempfile.NamedTemporaryFile(delete=False) as f:
        path = f.name

    # The file was created with delete=False, so it must be removed by hand;
    # do it in ``finally`` so a failing assertion does not leave it behind.
    try:
        tensor_dict = {}

        save_tensors(path, tensor_dict)

        load_tensor_dict = load_tensors(path)
        assert isinstance(load_tensor_dict, dict)
        assert len(load_tensor_dict) == 0
    finally:
        os.unlink(path)
def save(g, dataset):
    """
    Store the input graph on disk in dgl format.

    Two files are written into the folder ``./<dataset>`` (created if
    missing): ``node_feat.dgl`` with the node data of ``g`` and
    ``graph.dgl`` with the graph itself.

    Parameters
    ----------
    g : graph to be saved
    dataset : output folder name
    """
    print("Saving dataset..")

    out_dir = os.path.join("./" + dataset)
    os.makedirs(out_dir, mode=0o775, exist_ok=True)

    # (file name, writer, payload) for everything that has to be stored
    outputs = (
        ("node_feat.dgl", save_tensors, g.ndata),
        ("graph.dgl", save_graphs, [g]),
    )
    for file_name, writer, payload in outputs:
        writer(os.path.join(out_dir, file_name), payload)

    print("Graph saved successfully !!")
def main_libra2dgl(resultdir, dataset, nc):
    """
    Converts the output from Libra partitioning to DGL/DistGNN graph input.
    It builds dictionaries to assign local IDs to nodes in the partitions as well
    as it builds a database to keep track of the location of clone nodes in the
    remote partitions.

    Parameters
    ----------
    resultdir : Location where partitions in dgl format are stored
    dataset : Dataset name
    nc : Number of partitions

    Returns
    -------
    gg : list that held the per-partition graphs; each graph is dropped from
         the list right after it is written, so the list is empty on return
    node_map : int32 tensor of length nc that is handed to the libra2dgl_*
         helpers (presumably filled in place by them -- confirm in the helpers)

    Output
    ------
    Creates partX folder in resultdir location for each partition X, plus a
    <dataset>.json metadata file in resultdir

    Notes
    -----
    This output is directly used as input to DistGNN
    """
    tedges = 1615685872  ## total edges
    max_c = 1024  ## max partitions supported
    factor = 1.2  ## for pre-allocated tensor size
    hash_edges = [int((tedges / i) * factor) for i in range(1, max_c + 1)]

    ## load graph for the feature gather
    args = Args(dataset)

    print("Loading data...", flush=True)
    if args.dataset == 'ogbn-products':
        print("Loading ogbn-products")
        g_orig, _ = load_ogb('ogbn-products')
    elif args.dataset == 'ogbn-papers100M':
        print("Loading ogbn-papers100M")
        g_orig, _ = load_ogb('ogbn-papers100M')
    elif args.dataset == 'proteins':
        print("Loading proteins")
        g_orig = load_proteins('proteins')
    elif args.dataset == 'ogbn-arxiv':
        print("Loading ogbn-arxiv")
        g_orig, _ = load_ogb('ogbn-arxiv')
    else:
        g_orig = load_data(args)[0]

    print("Done loading data.", flush=True)

    N_n = g_orig.number_of_nodes()
    print("Number of nodes in the graph: ", N_n)
    node_map = th.zeros(nc, dtype=th.int32)
    indices = th.zeros(N_n, dtype=th.int32)
    lftensor = th.zeros(N_n, dtype=th.int32)
    gdt_key = th.zeros(N_n, dtype=th.int32)
    gdt_value = th.zeros([N_n, nc], dtype=th.int32)
    offset = th.zeros(1, dtype=th.int32)
    ldt_ar = []

    gg = [DGLGraph() for _ in range(nc)]
    part_nodes = []

    ## size of the pre-allocated per-partition edge buffers
    # NOTE(review): hash_edges[k] holds the estimate for k + 1 partitions, so
    # hash_edges[nc] is the value for nc + 1 partitions and raises IndexError
    # when nc == max_c. Kept as in the original because changing it alters the
    # allocation sizes passed to the C helpers -- confirm the intended index.
    fsize = hash_edges[nc]

    ## Iterator over number of partitions
    for i in range(nc):
        g = gg[i]

        hash_nodes = th.zeros(2, dtype=th.int32)
        a = th.zeros(fsize, dtype=th.int64)
        b = th.zeros(fsize, dtype=th.int64)
        ldt_key = th.zeros(fsize, dtype=th.int64)
        ldt_ar.append(ldt_key)

        ## building node, partition dictionary
        ## Assign local node ids and mapping to global node ids
        libra2dgl_build_dict(a, b, indices, ldt_key, gdt_key, gdt_value,
                             node_map, offset, nc, i, fsize, hash_nodes,
                             resultdir)

        ## hash_nodes is read back after the call: [0] nodes, [1] edges
        num_nodes = int(hash_nodes[0])
        num_edges = int(hash_nodes[1])
        part_nodes.append(num_nodes)

        g.add_edges(a[0:num_edges], b[0:num_edges])

    ########################################################
    ## fixing lf - 1-level tree for the split-nodes
    libra2dgl_set_lf(gdt_key, gdt_value, lftensor, nc, N_n)
    ########################################################
    graph_name = dataset
    part_method = 'Libra'
    num_parts = nc  ## number of partitions/communities
    num_hops = 0
    node_map_val = node_map.tolist()
    edge_map_val = 0
    out_path = resultdir

    part_metadata = {
        'graph_name': graph_name,
        'num_nodes': g_orig.number_of_nodes(),
        'num_edges': g_orig.number_of_edges(),
        'part_method': part_method,
        'num_parts': num_parts,
        'halo_hops': num_hops,
        'node_map': node_map_val,
        'edge_map': edge_map_val
    }
    ############################################################

    for i in range(nc):
        ## always take the head of the lists: the entry of the previous
        ## partition is deleted at the end of each iteration
        g = gg[0]
        num_nodes = part_nodes[i]
        adj = th.zeros([num_nodes, nc - 1], dtype=th.int32)
        inner_node = th.zeros(num_nodes, dtype=th.int32)
        lf = th.zeros(num_nodes, dtype=th.int32)
        ldt = ldt_ar[0]

        ## datasets name these fields differently; only a missing key should
        ## trigger the fallback, any other error must propagate
        try:
            feat = g_orig.ndata['feat']
        except KeyError:
            feat = g_orig.ndata['features']

        try:
            labels = g_orig.ndata['label']
        except KeyError:
            labels = g_orig.ndata['labels']

        trainm = g_orig.ndata['train_mask']
        testm = g_orig.ndata['test_mask']
        valm = g_orig.ndata['val_mask']

        feat_size = feat.shape[1]
        gfeat = th.zeros([num_nodes, feat_size], dtype=feat.dtype)

        glabels = th.zeros(num_nodes, dtype=labels.dtype)
        gtrainm = th.zeros(num_nodes, dtype=trainm.dtype)
        gtestm = th.zeros(num_nodes, dtype=testm.dtype)
        gvalm = th.zeros(num_nodes, dtype=valm.dtype)

        ## build remote node database per local node
        ## gather feats, train, test, val, and labels for each partition
        libra2dgl_build_adjlist(feat, gfeat, adj, inner_node, ldt, gdt_key,
                                gdt_value, node_map, lf, lftensor, num_nodes,
                                nc, i, feat_size, labels, trainm, testm, valm,
                                glabels, gtrainm, gtestm, gvalm, feat.shape[0])

        g.ndata['adj'] = adj  ## database of remote clones
        g.ndata['inner_node'] = inner_node  ## split node '0' else '1'
        g.ndata['feat'] = gfeat  ## gathered features
        g.ndata['lf'] = lf  ## 1-level tree among split nodes

        g.ndata['label'] = glabels
        g.ndata['train_mask'] = gtrainm
        g.ndata['test_mask'] = gtestm
        g.ndata['val_mask'] = gvalm

        print("Writing partition {} to file".format(i), flush=True)

        part_dir = os.path.join(out_path, "part" + str(i))
        node_feat_file = os.path.join(part_dir, "node_feat.dgl")
        edge_feat_file = os.path.join(part_dir, "edge_feat.dgl")
        part_graph_file = os.path.join(part_dir, "graph.dgl")
        part_metadata['part-{}'.format(i)] = {
            'node_feats': node_feat_file,
            'edge_feats': edge_feat_file,
            'part_graph': part_graph_file
        }
        os.makedirs(part_dir, mode=0o775, exist_ok=True)
        save_tensors(node_feat_file, g.ndata)
        save_graphs(part_graph_file, [g])

        ## drop every reference to the partition that was just written
        del g
        del gg[0]
        del ldt
        del ldt_ar[0]

    with open('{}/{}.json'.format(out_path, graph_name), 'w') as outfile:
        json.dump(part_metadata, outfile, sort_keys=True, indent=4)

    return gg, node_map
def libra_partition(num_community, G, resultdir):
    """
    Runs the Libra vertex-cut partitioner on G and converts the resulting
    partitions to DGL input format.

    Parameters
    ----------
    num_community : Number of partitions to create
    G : Input graph to be partitioned
    resultdir : Output location for storing the partitioned graphs

    Output
    ------
    1. Creates X partition folder as XCommunities (say, X=2, so, 2Communities)
       XCommunities contains file name communityZ.txt per partition Z
       (Z <- 0 .. X-1); each such file contains a list of edges assigned to
       that partition. These files constitute the output of Libra graph
       partitioner (an intermediate result of this function).
    2. The folder also contains partZ folders, each of these folders stores
       DGL/DistGNN graphs for the Z partitions; these graph files are used as
       input to DistGNN.
    3. The folder also contains a json file which contains partitions'
       information.
    """
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    print("Number of nodes in the graph: ", num_nodes)
    print("Number of edges in the graph: ", num_edges)

    ## per-node total degree, plus a separate copy handed to the partitioner
    node_degree = G.in_degrees() + G.out_degrees()
    edgenum_unassigned = node_degree.clone()

    u_t, v_t = G.edges()
    edge_count = u_t.shape[0]
    weight_ = th.ones(edge_count, dtype=th.int64)
    community_weights = th.zeros(num_community, dtype=th.int64)
    out = th.zeros(edge_count, dtype=th.int32)

    ## call to C/C++ code
    libra_vertex_cut(num_community, node_degree, edgenum_unassigned,
                     community_weights, u_t, v_t, weight_, out,
                     num_nodes, num_edges, resultdir)

    max_part_size = int(community_weights.max())
    print("Max partition size: ", max_part_size)
    print(" ** Converting libra partitions to dgl graphs **")
    fsize = max_part_size + 1024  ## max edges in partition

    node_map = th.zeros(num_community, dtype=th.int64)
    indices = th.zeros(num_nodes, dtype=th.int64)
    lrtensor = th.zeros(num_nodes, dtype=th.int64)
    gdt_key = th.zeros(num_nodes, dtype=th.int64)
    gdt_value = th.zeros([num_nodes, num_community], dtype=th.int64)
    offset = th.zeros(1, dtype=th.int64)

    ldt_ar = []
    part_nodes = []
    gg_ar = [DGLGraph() for _ in range(num_community)]

    print(">>> ", "num_nodes ", " ", "num_edges")
    ## Pass 1: one dictionary-building call per partition
    for part_id, part_graph in enumerate(gg_ar):
        src_buf = th.zeros(fsize, dtype=th.int64)
        dst_buf = th.zeros(fsize, dtype=th.int64)
        ldt_key = th.zeros(fsize, dtype=th.int64)
        ldt_ar.append(ldt_key)

        ## building node, partition dictionary
        ## Assign local node ids and mapping to global node ids
        counts = libra2dgl_build_dict(src_buf, dst_buf, indices, ldt_key,
                                      gdt_key, gdt_value, node_map, offset,
                                      num_community, part_id, fsize, resultdir)

        node_count = int(counts[0])
        edge_count_part = int(counts[1])
        part_nodes.append(node_count)
        print(">>> ", node_count, " ", edge_count_part)

        part_graph.add_edges(src_buf[0:edge_count_part],
                             dst_buf[0:edge_count_part])

    ########################################################
    ## fixing lr - 1-level tree for the split-nodes
    libra2dgl_set_lr(gdt_key, gdt_value, lrtensor, num_community, num_nodes)
    ########################################################

    ## graph name: text after the last "_" in resultdir, cut at the first "/"
    graph_name = resultdir.rsplit("_", 1)[-1].partition("/")[0]
    out_path = resultdir

    part_metadata = {
        'graph_name': graph_name,
        'num_nodes': G.number_of_nodes(),
        'num_edges': G.number_of_edges(),
        'part_method': 'Libra',
        'num_parts': num_community,  ## number of partitions/communities
        'halo_hops': 0,
        'node_map': node_map.tolist(),
        'edge_map': 0
    }
    ############################################################

    ## Pass 2: gather per-partition node data and write each partition out.
    ## Graphs and dictionaries are taken off the front of their lists so the
    ## memory of a partition can be released as soon as it is on disk.
    for part_id, node_count in enumerate(part_nodes):
        part_graph = gg_ar.pop(0)
        adj = th.zeros([node_count, num_community - 1], dtype=th.int64)
        inner_node = th.zeros(node_count, dtype=th.int32)
        lr_t = th.zeros(node_count, dtype=th.int64)
        ldt = ldt_ar.pop(0)

        ## feature / label fields go by either of two names
        try:
            feat = G.ndata['feat']
        except KeyError:
            feat = G.ndata['features']

        try:
            labels = G.ndata['label']
        except KeyError:
            labels = G.ndata['labels']

        trainm = G.ndata['train_mask'].int()
        testm = G.ndata['test_mask'].int()
        valm = G.ndata['val_mask'].int()

        feat_size = feat.shape[1]
        gfeat = th.zeros([node_count, feat_size], dtype=feat.dtype)
        glabels = th.zeros(node_count, dtype=labels.dtype)
        gtrainm = th.zeros(node_count, dtype=trainm.dtype)
        gtestm = th.zeros(node_count, dtype=testm.dtype)
        gvalm = th.zeros(node_count, dtype=valm.dtype)

        ## build remote node database per local node
        ## gather feats, train, test, val, and labels for each partition
        libra2dgl_build_adjlist(feat, gfeat, adj, inner_node, ldt, gdt_key,
                                gdt_value, node_map, lr_t, lrtensor,
                                node_count, num_community, part_id, feat_size,
                                labels, trainm, testm, valm, glabels, gtrainm,
                                gtestm, gvalm, feat.shape[0])

        ## node data attached to the partition, in the order it is stored
        node_data = (
            ('adj', adj),  ## database of remote clones
            ('inner_node', inner_node),  ## split node '0' else '1'
            ('feat', gfeat),  ## gathered features
            ('lf', lr_t),  ## 1-level tree among split nodes
            ('label', glabels),
            ('train_mask', gtrainm),
            ('test_mask', gtestm),
            ('val_mask', gvalm),
        )
        for key, value in node_data:
            part_graph.ndata[key] = value

        print("Writing partition {} to file".format(part_id), flush=True)

        part_dir = os.path.join(out_path, "part{}".format(part_id))
        node_feat_file = os.path.join(part_dir, "node_feat.dgl")
        edge_feat_file = os.path.join(part_dir, "edge_feat.dgl")
        part_graph_file = os.path.join(part_dir, "graph.dgl")
        part_metadata['part-' + str(part_id)] = {
            'node_feats': node_feat_file,
            'edge_feats': edge_feat_file,
            'part_graph': part_graph_file
        }

        os.makedirs(part_dir, mode=0o775, exist_ok=True)
        save_tensors(node_feat_file, part_graph.ndata)
        save_graphs(part_graph_file, [part_graph])

        ## release the partition that was just written
        del part_graph, ldt

    with open("{0}/{1}.json".format(out_path, graph_name), 'w') as outfile:
        json.dump(part_metadata, outfile, sort_keys=True, indent=4)

    print("Conversion libra2dgl completed !!!")