def inference(self, standalone, g, x, batch_size, device):
    """
    Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).

    g : the entire graph.
    x : the input features of the entire node set.

    The inference code is written in a fashion that it could handle any number of nodes
    and layers.
    """
    # During inference with sampling, multi-layer blocks are very inefficient because
    # lots of computations in the first few layers are repeated.
    # Therefore, we compute the representation of all nodes layer by layer. The nodes
    # on each layer are of course split into batches.
    # TODO: can we standardize this?
    nodes = dgl.distributed.node_split(np.arange(g.number_of_nodes()),
                                       g.get_partition_book(), force_even=True)
    y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_hidden),
                                   th.float32, 'h', persistent=True)
    for l, layer in enumerate(self.layers):
        if l == len(self.layers) - 1:
            y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_classes),
                                           th.float32, 'h_last', persistent=True)

        sampler = NeighborSampler(g, [-1], dgl.distributed.sample_neighbors,
                                  device, load_feat=False)
        print('|V|={}, eval batch size: {}'.format(g.number_of_nodes(), batch_size))
        # Create PyTorch DataLoader for constructing blocks
        dataloader = DistDataLoader(dataset=nodes,
                                    batch_size=batch_size,
                                    collate_fn=sampler.sample_blocks,
                                    shuffle=False,
                                    drop_last=False)

        for blocks in tqdm.tqdm(dataloader):
            block = blocks[0].to(device)
            input_nodes = block.srcdata[dgl.NID]
            output_nodes = block.dstdata[dgl.NID]
            h = x[input_nodes].to(device)
            h_dst = h[:block.number_of_dst_nodes()]
            h = layer(block, (h, h_dst))
            if l != len(self.layers) - 1:
                h = self.activation(h)
                h = self.dropout(h)

            y[output_nodes] = h.cpu()

        x = y
        g.barrier()
    return y
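
# The training driver below calls an `evaluate` helper that wraps this
# full-neighbor inference. The helper is not shown in this section; the
# following is a minimal sketch inferred from the call site
# evaluate(model.module, g, g.ndata['features'], g.ndata['labels'], ...),
# assuming `compute_acc` returns the fraction of correct predictions.
# Note that `standalone` is unused in the inference body above, so any
# value can be passed through.
def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
    model.eval()
    with th.no_grad():
        # Layer-wise inference over all nodes, then slice out the splits.
        pred = model.inference(False, g, inputs, batch_size, device)
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid]), \
           compute_acc(pred[test_nid], labels[test_nid])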
def start_dist_dataloader(rank, tmpdir, num_server, drop_last):
    import dgl
    import torch as th
    dgl.distributed.initialize("mp_ip_config.txt")
    gpb = None
    disable_shared_mem = num_server > 0
    if disable_shared_mem:
        _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
    num_nodes_to_sample = 202
    batch_size = 32
    train_nid = th.arange(num_nodes_to_sample)
    dist_graph = DistGraph("test_mp", gpb=gpb,
                           part_config=tmpdir / 'test_sampling.json')

    # Map partition-local IDs back to the original node/edge IDs.
    orig_nid = F.arange(0, dist_graph.number_of_nodes())
    orig_eid = F.arange(0, dist_graph.number_of_edges())
    for i in range(num_server):
        part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
        if 'orig_id' in part.ndata:
            orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
        if 'orig_id' in part.edata:
            orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']

    # Create sampler
    sampler = NeighborSampler(dist_graph, [5, 10],
                              dgl.distributed.sample_neighbors)

    # We need to test creating DistDataLoader multiple times.
    for i in range(2):
        # Create DataLoader for constructing blocks
        dataloader = DistDataLoader(dataset=train_nid.numpy(),
                                    batch_size=batch_size,
                                    collate_fn=sampler.sample_blocks,
                                    shuffle=False,
                                    drop_last=drop_last)

        groundtruth_g = CitationGraphDataset("cora")[0]
        max_nid = []

        for epoch in range(2):
            for idx, blocks in zip(range(0, num_nodes_to_sample, batch_size), dataloader):
                block = blocks[-1]
                o_src, o_dst = block.edges()
                src_nodes_id = block.srcdata[dgl.NID][o_src]
                dst_nodes_id = block.dstdata[dgl.NID][o_dst]
                max_nid.append(np.max(F.asnumpy(dst_nodes_id)))

                src_nodes_id = orig_nid[src_nodes_id]
                dst_nodes_id = orig_nid[dst_nodes_id]
                # Every sampled edge must exist in the ground-truth graph.
                has_edges = groundtruth_g.has_edges_between(src_nodes_id, dst_nodes_id)
                assert np.all(F.asnumpy(has_edges))
                # assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size))
            if drop_last:
                assert np.max(max_nid) == num_nodes_to_sample - 1 - num_nodes_to_sample % batch_size
            else:
                assert np.max(max_nid) == num_nodes_to_sample - 1
        del dataloader
    dgl.distributed.exit_client()  # this is needed since there are two tests here in one process
def start_client(rank, tmpdir, disable_shared_mem, num_workers, drop_last):
    import dgl
    import torch as th
    os.environ['DGL_DIST_MODE'] = 'distributed'
    dgl.distributed.initialize("mp_ip_config.txt", num_workers=4)
    gpb = None
    if disable_shared_mem:
        _, _, _, gpb, _ = load_partition(tmpdir / 'test_sampling.json', rank)
    num_nodes_to_sample = 202
    batch_size = 32
    train_nid = th.arange(num_nodes_to_sample)
    dist_graph = DistGraph("mp_ip_config.txt", "test_mp", gpb=gpb)

    # Create sampler
    sampler = NeighborSampler(dist_graph, [5, 10],
                              dgl.distributed.sample_neighbors)

    # We need to test creating DistDataLoader multiple times.
    for i in range(2):
        # Create DataLoader for constructing blocks
        dataloader = DistDataLoader(dataset=train_nid.numpy(),
                                    batch_size=batch_size,
                                    collate_fn=sampler.sample_blocks,
                                    shuffle=False,
                                    drop_last=drop_last)
        groundtruth_g = CitationGraphDataset("cora")[0]
        max_nid = []

        for epoch in range(2):
            for idx, blocks in zip(range(0, num_nodes_to_sample, batch_size), dataloader):
                block = blocks[-1]
                o_src, o_dst = block.edges()
                src_nodes_id = block.srcdata[dgl.NID][o_src]
                dst_nodes_id = block.dstdata[dgl.NID][o_dst]
                # Every sampled edge must exist in the ground-truth graph.
                has_edges = groundtruth_g.has_edges_between(src_nodes_id, dst_nodes_id)
                assert np.all(F.asnumpy(has_edges))
                print(np.unique(np.sort(F.asnumpy(dst_nodes_id))))
                max_nid.append(np.max(F.asnumpy(dst_nodes_id)))
                # assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size))
            if drop_last:
                assert np.max(max_nid) == num_nodes_to_sample - 1 - num_nodes_to_sample % batch_size
            else:
                assert np.max(max_nid) == num_nodes_to_sample - 1
    dgl.distributed.exit_client()  # this is needed since there are two tests here in one process
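
# Both test clients above are meant to run alongside DGL graph-server
# processes. A minimal launcher sketch using a spawn context; `start_server`
# is an assumed helper (not shown here) that serves the partitions listed
# in tmpdir / 'test_sampling.json'.
import multiprocessing as mp

def test_dist_dataloader(tmpdir, num_server, drop_last):
    ctx = mp.get_context('spawn')
    procs = []
    for rank in range(num_server):
        procs.append(ctx.Process(target=start_server, args=(rank, tmpdir)))
    procs.append(ctx.Process(target=start_dist_dataloader,
                             args=(0, tmpdir, num_server, drop_last)))
    for p in procs:
        p.start()
    for p in procs:
        p.join()
        assert p.exitcode == 0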
def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, in_feats, n_classes, g = data
    # Create sampler
    sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
                              dgl.distributed.sample_neighbors, device)

    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(
        dataset=train_nid.numpy(),
        batch_size=args.batch_size,
        collate_fn=sampler.sample_blocks,
        shuffle=True,
        drop_last=False)

    # Define model and optimizer
    model = DistSAGE(in_feats, args.num_hidden, n_classes, args.num_layers,
                     F.relu, args.dropout)
    model = model.to(device)
    if not args.standalone:
        if args.num_gpus == -1:
            model = th.nn.parallel.DistributedDataParallel(model)
        else:
            dev_id = g.rank() % args.num_gpus
            model = th.nn.parallel.DistributedDataParallel(model, device_ids=[dev_id],
                                                           output_device=dev_id)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    train_size = th.sum(g.ndata['train_mask'][0:g.number_of_nodes()])

    # Training loop
    iter_tput = []
    profiler = Profiler()
    if not args.close_profiler:
        profiler.start()
    epoch = 0
    for epoch in range(args.num_epochs):
        tic = time.time()

        sample_time = 0
        fetch_data_time = 0
        copy_dev_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        num_seeds = 0
        num_inputs = 0
        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        for step, blocks in enumerate(dataloader):
            tic_step = time.time()
            sample_time += tic_step - start

            # The input nodes lie on the LHS of the first block.
            # The output nodes lie on the RHS of the last block.
            start = time.time()
            input_nodes = blocks[0].srcdata[dgl.NID]
            seeds = blocks[-1].dstdata[dgl.NID]
            batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, "cpu")
            batch_labels = batch_labels.long()
            fetch_data_time += time.time() - start

            start = time.time()
            num_seeds += len(blocks[-1].dstdata[dgl.NID])
            num_inputs += len(blocks[0].srcdata[dgl.NID])
            blocks = [block.to(device) for block in blocks]
            batch_labels = batch_labels.to(device)
            batch_inputs = batch_inputs.to(device)
            copy_dev_time += time.time() - start

            # Compute loss and prediction
            start = time.time()
            batch_pred = model(blocks, batch_inputs)
            loss = loss_fcn(batch_pred, batch_labels)
            forward_end = time.time()
            optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_time += forward_end - start
            backward_time += compute_end - forward_end

            optimizer.step()
            update_time += time.time() - compute_end

            step_t = time.time() - tic_step
            step_time.append(step_t)
            iter_tput.append(len(blocks[-1].dstdata[dgl.NID]) / step_t)
            if step % args.log_every == 0:
                acc = compute_acc(batch_pred, batch_labels)
                gpu_mem_alloc = th.cuda.max_memory_allocated() / 1000000 if th.cuda.is_available() else 0
                print('Part {} | Epoch {:05d} | Step {:05d} | Loss {:.4f} | Train Acc {:.4f} | '
                      'Speed (samples/sec) {:.4f} | GPU {:.1f} MiB | time {:.3f} s'.format(
                          g.rank(), epoch, step, loss.item(), acc.item(),
                          np.mean(iter_tput[3:]), gpu_mem_alloc,
                          np.sum(step_time[-args.log_every:])))
            start = time.time()

        toc = time.time()
        print('Part {}, Epoch Time(s): {:.4f}, sample: {:.4f}, fetch feats: {:.4f}, '
              'copy dev: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, '
              '#seeds: {}, #inputs: {}'.format(
                  g.rank(), toc - tic, sample_time, fetch_data_time, copy_dev_time,
                  forward_time, backward_time, update_time, num_seeds, num_inputs))
        epoch += 1

        if epoch % args.eval_every == 0 and epoch != 0:
            start = time.time()
            val_acc, test_acc = evaluate(model.module, g, g.ndata['features'],
                                         g.ndata['labels'], val_nid, test_nid,
                                         args.batch_size_eval, device)
            print('Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                g.rank(), val_acc, test_acc, time.time() - start))

    if not args.close_profiler:
        profiler.stop()
        print(profiler.output_text(unicode=True, color=True))
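
# A minimal sketch of the driver that prepares `data` for run(). Argument
# names (ip_config, graph_name, part_config, num_gpus, standalone) are
# assumptions mirroring the attributes run() reads from `args`; the DGL
# calls themselves (initialize, DistGraph, node_split) all appear above.
def main(args):
    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
    pb = g.get_partition_book()
    # Split the boolean masks evenly across trainers.
    train_nid = dgl.distributed.node_split(g.ndata['train_mask'], pb, force_even=True)
    val_nid = dgl.distributed.node_split(g.ndata['val_mask'], pb, force_even=True)
    test_nid = dgl.distributed.node_split(g.ndata['test_mask'], pb, force_even=True)
    device = th.device('cpu') if args.num_gpus == -1 \
        else th.device('cuda:%d' % (g.rank() % args.num_gpus))
    in_feats = g.ndata['features'].shape[1]
    # Assumes labels are 0-indexed integer class IDs.
    labels = g.ndata['labels'][np.arange(g.number_of_nodes())]
    n_classes = int(labels.max().item()) + 1
    data = train_nid, val_nid, test_nid, in_feats, n_classes, g
    run(args, device, data)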
def run(args, device, data):
    g, num_classes, train_nid, val_nid, test_nid, labels, all_val_nid, all_test_nid = data
    num_rels = len(g.etypes)

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]

    sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(
        dataset=train_nid,
        batch_size=args.batch_size,
        collate_fn=sampler.sample_blocks,
        shuffle=True,
        drop_last=False)

    valid_sampler = NeighborSampler(g, val_fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    valid_dataloader = DistDataLoader(
        dataset=val_nid,
        batch_size=args.batch_size,
        collate_fn=valid_sampler.sample_blocks,
        shuffle=False,
        drop_last=False)

    test_sampler = NeighborSampler(g, [-1] * args.n_layers, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    test_dataloader = DistDataLoader(
        dataset=test_nid,
        batch_size=args.batch_size,
        collate_fn=test_sampler.sample_blocks,
        shuffle=False,
        drop_last=False)

    embed_layer = DistEmbedLayer(device,
                                 g,
                                 args.n_hidden,
                                 sparse_emb=args.sparse_embedding,
                                 dgl_sparse_emb=args.dgl_sparse,
                                 feat_name='feat')

    model = EntityClassify(device,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)
    model = model.to(device)

    if not args.standalone:
        if args.num_gpus == -1:
            model = DistributedDataParallel(model)
            # If there are dense parameters in the embedding layer
            # or we use PyTorch sparse embeddings.
            if len(embed_layer.node_projs) > 0 or not args.dgl_sparse:
                embed_layer = DistributedDataParallel(embed_layer)
        else:
            dev_id = g.rank() % args.num_gpus
            model = DistributedDataParallel(model, device_ids=[dev_id],
                                            output_device=dev_id)
            # If there are dense parameters in the embedding layer
            # or we use PyTorch sparse embeddings.
            if len(embed_layer.node_projs) > 0 or not args.dgl_sparse:
                embed_layer = embed_layer.to(device)
                embed_layer = DistributedDataParallel(embed_layer,
                                                      device_ids=[dev_id],
                                                      output_device=dev_id)

    if args.sparse_embedding:
        if args.dgl_sparse and args.standalone:
            emb_optimizer = dgl.distributed.SparseAdagrad(
                list(embed_layer.node_embeds.values()), lr=args.sparse_lr)
            print('optimize DGL sparse embedding:', embed_layer.node_embeds.keys())
        elif args.dgl_sparse:
            emb_optimizer = dgl.distributed.SparseAdagrad(
                list(embed_layer.module.node_embeds.values()), lr=args.sparse_lr)
            print('optimize DGL sparse embedding:', embed_layer.module.node_embeds.keys())
        elif args.standalone:
            emb_optimizer = th.optim.SparseAdam(
                list(embed_layer.node_embeds.parameters()), lr=args.sparse_lr)
            print('optimize PyTorch sparse embedding:', embed_layer.node_embeds)
        else:
            emb_optimizer = th.optim.SparseAdam(
                list(embed_layer.module.node_embeds.parameters()), lr=args.sparse_lr)
            print('optimize PyTorch sparse embedding:', embed_layer.module.node_embeds)

        dense_params = list(model.parameters())
        if args.standalone:
            dense_params += list(embed_layer.node_projs.parameters())
            print('optimize dense projection:', embed_layer.node_projs)
        else:
            dense_params += list(embed_layer.module.node_projs.parameters())
            print('optimize dense projection:', embed_layer.module.node_projs)
        optimizer = th.optim.Adam(dense_params, lr=args.lr, weight_decay=args.l2norm)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    for epoch in range(args.n_epochs):
        tic = time.time()

        sample_time = 0
        copy_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        number_train = 0

        step_time = []
        iter_t = []
        sample_t = []
        feat_copy_t = []
        forward_t = []
        backward_t = []
        update_t = []
        iter_tput = []

        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        for step, sample_data in enumerate(dataloader):
            seeds, blocks = sample_data
            number_train += seeds.shape[0]
            tic_step = time.time()
            sample_time += tic_step - start
            sample_t.append(tic_step - start)

            for block in blocks:
                gen_norm(block)
            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE])
            label = labels[seeds].to(device)
            copy_time = time.time()
            feat_copy_t.append(copy_time - tic_step)

            # forward
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, label)
            forward_end = time.time()

            # backward
            optimizer.zero_grad()
            if args.sparse_embedding:
                emb_optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_t.append(forward_end - copy_time)
            backward_t.append(compute_end - forward_end)

            # Update model parameters
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            update_t.append(time.time() - compute_end)

            step_t = time.time() - start
            step_time.append(step_t)

            train_acc = th.sum(logits.argmax(dim=1) == label).item() / len(seeds)

            if step % args.log_every == 0:
                print('[{}] Epoch {:05d} | Step {:05d} | Train acc {:.4f} | Loss {:.4f} | '
                      'time {:.3f} s | sample {:.3f} | copy {:.3f} | forward {:.3f} | '
                      'backward {:.3f} | update {:.3f}'.format(
                          g.rank(), epoch, step, train_acc, loss.item(),
                          np.sum(step_time[-args.log_every:]),
                          np.sum(sample_t[-args.log_every:]),
                          np.sum(feat_copy_t[-args.log_every:]),
                          np.sum(forward_t[-args.log_every:]),
                          np.sum(backward_t[-args.log_every:]),
                          np.sum(update_t[-args.log_every:])))
            start = time.time()

        print('[{}] Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, '
              'forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #number_train: {}'.format(
                  g.rank(), np.sum(step_time), np.sum(sample_t), np.sum(feat_copy_t),
                  np.sum(forward_t), np.sum(backward_t), np.sum(update_t), number_train))
        epoch += 1

        start = time.time()
        g.barrier()
        val_acc, test_acc = evaluate(g, model, embed_layer, labels, valid_dataloader,
                                     test_dataloader, all_val_nid, all_test_nid)
        if val_acc >= 0:
            print('Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                val_acc, test_acc, time.time() - start))
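
# The loop above normalizes each sampled block with `gen_norm`, which is not
# defined in this section. A minimal sketch, under the assumption that the
# intended semantics is inverse in-degree normalization: each edge (u, v)
# gets weight 1 / indegree(v), stored in edata['norm'] for the R-GCN layers.
def gen_norm(block):
    _, v, eid = block.all_edges(form='all')
    # Count how many in-edges each destination node has.
    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
    degrees = count[inverse_index]
    norm = th.ones(eid.shape[0], device=block.device) / degrees
    block.edata['norm'] = norm.unsqueeze(1)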
def run(args, device, data):
    g, node_feats, num_of_ntype, num_classes, num_rels, \
        train_nid, val_nid, test_nid, labels, global_val_nid, global_test_nid = data

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]

    sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(
        dataset=train_nid.numpy(),
        batch_size=args.batch_size,
        collate_fn=sampler.sample_blocks,
        shuffle=True,
        drop_last=False)

    valid_sampler = NeighborSampler(g, val_fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    valid_dataloader = DistDataLoader(
        dataset=val_nid.numpy(),
        batch_size=args.batch_size,
        collate_fn=valid_sampler.sample_blocks,
        shuffle=False,
        drop_last=False)

    test_sampler = NeighborSampler(g, [-1] * args.n_layers, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    test_dataloader = DistDataLoader(
        dataset=test_nid.numpy(),
        batch_size=args.batch_size,
        collate_fn=test_sampler.sample_blocks,
        shuffle=False,
        drop_last=False)

    embed_layer = DistEmbedLayer(device,
                                 g,
                                 num_of_ntype,
                                 args.n_hidden,
                                 sparse_emb=args.sparse_embedding,
                                 dgl_sparse_emb=args.dgl_sparse)

    model = EntityClassify(device,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)
    model = model.to(device)

    if not args.standalone:
        model = th.nn.parallel.DistributedDataParallel(model)
        if args.sparse_embedding and not args.dgl_sparse:
            embed_layer = DistributedDataParallel(embed_layer, device_ids=None,
                                                  output_device=None)

    if args.sparse_embedding:
        if args.dgl_sparse:
            emb_optimizer = dgl.distributed.SparseAdagrad(
                [embed_layer.node_embeds], lr=args.sparse_lr)
        else:
            emb_optimizer = th.optim.SparseAdam(
                embed_layer.module.node_embeds.parameters(), lr=args.sparse_lr)
        optimizer = th.optim.Adam(model.parameters(), lr=args.lr,
                                  weight_decay=args.l2norm)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    for epoch in range(args.n_epochs):
        tic = time.time()

        sample_time = 0
        copy_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        number_train = 0

        step_time = []
        iter_t = []
        sample_t = []
        feat_copy_t = []
        forward_t = []
        backward_t = []
        update_t = []
        iter_tput = []

        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        for step, sample_data in enumerate(dataloader):
            seeds, blocks = sample_data
            number_train += seeds.shape[0]
            tic_step = time.time()
            sample_time += tic_step - start
            sample_t.append(tic_step - start)

            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE],
                                node_feats)
            label = labels[seeds]
            copy_time = time.time()
            feat_copy_t.append(copy_time - tic_step)

            # forward
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, label)
            forward_end = time.time()

            # backward
            optimizer.zero_grad()
            if args.sparse_embedding and not args.dgl_sparse:
                emb_optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_t.append(forward_end - copy_time)
            backward_t.append(compute_end - forward_end)

            # Update model parameters. DistributedDataParallel aggregates
            # gradients across trainers during backward, so a single
            # optimizer.step() applies the synchronized update.
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            update_t.append(time.time() - compute_end)

            step_t = time.time() - start
            step_time.append(step_t)

            if step % args.log_every == 0:
                print('[{}] Epoch {:05d} | Step {:05d} | Loss {:.4f} | time {:.3f} s '
                      '| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} '
                      '| update {:.3f}'.format(
                          g.rank(), epoch, step, loss.item(),
                          np.sum(step_time[-args.log_every:]),
                          np.sum(sample_t[-args.log_every:]),
                          np.sum(feat_copy_t[-args.log_every:]),
                          np.sum(forward_t[-args.log_every:]),
                          np.sum(backward_t[-args.log_every:]),
                          np.sum(update_t[-args.log_every:])))
            start = time.time()

        print('[{}] Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, '
              'forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #number_train: {}'.format(
                  g.rank(), np.sum(step_time), np.sum(sample_t), np.sum(feat_copy_t),
                  np.sum(forward_t), np.sum(backward_t), np.sum(update_t), number_train))
        epoch += 1

        start = time.time()
        g.barrier()
        val_acc, test_acc = evaluate(g, model, embed_layer, labels, valid_dataloader,
                                     test_dataloader, node_feats, global_val_nid,
                                     global_test_nid)
        if val_acc >= 0:
            print('Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                val_acc, test_acc, time.time() - start))
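
# The `evaluate` helper used by the loop above is not shown. A minimal
# sketch for this variant, inferred from its call site; it assumes a
# torch.distributed process group is initialized so local correct counts can
# be summed across trainers (the `val_acc >= 0` guard in run() suggests the
# real helper may instead return -1 on ranks that skip evaluation).
def evaluate(g, model, embed_layer, labels, valid_dataloader, test_dataloader,
             node_feats, global_val_nid, global_test_nid):
    model.eval()
    embed_layer.eval()

    def accuracy(dataloader, global_nid):
        correct = 0
        with th.no_grad():
            for seeds, blocks in dataloader:
                feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                    blocks[0].srcdata[dgl.NTYPE],
                                    node_feats)
                logits = model(blocks, feats)
                correct += th.sum(logits.argmax(dim=1) == labels[seeds]).item()
        # Sum local counts so every rank reports accuracy on the full set.
        total = th.tensor([correct])
        th.distributed.all_reduce(total)
        return total.item() / len(global_nid)

    val_acc = accuracy(valid_dataloader, global_val_nid)
    test_acc = accuracy(test_dataloader, global_test_nid)
    model.train()
    embed_layer.train()
    return val_acc, test_acc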