def run(proc_id, n_gpus, args, devices, dataset):
    dev_id = devices[proc_id]
    g, num_of_ntype, num_classes, num_rels, target_idx, \
        train_idx, val_idx, test_idx, labels = dataset
    node_tids = g.ndata[dgl.NTYPE]
    sampler = NeighborSampler(g, target_idx, [args.fanout] * args.n_layers)
    loader = DataLoader(dataset=train_idx.numpy(),
                        batch_size=args.batch_size,
                        collate_fn=sampler.sample_blocks,
                        shuffle=True,
                        num_workers=args.num_workers)

    # validation sampler (full neighborhood)
    val_sampler = NeighborSampler(g, target_idx, [None] * args.n_layers)
    val_loader = DataLoader(dataset=val_idx.numpy(),
                            batch_size=args.batch_size,
                            collate_fn=val_sampler.sample_blocks,
                            shuffle=False,
                            num_workers=args.num_workers)

    # test sampler (full neighborhood)
    test_sampler = NeighborSampler(g, target_idx, [None] * args.n_layers)
    test_loader = DataLoader(dataset=test_idx.numpy(),
                             batch_size=args.batch_size,
                             collate_fn=test_sampler.sample_blocks,
                             shuffle=False,
                             num_workers=args.num_workers)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        backend = 'nccl'
        # sparse gradients are not supported by NCCL, so fall back to gloo
        if args.sparse_embedding:
            backend = 'gloo'
        th.distributed.init_process_group(backend=backend,
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)

    # node features
    # None for one-hot feature; if not None, it should be the feature tensor.
    node_feats = [None] * num_of_ntype
    embed_layer = RelGraphEmbedLayer(dev_id,
                                     g.number_of_nodes(),
                                     node_tids,
                                     num_of_ntype,
                                     node_feats,
                                     args.n_hidden,
                                     sparse_emb=args.sparse_embedding)

    # create model
    model = EntityClassify(dev_id,
                           g.number_of_nodes(),
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem)

    if dev_id >= 0:
        th.cuda.set_device(dev_id)
        labels = labels.to(dev_id)
        model.cuda(dev_id)
        # the embedding layer may not fit into GPU memory; keep it on CPU in mix_cpu_gpu mode
        if args.mix_cpu_gpu is False:
            embed_layer.cuda(dev_id)

    if n_gpus > 1:
        embed_layer = DistributedDataParallel(
            embed_layer, device_ids=[dev_id], output_device=dev_id)
        model = DistributedDataParallel(
            model, device_ids=[dev_id], output_device=dev_id)

    # optimizer
    if args.sparse_embedding:
        optimizer = th.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2norm)
        emb_optimizer = th.optim.SparseAdam(embed_layer.parameters(), lr=args.lr)
    else:
        all_params = itertools.chain(model.parameters(), embed_layer.parameters())
        optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    for epoch in range(args.n_epochs):
        model.train()

        for i, sample_data in enumerate(loader):
            seeds, blocks = sample_data
            t0 = time.time()
            feats = embed_layer(blocks[0].srcdata[dgl.NID].to(dev_id),
                                blocks[0].srcdata[dgl.NTYPE].to(dev_id),
                                node_feats)
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, labels[seeds])
            t1 = time.time()
            # zero the gradients for every mini-batch
            optimizer.zero_grad()
            if args.sparse_embedding:
                emb_optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            t2 = time.time()

            forward_time.append(t1 - t0)
            backward_time.append(t2 - t1)
            print("Epoch {:05d}:{:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
                  format(epoch, i, forward_time[-1], backward_time[-1]))
            train_acc = th.sum(logits.argmax(dim=1) == labels[seeds]).item() / len(seeds)
            print("Train Accuracy: {:.4f} | Train Loss: {:.4f}".format(train_acc, loss.item()))

        # only process 0 will do the evaluation
        if proc_id == 0:
            model.eval()
            eval_logits = []
            eval_seeds = []
            for i, sample_data in enumerate(val_loader):
                seeds, blocks = sample_data
                feats = embed_layer(blocks[0].srcdata[dgl.NID].to(dev_id),
                                    blocks[0].srcdata[dgl.NTYPE].to(dev_id),
                                    node_feats)
                logits = model(blocks, feats)
                eval_logits.append(logits)
                eval_seeds.append(seeds)
            eval_logits = th.cat(eval_logits)
            eval_seeds = th.cat(eval_seeds)
            val_loss = F.cross_entropy(eval_logits, labels[eval_seeds])
            val_acc = th.sum(eval_logits.argmax(dim=1) == labels[eval_seeds]).item() / len(eval_seeds)
            print("Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
                val_acc, val_loss.item()))
        if n_gpus > 1:
            th.distributed.barrier()
        print()

    # only process 0 will do the testing
    if proc_id == 0:
        model.eval()
        test_logits = []
        test_seeds = []
        for i, sample_data in enumerate(test_loader):
            seeds, blocks = sample_data
            feats = embed_layer(blocks[0].srcdata[dgl.NID].to(dev_id),
                                blocks[0].srcdata[dgl.NTYPE].to(dev_id),
                                [None] * num_of_ntype)
            logits = model(blocks, feats)
            test_logits.append(logits)
            test_seeds.append(seeds)
        test_logits = th.cat(test_logits)
        test_seeds = th.cat(test_seeds)
        test_loss = F.cross_entropy(test_logits, labels[test_seeds])
        test_acc = th.sum(test_logits.argmax(dim=1) == labels[test_seeds]).item() / len(test_seeds)
        print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss.item()))
        print()

    print("{}/{} Mean forward time: {:4f}".format(
        proc_id, n_gpus, np.mean(forward_time[len(forward_time) // 4:])))
    print("{}/{} Mean backward time: {:4f}".format(
        proc_id, n_gpus, np.mean(backward_time[len(backward_time) // 4:])))
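

# ---------------------------------------------------------------------------
# Hypothetical launcher sketch (not part of the original listing). The run()
# above expects one training process per GPU, with `proc_id` indexing into
# `devices`. A minimal driver could look roughly like this; the flag name
# `args.gpu` and the helper name `main_sketch` are assumptions for
# illustration only.
# ---------------------------------------------------------------------------
import torch.multiprocessing as mp

def main_sketch(args, dataset):
    devices = list(map(int, args.gpu.split(',')))   # e.g. "--gpu 0,1,2,3" (assumed flag)
    n_gpus = len(devices)
    if n_gpus == 1:
        # single-process training on one GPU
        run(0, n_gpus, args, devices, dataset)
    else:
        # spawn one training process per GPU and wait for all of them
        procs = []
        for proc_id in range(n_gpus):
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, args, devices, dataset))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()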
def run(proc_id, n_gpus, args, devices, dataset, split, queue=None):
    dev_id = devices[proc_id]
    g, node_feats, num_of_ntype, num_classes, num_rels, target_idx, \
        train_idx, val_idx, test_idx, labels = dataset
    if split is not None:
        train_seed, val_seed, test_seed = split
        train_idx = train_idx[train_seed]
        val_idx = val_idx[val_seed]
        test_idx = test_idx[test_seed]

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    node_tids = g.ndata[dgl.NTYPE]
    sampler = NeighborSampler(g, target_idx, fanouts)
    loader = DataLoader(dataset=train_idx.numpy(),
                        batch_size=args.batch_size,
                        collate_fn=sampler.sample_blocks,
                        shuffle=True,
                        num_workers=args.num_workers)

    # validation sampler (full neighborhood)
    val_sampler = NeighborSampler(g, target_idx, [None] * args.n_layers)
    val_loader = DataLoader(dataset=val_idx.numpy(),
                            batch_size=args.eval_batch_size,
                            collate_fn=val_sampler.sample_blocks,
                            shuffle=False,
                            num_workers=args.num_workers)

    # test sampler (full neighborhood)
    test_sampler = NeighborSampler(g, target_idx, [None] * args.n_layers)
    test_loader = DataLoader(dataset=test_idx.numpy(),
                             batch_size=args.eval_batch_size,
                             collate_fn=test_sampler.sample_blocks,
                             shuffle=False,
                             num_workers=args.num_workers)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        backend = 'nccl'
        # when using sparse embedding or the mix_cpu_gpu mode, the embedding
        # parameters cannot be stored on GPU, so fall back to gloo
        if args.sparse_embedding or args.mix_cpu_gpu:
            backend = 'gloo'
        th.distributed.init_process_group(backend=backend,
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)

    # node features
    # None for one-hot feature; if not None, it should be the feature tensor.
    embed_layer = RelGraphEmbedLayer(dev_id,
                                     g.number_of_nodes(),
                                     node_tids,
                                     num_of_ntype,
                                     node_feats,
                                     args.n_hidden,
                                     sparse_emb=args.sparse_embedding)

    # create model
    # all model params are on the device
    model = EntityClassify(dev_id,
                           g.number_of_nodes(),
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)

    if dev_id >= 0 and n_gpus == 1:
        th.cuda.set_device(dev_id)
        labels = labels.to(dev_id)
        model.cuda(dev_id)
        # the embedding layer may not fit into GPU memory; keep it on CPU in mix_cpu_gpu mode
        if args.mix_cpu_gpu is False:
            embed_layer.cuda(dev_id)

    if n_gpus > 1:
        labels = labels.to(dev_id)
        model.cuda(dev_id)
        if args.mix_cpu_gpu:
            embed_layer = DistributedDataParallel(
                embed_layer, device_ids=None, output_device=None)
        else:
            embed_layer.cuda(dev_id)
            embed_layer = DistributedDataParallel(
                embed_layer, device_ids=[dev_id], output_device=dev_id)
        model = DistributedDataParallel(
            model, device_ids=[dev_id], output_device=dev_id)

    # optimizer
    if args.sparse_embedding:
        dense_params = list(model.parameters())
        if args.node_feats:
            if n_gpus > 1:
                dense_params += list(embed_layer.module.embeds.parameters())
            else:
                dense_params += list(embed_layer.embeds.parameters())
        optimizer = th.optim.Adam(dense_params, lr=args.lr, weight_decay=args.l2norm)
        if n_gpus > 1:
            emb_optimizer = th.optim.SparseAdam(
                embed_layer.module.node_embeds.parameters(), lr=args.lr)
        else:
            emb_optimizer = th.optim.SparseAdam(
                embed_layer.node_embeds.parameters(), lr=args.lr)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    for epoch in range(args.n_epochs):
        model.train()
        embed_layer.train()

        for i, sample_data in enumerate(loader):
            seeds, blocks = sample_data
            t0 = time.time()
            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, labels[seeds])
            t1 = time.time()
            optimizer.zero_grad()
            if args.sparse_embedding:
                emb_optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            t2 = time.time()

            forward_time.append(t1 - t0)
            backward_time.append(t2 - t1)
            train_acc = th.sum(logits.argmax(dim=1) == labels[seeds]).item() / len(seeds)
            # report every 100 mini-batches on process 0
            if i % 100 == 0 and proc_id == 0:
                print("Train Accuracy: {:.4f} | Train Loss: {:.4f}".format(
                    train_acc, loss.item()))
                print("Epoch {:05d}:{:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
                      format(epoch, i, forward_time[-1], backward_time[-1]))

        # validation: every process evaluates its shard when a result queue is
        # used; otherwise only process 0 evaluates
        if (queue is not None) or (proc_id == 0):
            val_logits, val_seeds = evaluate(model, embed_layer, val_loader, node_feats)
            if queue is not None:
                queue.put((val_logits, val_seeds))

            # gather evaluation results from multiple processes
            if proc_id == 0:
                if queue is not None:
                    val_logits = []
                    val_seeds = []
                    for i in range(n_gpus):
                        log = queue.get()
                        val_l, val_s = log
                        val_logits.append(val_l)
                        val_seeds.append(val_s)
                    val_logits = th.cat(val_logits)
                    val_seeds = th.cat(val_seeds)
                val_loss = F.cross_entropy(val_logits, labels[val_seeds].cpu()).item()
                val_acc = th.sum(
                    val_logits.argmax(dim=1) == labels[val_seeds].cpu()).item() / len(val_seeds)
                print("Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
                    val_acc, val_loss))
        if n_gpus > 1:
            th.distributed.barrier()

    # test after training; process 0 reports the result
    if (queue is not None) or (proc_id == 0):
        test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
        if queue is not None:
            queue.put((test_logits, test_seeds))

        # gather evaluation results from multiple processes
        if proc_id == 0:
            if queue is not None:
                test_logits = []
                test_seeds = []
                for i in range(n_gpus):
                    log = queue.get()
                    test_l, test_s = log
                    test_logits.append(test_l)
                    test_seeds.append(test_s)
                test_logits = th.cat(test_logits)
                test_seeds = th.cat(test_seeds)
            test_loss = F.cross_entropy(test_logits, labels[test_seeds].cpu()).item()
            test_acc = th.sum(
                test_logits.argmax(dim=1) == labels[test_seeds].cpu()).item() / len(test_seeds)
            print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss))
            print()

    # sync for test
    if n_gpus > 1:
        th.distributed.barrier()

    print("{}/{} Mean forward time: {:4f}".format(
        proc_id, n_gpus, np.mean(forward_time[len(forward_time) // 4:])))
    print("{}/{} Mean backward time: {:4f}".format(
        proc_id, n_gpus, np.mean(backward_time[len(backward_time) // 4:])))
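

# ---------------------------------------------------------------------------
# Hypothetical evaluate() helper (assumed; not shown in this listing). The
# second and third run() variants call evaluate(model, embed_layer, loader,
# node_feats) and then concatenate the returned logits and seed ids on CPU,
# so the helper presumably looks roughly like this. The node-type keys passed
# to embed_layer mirror the training loop above and may differ between
# versions of the example.
# ---------------------------------------------------------------------------
def evaluate(model, embed_layer, eval_loader, node_feats):
    model.eval()
    embed_layer.eval()
    eval_logits = []
    eval_seeds = []
    with th.no_grad():
        for sample_data in eval_loader:
            seeds, blocks = sample_data
            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits = model(blocks, feats)
            # keep everything on CPU so the caller can gather across processes
            eval_logits.append(logits.detach().cpu())
            eval_seeds.append(th.as_tensor(seeds).cpu())
    return th.cat(eval_logits), th.cat(eval_seeds)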
def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
    dev_id = devices[proc_id] if devices[proc_id] != 'cpu' else -1
    g, node_feats, num_of_ntype, num_classes, num_rels, target_idx, \
        train_idx, val_idx, test_idx, labels = dataset
    if split is not None:
        train_seed, val_seed, test_seed = split
        train_idx = train_idx[train_seed]
        val_idx = val_idx[val_seed]
        test_idx = test_idx[test_seed]

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    node_tids = g.ndata[dgl.NTYPE]
    sampler = NeighborSampler(g, target_idx, fanouts)
    loader = DataLoader(dataset=train_idx.numpy(),
                        batch_size=args.batch_size,
                        collate_fn=sampler.sample_blocks,
                        shuffle=True,
                        num_workers=args.num_workers)

    # validation sampler
    val_sampler = NeighborSampler(g, target_idx, fanouts)
    val_loader = DataLoader(dataset=val_idx.numpy(),
                            batch_size=args.batch_size,
                            collate_fn=val_sampler.sample_blocks,
                            shuffle=False,
                            num_workers=args.num_workers)

    # test sampler (full neighborhood)
    test_sampler = NeighborSampler(g, target_idx, [None] * args.n_layers)
    test_loader = DataLoader(dataset=test_idx.numpy(),
                             batch_size=args.eval_batch_size,
                             collate_fn=test_sampler.sample_blocks,
                             shuffle=False,
                             num_workers=args.num_workers)

    world_size = n_gpus
    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        backend = 'nccl'
        # without DGL sparse embedding the embedding parameters stay on CPU,
        # so fall back to gloo
        if args.dgl_sparse is False:
            backend = 'gloo'
        print("backend using {}".format(backend))
        th.distributed.init_process_group(backend=backend,
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)

    # node features
    # None for one-hot feature; if not None, it should be the feature tensor.
    embed_layer = RelGraphEmbedLayer(dev_id,
                                     g.number_of_nodes(),
                                     node_tids,
                                     num_of_ntype,
                                     node_feats,
                                     args.n_hidden,
                                     dgl_sparse=args.dgl_sparse)

    # create model
    # all model params are on the device
    model = EntityClassify(dev_id,
                           g.number_of_nodes(),
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)

    if dev_id >= 0 and n_gpus == 1:
        th.cuda.set_device(dev_id)
        labels = labels.to(dev_id)
        model.cuda(dev_id)
        # with dgl_sparse emb, only the node embedding stays out of GPU memory
        if args.dgl_sparse:
            embed_layer.cuda(dev_id)

    if n_gpus > 1:
        labels = labels.to(dev_id)
        model.cuda(dev_id)
        model = DistributedDataParallel(
            model, device_ids=[dev_id], output_device=dev_id)
        if args.dgl_sparse:
            embed_layer.cuda(dev_id)
            if len(list(embed_layer.parameters())) > 0:
                embed_layer = DistributedDataParallel(
                    embed_layer, device_ids=[dev_id], output_device=dev_id)
        else:
            if len(list(embed_layer.parameters())) > 0:
                embed_layer = DistributedDataParallel(
                    embed_layer, device_ids=None, output_device=None)

    # optimizer
    dense_params = list(model.parameters())
    if args.node_feats:
        if n_gpus > 1:
            dense_params += list(embed_layer.module.embeds.parameters())
        else:
            dense_params += list(embed_layer.embeds.parameters())
    optimizer = th.optim.Adam(dense_params, lr=args.lr, weight_decay=args.l2norm)

    if args.dgl_sparse:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)
        if n_gpus > 1 and isinstance(embed_layer, DistributedDataParallel):
            dgl_emb = embed_layer.module.dgl_emb
        else:
            dgl_emb = embed_layer.dgl_emb
        emb_optimizer = dgl.optim.SparseAdam(
            params=dgl_emb, lr=args.sparse_lr, eps=1e-8) if len(dgl_emb) > 0 else None
    else:
        if n_gpus > 1:
            embs = list(embed_layer.module.node_embeds.parameters())
        else:
            embs = list(embed_layer.node_embeds.parameters())
        emb_optimizer = th.optim.SparseAdam(embs, lr=args.sparse_lr) if len(embs) > 0 else None

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []

    train_time = 0
    validation_time = 0
    test_time = 0
    last_val_acc = 0.0
    do_test = False

    if n_gpus > 1 and n_cpus - args.num_workers > 0:
        th.set_num_threads(n_cpus - args.num_workers)

    for epoch in range(args.n_epochs):
        tstart = time.time()
        model.train()
        embed_layer.train()

        for i, sample_data in enumerate(loader):
            seeds, blocks = sample_data
            t0 = time.time()
            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata['ntype'],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, labels[seeds])
            t1 = time.time()
            optimizer.zero_grad()
            if emb_optimizer is not None:
                emb_optimizer.zero_grad()

            loss.backward()
            if emb_optimizer is not None:
                emb_optimizer.step()
            optimizer.step()
            t2 = time.time()

            forward_time.append(t1 - t0)
            backward_time.append(t2 - t1)
            train_acc = th.sum(logits.argmax(dim=1) == labels[seeds]).item() / len(seeds)
            # report every 100 mini-batches on process 0
            if i % 100 == 0 and proc_id == 0:
                print("Train Accuracy: {:.4f} | Train Loss: {:.4f}".format(
                    train_acc, loss.item()))
        gc.collect()
        print("Epoch {:05d}:{:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}".
              format(epoch, args.n_epochs, forward_time[-1], backward_time[-1]))
        tend = time.time()
        train_time += (tend - tstart)

        def collect_eval():
            eval_logits = []
            eval_seeds = []
            for i in range(n_gpus):
                log = queue.get()
                eval_l, eval_s = log
                eval_logits.append(eval_l)
                eval_seeds.append(eval_s)
            eval_logits = th.cat(eval_logits)
            eval_seeds = th.cat(eval_seeds)
            eval_loss = F.cross_entropy(eval_logits, labels[eval_seeds].cpu()).item()
            eval_acc = th.sum(
                eval_logits.argmax(dim=1) == labels[eval_seeds].cpu()).item() / len(eval_seeds)
            return eval_loss, eval_acc

        vstart = time.time()
        if (queue is not None) or (proc_id == 0):
            val_logits, val_seeds = evaluate(model, embed_layer, val_loader, node_feats)
            if queue is not None:
                queue.put((val_logits, val_seeds))

            # gather evaluation results from multiple processes
            if proc_id == 0:
                val_loss, val_acc = collect_eval() if queue is not None else \
                    (F.cross_entropy(val_logits, labels[val_seeds].cpu()).item(),
                     th.sum(val_logits.argmax(dim=1) == labels[val_seeds].cpu()).item() / len(val_seeds))

                do_test = val_acc > last_val_acc
                last_val_acc = val_acc
                print("Validation Accuracy: {:.4f} | Validation loss: {:.4f}".format(
                    val_acc, val_loss))

        if n_gpus > 1:
            th.distributed.barrier()
            # broadcast the do_test decision from process 0 to the others
            if proc_id == 0:
                for i in range(1, n_gpus):
                    queue.put(do_test)
            else:
                do_test = queue.get()

        vend = time.time()
        validation_time += (vend - vstart)

        if epoch > 0 and do_test:
            tstart = time.time()
            if (queue is not None) or (proc_id == 0):
                test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
                if queue is not None:
                    queue.put((test_logits, test_seeds))

                # gather evaluation results from multiple processes
                if proc_id == 0:
                    test_loss, test_acc = collect_eval() if queue is not None else \
                        (F.cross_entropy(test_logits, labels[test_seeds].cpu()).item(),
                         th.sum(test_logits.argmax(dim=1) == labels[test_seeds].cpu()).item() / len(test_seeds))
                    print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss))
                    print()
            tend = time.time()
            test_time += (tend - tstart)

        # sync for test
        if n_gpus > 1:
            th.distributed.barrier()

    print("{}/{} Mean forward time: {:4f}".format(
        proc_id, n_gpus, np.mean(forward_time[len(forward_time) // 4:])))
    print("{}/{} Mean backward time: {:4f}".format(
        proc_id, n_gpus, np.mean(backward_time[len(backward_time) // 4:])))

    if proc_id == 0:
        print("Final Test Accuracy: {:.4f} | Test loss: {:.4f}".format(test_acc, test_loss))
        print("Train {}s, valid {}s, test {}s".format(train_time, validation_time, test_time))
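

# ---------------------------------------------------------------------------
# Hypothetical multi-process driver for the later run() variants (assumed).
# These variants additionally take a per-rank `split` of the train/val/test
# index sets and an mp.Queue used to gather evaluation results and broadcast
# the do_test decision; the chunking below simply mirrors how run() consumes
# `split`. The helper name `launch_sketch` is an assumption for illustration.
# ---------------------------------------------------------------------------
import torch.multiprocessing as mp

def launch_sketch(args, devices, dataset, n_cpus):
    n_gpus = len(devices)
    # positions follow the dataset unpacking at the top of run()
    train_idx, val_idx, test_idx = dataset[6], dataset[7], dataset[8]
    queue = mp.Queue(n_gpus)
    # give each rank a disjoint slice of the training/validation/test seeds
    train_splits = th.chunk(th.arange(len(train_idx)), n_gpus)
    val_splits = th.chunk(th.arange(len(val_idx)), n_gpus)
    test_splits = th.chunk(th.arange(len(test_idx)), n_gpus)
    procs = []
    for proc_id in range(n_gpus):
        split = (train_splits[proc_id], val_splits[proc_id], test_splits[proc_id])
        p = mp.Process(target=run,
                       args=(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()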