Example #1
    def inference(self, standalone, g, x, batch_size, device):
        """
        Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).
        g : the entire graph.
        x : the input of entire node set.

        The inference code is written so that it can handle any number of nodes and
        layers.
        """
        # During inference with sampling, multi-layer blocks are very inefficient because
        # many computations in the first few layers are repeated.
        # Therefore, we compute the representations of all nodes layer by layer.  The nodes
        # on each layer are, of course, split into batches.
        # TODO: can we standardize this?
        nodes = dgl.distributed.node_split(np.arange(g.number_of_nodes()),
                                           g.get_partition_book(),
                                           force_even=True)
        y = dgl.distributed.DistTensor((g.number_of_nodes(), self.n_hidden),
                                       th.float32,
                                       'h',
                                       persistent=True)
        for l, layer in enumerate(self.layers):
            if l == len(self.layers) - 1:
                y = dgl.distributed.DistTensor(
                    (g.number_of_nodes(), self.n_classes),
                    th.float32,
                    'h_last',
                    persistent=True)

            sampler = NeighborSampler(g, [-1],
                                      dgl.distributed.sample_neighbors,
                                      device,
                                      load_feat=False)
            print('|V|={}, eval batch size: {}'.format(g.number_of_nodes(),
                                                       batch_size))
            # Create PyTorch DataLoader for constructing blocks
            dataloader = DistDataLoader(dataset=nodes,
                                        batch_size=batch_size,
                                        collate_fn=sampler.sample_blocks,
                                        shuffle=False,
                                        drop_last=False)

            for blocks in tqdm.tqdm(dataloader):
                block = blocks[0].to(device)
                input_nodes = block.srcdata[dgl.NID]
                output_nodes = block.dstdata[dgl.NID]
                h = x[input_nodes].to(device)
                h_dst = h[:block.number_of_dst_nodes()]
                h = layer(block, (h, h_dst))
                if l != len(self.layers) - 1:
                    h = self.activation(h)
                    h = self.dropout(h)

                y[output_nodes] = h.cpu()

            x = y
            g.barrier()
        return y
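A minimal sketch of how this layer-wise inference might be driven during distributed evaluation; the evaluate_sketch helper and the 'features'/'labels'/'val_mask' ndata names are assumptions, not part of the example above:

def evaluate_sketch(model, g, batch_size, device):
    # `model` is assumed to be the bare DistSAGE module; pass `model.module`
    # if it is wrapped in DistributedDataParallel.
    model.eval()
    with th.no_grad():
        # The `standalone` flag is not used by the inference body shown above.
        pred = model.inference(False, g, g.ndata['features'], batch_size, device)
    # Score only the validation nodes owned by this trainer.
    val_nid = dgl.distributed.node_split(g.ndata['val_mask'], g.get_partition_book())
    labels = g.ndata['labels'][val_nid]
    return (th.argmax(pred[val_nid], dim=1) == labels).float().mean().item()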
Example #2
def start_dist_dataloader(rank, tmpdir, num_server, drop_last):
    import dgl
    import torch as th
    dgl.distributed.initialize("mp_ip_config.txt")
    gpb = None
    disable_shared_mem = num_server > 0
    if disable_shared_mem:
        _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
    num_nodes_to_sample = 202
    batch_size = 32
    train_nid = th.arange(num_nodes_to_sample)
    dist_graph = DistGraph("test_mp", gpb=gpb, part_config=tmpdir / 'test_sampling.json')

    orig_nid = F.arange(0, dist_graph.number_of_nodes())
    orig_eid = F.arange(0, dist_graph.number_of_edges())
    for i in range(num_server):
        part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_sampling.json', i)
        if 'orig_id' in part.ndata:
            orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
        if 'orig_id' in part.edata:
            orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']

    # Create sampler
    sampler = NeighborSampler(dist_graph, [5, 10],
                              dgl.distributed.sample_neighbors)

    # We need to test creating DistDataLoader multiple times.
    for i in range(2):
        # Create DataLoader for constructing blocks
        dataloader = DistDataLoader(
            dataset=train_nid.numpy(),
            batch_size=batch_size,
            collate_fn=sampler.sample_blocks,
            shuffle=False,
            drop_last=drop_last)

        groundtruth_g = CitationGraphDataset("cora")[0]
        max_nid = []

        for epoch in range(2):
            for idx, blocks in zip(range(0, num_nodes_to_sample, batch_size), dataloader):
                block = blocks[-1]
                o_src, o_dst = block.edges()
                src_nodes_id = block.srcdata[dgl.NID][o_src]
                dst_nodes_id = block.dstdata[dgl.NID][o_dst]
                max_nid.append(np.max(F.asnumpy(dst_nodes_id)))

                src_nodes_id = orig_nid[src_nodes_id]
                dst_nodes_id = orig_nid[dst_nodes_id]
                has_edges = groundtruth_g.has_edges_between(src_nodes_id, dst_nodes_id)
                assert np.all(F.asnumpy(has_edges))
                # assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size))
            if drop_last:
                assert np.max(max_nid) == num_nodes_to_sample - 1 - num_nodes_to_sample % batch_size
            else:
                assert np.max(max_nid) == num_nodes_to_sample - 1
    del dataloader
    dgl.distributed.exit_client()  # this is needed since there are two tests in one process
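The worker above assumes DGL servers are already serving the partitions under tmpdir. A hedged sketch of how the trainer processes might be spawned from the test body; the spawning code here is an assumption, not the actual test harness:

import multiprocessing as mp

def launch_trainers_sketch(tmpdir, num_server, num_trainers=1, drop_last=False):
    ctx = mp.get_context('spawn')
    procs = []
    for rank in range(num_trainers):
        # Each trainer process runs the worker defined above.
        p = ctx.Process(target=start_dist_dataloader,
                        args=(rank, tmpdir, num_server, drop_last))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()
        assert p.exitcode == 0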
Example #3
def start_client(rank, tmpdir, disable_shared_mem, num_workers, drop_last):
    import dgl
    import torch as th
    os.environ['DGL_DIST_MODE'] = 'distributed'
    dgl.distributed.initialize("mp_ip_config.txt", num_workers=4)
    gpb = None
    if disable_shared_mem:
        _, _, _, gpb, _ = load_partition(tmpdir / 'test_sampling.json', rank)
    num_nodes_to_sample = 202
    batch_size = 32
    train_nid = th.arange(num_nodes_to_sample)
    dist_graph = DistGraph("mp_ip_config.txt", "test_mp", gpb=gpb)

    # Create sampler
    sampler = NeighborSampler(dist_graph, [5, 10],
                              dgl.distributed.sample_neighbors)

    # We need to test creating DistDataLoader multiple times.
    for i in range(2):
        # Create DataLoader for constructing blocks
        dataloader = DistDataLoader(dataset=train_nid.numpy(),
                                    batch_size=batch_size,
                                    collate_fn=sampler.sample_blocks,
                                    shuffle=False,
                                    drop_last=drop_last)

        groundtruth_g = CitationGraphDataset("cora")[0]
        max_nid = []

        for epoch in range(2):
            for idx, blocks in zip(range(0, num_nodes_to_sample, batch_size),
                                   dataloader):
                block = blocks[-1]
                o_src, o_dst = block.edges()
                src_nodes_id = block.srcdata[dgl.NID][o_src]
                dst_nodes_id = block.dstdata[dgl.NID][o_dst]
                has_edges = groundtruth_g.has_edges_between(
                    src_nodes_id, dst_nodes_id)
                assert np.all(F.asnumpy(has_edges))
                print(np.unique(np.sort(F.asnumpy(dst_nodes_id))))
                max_nid.append(np.max(F.asnumpy(dst_nodes_id)))
                # assert np.all(np.unique(np.sort(F.asnumpy(dst_nodes_id))) == np.arange(idx, batch_size))
            if drop_last:
                assert np.max(
                    max_nid
                ) == num_nodes_to_sample - 1 - num_nodes_to_sample % batch_size
            else:
                assert np.max(max_nid) == num_nodes_to_sample - 1

    dgl.distributed.exit_client()  # this is needed since there are two tests in one process
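The NeighborSampler class used in these tests is defined elsewhere in the test file. A minimal sketch compatible with how it is called here (a collate_fn that returns a list of blocks, outermost layer first) could look like the following; the details are assumptions rather than the actual implementation:

class NeighborSamplerSketch:
    def __init__(self, g, fanouts, sample_neighbors):
        self.g = g
        self.fanouts = fanouts
        self.sample_neighbors = sample_neighbors

    def sample_blocks(self, seeds):
        seeds = th.LongTensor(np.asarray(seeds))
        blocks = []
        for fanout in self.fanouts:
            # Sample `fanout` in-neighbors of the current seed nodes.
            frontier = self.sample_neighbors(self.g, seeds, fanout, replace=True)
            # Turn the frontier into a block; its source nodes become the
            # seeds of the next (outer) layer.
            block = dgl.to_block(frontier, seeds)
            seeds = block.srcdata[dgl.NID]
            blocks.insert(0, block)
        return blocks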
Example #4
def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, in_feats, n_classes, g = data
    # Create sampler
    sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
                              dgl.distributed.sample_neighbors, device)

    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(
        dataset=train_nid.numpy(),
        batch_size=args.batch_size,
        collate_fn=sampler.sample_blocks,
        shuffle=True,
        drop_last=False)

    # Define model and optimizer
    model = DistSAGE(in_feats, args.num_hidden, n_classes, args.num_layers, F.relu, args.dropout)
    model = model.to(device)
    if not args.standalone:
        if args.num_gpus == -1:
            model = th.nn.parallel.DistributedDataParallel(model)
        else:
            dev_id = g.rank() % args.num_gpus
            model = th.nn.parallel.DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    train_size = th.sum(g.ndata['train_mask'][0:g.number_of_nodes()])

    # Training loop
    iter_tput = []
    profiler = Profiler()
    if not args.close_profiler:
        profiler.start()
    epoch = 0
    for epoch in range(args.num_epochs):
        tic = time.time()

        sample_time = 0
        fetch_data_time = 0
        copy_dev_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        num_seeds = 0
        num_inputs = 0
        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        for step, blocks in enumerate(dataloader):
            tic_step = time.time()
            sample_time += tic_step - start

            # The input nodes lie on the LHS of the first block.
            # The output nodes lie on the RHS of the last block.

            start = time.time()
            input_nodes = blocks[0].srcdata[dgl.NID]
            seeds = blocks[-1].dstdata[dgl.NID]
            batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, "cpu")
            batch_labels = batch_labels.long()
            fetch_data_time += time.time() - start

            start = time.time()
            num_seeds += len(blocks[-1].dstdata[dgl.NID])
            num_inputs += len(blocks[0].srcdata[dgl.NID])
            blocks = [block.to(device) for block in blocks]
            batch_labels = batch_labels.to(device)
            batch_inputs = batch_inputs.to(device)
            copy_dev_time += time.time() - start

            # Compute loss and prediction
            start = time.time()
            batch_pred = model(blocks, batch_inputs)
            loss = loss_fcn(batch_pred, batch_labels)
            forward_end = time.time()
            optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_time += forward_end - start
            backward_time += compute_end - forward_end

            optimizer.step()
            update_time += time.time() - compute_end

            step_t = time.time() - tic_step
            step_time.append(step_t)
            iter_tput.append(len(blocks[-1].dstdata[dgl.NID]) / step_t)
            if step % args.log_every == 0:
                acc = compute_acc(batch_pred, batch_labels)
                gpu_mem_alloc = th.cuda.max_memory_allocated() / 1000000 if th.cuda.is_available() else 0
                print('Part {} | Epoch {:05d} | Step {:05d} | Loss {:.4f} | Train Acc {:.4f} | Speed (samples/sec) {:.4f} | GPU {:.1f} MB | time {:.3f} s'.format(
                    g.rank(), epoch, step, loss.item(), acc.item(), np.mean(iter_tput[3:]), gpu_mem_alloc, np.sum(step_time[-args.log_every:])))
            start = time.time()

        toc = time.time()
        print('Part {}, Epoch Time(s): {:.4f}, sample: {:.4f}, fetch feats: {:.4f}, copy dev: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #seeds: {}, #inputs: {}'.format(
            g.rank(), toc - tic, sample_time, fetch_data_time, copy_dev_time, forward_time, backward_time, update_time, num_seeds, num_inputs))
        epoch += 1  # make the epoch count 1-indexed for the evaluation check below


        if epoch % args.eval_every == 0 and epoch != 0:
            start = time.time()
            val_acc, test_acc = evaluate(model.module, g, g.ndata['features'],
                                         g.ndata['labels'], val_nid, test_nid, args.batch_size_eval, device)
            print('Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(g.rank(), val_acc, test_acc,
                                                                                  time.time() - start))
    if not args.close_profiler:
        profiler.stop()
        print(profiler.output_text(unicode=True, color=True))
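The load_subtensor helper called in the training loop is defined elsewhere in this example. A minimal sketch consistent with how it is used above (pull features of the input nodes and labels of the seed nodes from the DistGraph) might be:

def load_subtensor_sketch(g, seeds, input_nodes, device):
    # g.ndata['features'] and g.ndata['labels'] are DistTensors; indexing them
    # with node IDs fetches the data from the distributed store.
    batch_inputs = g.ndata['features'][input_nodes].to(device)
    batch_labels = g.ndata['labels'][seeds].to(device)
    return batch_inputs, batch_labels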
Example #5
def run(args, device, data):
    g, num_classes, train_nid, val_nid, test_nid, labels, all_val_nid, all_test_nid = data
    num_rels = len(g.etypes)

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]
    sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(dataset=train_nid,
                                batch_size=args.batch_size,
                                collate_fn=sampler.sample_blocks,
                                shuffle=True,
                                drop_last=False)

    valid_sampler = NeighborSampler(g, val_fanouts,
                                    dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    valid_dataloader = DistDataLoader(dataset=val_nid,
                                      batch_size=args.batch_size,
                                      collate_fn=valid_sampler.sample_blocks,
                                      shuffle=False,
                                      drop_last=False)

    test_sampler = NeighborSampler(g, [-1] * args.n_layers,
                                   dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    test_dataloader = DistDataLoader(dataset=test_nid,
                                     batch_size=args.batch_size,
                                     collate_fn=test_sampler.sample_blocks,
                                     shuffle=False,
                                     drop_last=False)

    embed_layer = DistEmbedLayer(device,
                                 g,
                                 args.n_hidden,
                                 sparse_emb=args.sparse_embedding,
                                 dgl_sparse_emb=args.dgl_sparse,
                                 feat_name='feat')

    model = EntityClassify(device,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)
    model = model.to(device)

    if not args.standalone:
        if args.num_gpus == -1:
            model = DistributedDataParallel(model)
            # If there are dense parameters in the embedding layer
            # or we use PyTorch sparse embeddings.
            if len(embed_layer.node_projs) > 0 or not args.dgl_sparse:
                embed_layer = DistributedDataParallel(embed_layer)
        else:
            dev_id = g.rank() % args.num_gpus
            model = DistributedDataParallel(model,
                                            device_ids=[dev_id],
                                            output_device=dev_id)
            # If there are dense parameters in the embedding layer
            # or we use PyTorch sparse embeddings.
            if len(embed_layer.node_projs) > 0 or not args.dgl_sparse:
                embed_layer = embed_layer.to(device)
                embed_layer = DistributedDataParallel(embed_layer,
                                                      device_ids=[dev_id],
                                                      output_device=dev_id)

    if args.sparse_embedding:
        if args.dgl_sparse and args.standalone:
            emb_optimizer = dgl.distributed.SparseAdagrad(list(
                embed_layer.node_embeds.values()),
                                                          lr=args.sparse_lr)
            print('optimize DGL sparse embedding:',
                  embed_layer.node_embeds.keys())
        elif args.dgl_sparse:
            emb_optimizer = dgl.distributed.SparseAdagrad(list(
                embed_layer.module.node_embeds.values()),
                                                          lr=args.sparse_lr)
            print('optimize DGL sparse embedding:',
                  embed_layer.module.node_embeds.keys())
        elif args.standalone:
            emb_optimizer = th.optim.SparseAdam(list(
                embed_layer.node_embeds.parameters()),
                                                lr=args.sparse_lr)
            print('optimize Pytorch sparse embedding:',
                  embed_layer.node_embeds)
        else:
            emb_optimizer = th.optim.SparseAdam(list(
                embed_layer.module.node_embeds.parameters()),
                                                lr=args.sparse_lr)
            print('optimize Pytorch sparse embedding:',
                  embed_layer.module.node_embeds)

        dense_params = list(model.parameters())
        if args.standalone:
            dense_params += list(embed_layer.node_projs.parameters())
            print('optimize dense projection:', embed_layer.node_projs)
        else:
            dense_params += list(embed_layer.module.node_projs.parameters())
            print('optimize dense projection:', embed_layer.module.node_projs)
        optimizer = th.optim.Adam(dense_params,
                                  lr=args.lr,
                                  weight_decay=args.l2norm)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params,
                                  lr=args.lr,
                                  weight_decay=args.l2norm)

    # training loop
    print("start training...")
    for epoch in range(args.n_epochs):
        tic = time.time()

        sample_time = 0
        copy_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        number_train = 0

        step_time = []
        iter_t = []
        sample_t = []
        feat_copy_t = []
        forward_t = []
        backward_t = []
        update_t = []
        iter_tput = []

        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        for step, sample_data in enumerate(dataloader):
            seeds, blocks = sample_data
            number_train += seeds.shape[0]
            tic_step = time.time()
            sample_time += tic_step - start
            sample_t.append(tic_step - start)

            for block in blocks:
                gen_norm(block)
            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE])
            label = labels[seeds].to(device)
            copy_time = time.time()
            feat_copy_t.append(copy_time - tic_step)

            # forward
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, label)
            forward_end = time.time()

            # backward
            optimizer.zero_grad()
            if args.sparse_embedding:
                emb_optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_t.append(forward_end - copy_time)
            backward_t.append(compute_end - forward_end)

            # Update model parameters
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            update_t.append(time.time() - compute_end)
            step_t = time.time() - start
            step_time.append(step_t)

            train_acc = th.sum(logits.argmax(
                dim=1) == label).item() / len(seeds)

            if step % args.log_every == 0:
                print('[{}] Epoch {:05d} | Step {:05d} | Train acc {:.4f} | Loss {:.4f} | time {:.3f} s' \
                        '| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} | update {:.3f}'.format(
                    g.rank(), epoch, step, train_acc, loss.item(), np.sum(step_time[-args.log_every:]),
                    np.sum(sample_t[-args.log_every:]), np.sum(feat_copy_t[-args.log_every:]), np.sum(forward_t[-args.log_every:]),
                    np.sum(backward_t[-args.log_every:]), np.sum(update_t[-args.log_every:])))
            start = time.time()

        print(
            '[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #number_train: {}'
            .format(g.rank(), np.sum(step_time), np.sum(sample_t),
                    np.sum(feat_copy_t), np.sum(forward_t), np.sum(backward_t),
                    np.sum(update_t), number_train))
        epoch += 1

        start = time.time()
        g.barrier()
        val_acc, test_acc = evaluate(g, model, embed_layer, labels,
                                     valid_dataloader, test_dataloader,
                                     all_val_nid, all_test_nid)
        if val_acc >= 0:
            print('Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                val_acc, test_acc,
                time.time() - start))
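gen_norm, called on each block before the forward pass, is defined elsewhere in this example. A hedged sketch consistent with its use here (attach a per-edge normalization of 1 / in-degree of the destination node to edata['norm']) could be:

def gen_norm_sketch(block):
    _, v, eid = block.all_edges(form='all')
    # Count the in-degree of each destination node within this block.
    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
    degrees = count[inverse_index]
    norm = 1.0 / degrees.float()
    block.edata['norm'] = norm.unsqueeze(1)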
Example #6
def run(args, device, data):
    g, node_feats, num_of_ntype, num_classes, num_rels, \
        train_nid, val_nid, test_nid, labels, global_val_nid, global_test_nid = data

    fanouts = [int(fanout) for fanout in args.fanout.split(',')]
    val_fanouts = [int(fanout) for fanout in args.validation_fanout.split(',')]
    sampler = NeighborSampler(g, fanouts, dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    dataloader = DistDataLoader(dataset=train_nid.numpy(),
                                batch_size=args.batch_size,
                                collate_fn=sampler.sample_blocks,
                                shuffle=True,
                                drop_last=False)

    valid_sampler = NeighborSampler(g, val_fanouts,
                                    dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    valid_dataloader = DistDataLoader(dataset=val_nid.numpy(),
                                      batch_size=args.batch_size,
                                      collate_fn=valid_sampler.sample_blocks,
                                      shuffle=False,
                                      drop_last=False)

    test_sampler = NeighborSampler(g, [-1] * args.n_layers,
                                   dgl.distributed.sample_neighbors)
    # Create DataLoader for constructing blocks
    test_dataloader = DistDataLoader(dataset=test_nid.numpy(),
                                     batch_size=args.batch_size,
                                     collate_fn=test_sampler.sample_blocks,
                                     shuffle=False,
                                     drop_last=False)

    embed_layer = DistEmbedLayer(device,
                                 g,
                                 num_of_ntype,
                                 args.n_hidden,
                                 sparse_emb=args.sparse_embedding,
                                 dgl_sparse_emb=args.dgl_sparse)

    model = EntityClassify(device,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           low_mem=args.low_mem,
                           layer_norm=args.layer_norm)
    model = model.to(device)
    if not args.standalone:
        model = th.nn.parallel.DistributedDataParallel(model)
        if args.sparse_embedding and not args.dgl_sparse:
            embed_layer = DistributedDataParallel(embed_layer,
                                                  device_ids=None,
                                                  output_device=None)

    if args.sparse_embedding:
        if args.dgl_sparse:
            emb_optimizer = dgl.distributed.SparseAdagrad(
                [embed_layer.node_embeds], lr=args.sparse_lr)
        else:
            emb_optimizer = th.optim.SparseAdam(
                embed_layer.module.node_embeds.parameters(), lr=args.sparse_lr)
        optimizer = th.optim.Adam(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.l2norm)
    else:
        all_params = list(model.parameters()) + list(embed_layer.parameters())
        optimizer = th.optim.Adam(all_params,
                                  lr=args.lr,
                                  weight_decay=args.l2norm)

    # training loop
    print("start training...")
    for epoch in range(args.n_epochs):
        tic = time.time()

        sample_time = 0
        copy_time = 0
        forward_time = 0
        backward_time = 0
        update_time = 0
        number_train = 0

        step_time = []
        iter_t = []
        sample_t = []
        feat_copy_t = []
        forward_t = []
        backward_t = []
        update_t = []
        iter_tput = []

        start = time.time()
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        for step, sample_data in enumerate(dataloader):
            seeds, blocks = sample_data
            number_train += seeds.shape[0]
            tic_step = time.time()
            sample_time += tic_step - start
            sample_t.append(tic_step - start)

            feats = embed_layer(blocks[0].srcdata[dgl.NID],
                                blocks[0].srcdata[dgl.NTYPE], node_feats)
            label = labels[seeds]
            copy_time = time.time()
            feat_copy_t.append(copy_time - tic_step)

            # forward
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, label)
            forward_end = time.time()

            # backward
            optimizer.zero_grad()
            if args.sparse_embedding and not args.dgl_sparse:
                emb_optimizer.zero_grad()
            loss.backward()
            compute_end = time.time()
            forward_t.append(forward_end - copy_time)
            backward_t.append(compute_end - forward_end)

            # Update model parameters. Gradients are already aggregated across
            # workers by DistributedDataParallel during backward, so each
            # optimizer is stepped exactly once per iteration.
            optimizer.step()
            if args.sparse_embedding:
                emb_optimizer.step()
            update_t.append(time.time() - compute_end)
            step_t = time.time() - start
            step_time.append(step_t)

            if step % args.log_every == 0:
                print('[{}] Epoch {:05d} | Step {:05d} | Loss {:.4f} | time {:.3f} s' \
                        '| sample {:.3f} | copy {:.3f} | forward {:.3f} | backward {:.3f} | update {:.3f}'.format(
                    g.rank(), epoch, step, loss.item(), np.sum(step_time[-args.log_every:]),
                    np.sum(sample_t[-args.log_every:]), np.sum(feat_copy_t[-args.log_every:]), np.sum(forward_t[-args.log_every:]),
                    np.sum(backward_t[-args.log_every:]), np.sum(update_t[-args.log_every:])))
            start = time.time()

        print(
            '[{}]Epoch Time(s): {:.4f}, sample: {:.4f}, data copy: {:.4f}, forward: {:.4f}, backward: {:.4f}, update: {:.4f}, #number_train: {}'
            .format(g.rank(), np.sum(step_time), np.sum(sample_t),
                    np.sum(feat_copy_t), np.sum(forward_t), np.sum(backward_t),
                    np.sum(update_t), number_train))
        epoch += 1

        start = time.time()
        g.barrier()
        val_acc, test_acc = evaluate(g, model, embed_layer, labels,
                                     valid_dataloader, test_dataloader,
                                     node_feats, global_val_nid,
                                     global_test_nid)
        if val_acc >= 0:
            print('Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(
                val_acc, test_acc,
                time.time() - start))
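A hedged sketch of the surrounding main() that run() functions like the ones above typically expect: initialize the distributed runtime, build the DistGraph, and split node IDs per trainer. The argument names and ndata mask names are assumptions.

def main_sketch(args):
    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
    pb = g.get_partition_book()
    # Split node IDs evenly across trainers.
    train_nid = dgl.distributed.node_split(g.ndata['train_mask'], pb, force_even=True)
    val_nid = dgl.distributed.node_split(g.ndata['val_mask'], pb, force_even=True)
    test_nid = dgl.distributed.node_split(g.ndata['test_mask'], pb, force_even=True)
    device = th.device('cpu') if args.num_gpus == -1 \
        else th.device('cuda:%d' % (g.rank() % args.num_gpus))
    # Assemble the `data` tuple expected by the chosen run() variant, then
    # call run(args, device, data).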