Example #1
    def test_queue_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]

        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        # The first return value should be the allreduce work item.
        self.assertTrue(isinstance(work, c10d.Work))
        # The second return value will be the finished allreduced gradients.
        self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

        # Wait for the allreduce to finish.
        work.wait()

        # The expected result of the allreduce should be the average
        self.assertEqual(local_grad_sum,
                         torch.ones(10) * (self.world_size + 1) / 2.0)
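
Note: most of these snippets rely on a gpus_for_rank helper from PyTorch's
distributed test utilities that is not shown in the listing. Roughly, it splits
the GPUs visible to the process evenly across the ranks; a minimal sketch of
that behavior (assuming torch is already imported) is:

def gpus_for_rank(world_size):
    # Divide the visible GPUs into world_size equally sized, contiguous
    # groups and return the list of groups, indexed by rank.
    visible_devices = list(range(torch.cuda.device_count()))
    gpus_per_process = torch.cuda.device_count() // world_size
    return [
        visible_devices[rank * gpus_per_process:(rank + 1) * gpus_per_process]
        for rank in range(world_size)
    ]
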
Example #2
 def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
     store = c10d.FileStore(self.file_name, self.world_size)
     c10d.init_process_group(
         backend="nccl",
         rank=self.rank,
         world_size=self.world_size,
         store=store,
         timeout=timedelta(seconds=timeout),
     )
     if with_new_group:
         pg = c10d.new_group(backend="nccl",
                             timeout=timedelta(seconds=timeout))
     else:
         _pg = c10d.ProcessGroupNCCL(store,
                                     self.rank,
                                     self.world_size,
                                     timeout=timedelta(seconds=timeout))
         pg = c10d._create_process_group_wrapper(
             _pg,
             "unused",
             store,
             self.rank,
             self.world_size,
             timeout=timeout,
         )
     return pg
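
For context, the wrapper returned by c10d._create_process_group_wrapper checks
that every rank issues a consistent collective (same op, tensor shapes, and
dtypes) before delegating to the wrapped NCCL group. A hypothetical way to
exercise the helper above inside a test method, assuming one GPU per rank,
might be:

# Illustrative usage inside a test method of the same class:
pg = self._create_wrapper_pg(with_new_group=False, timeout=5.0)
tensor = torch.ones(2, 2).cuda(self.rank)
pg.allreduce([tensor]).wait()
# A mismatched collective (e.g. different shapes on different ranks) would
# instead raise an error describing the inconsistency.
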
Example #3
    def test_is_last_hook(self):

        store = dist.FileStore(self.file_name, self.world_size)
        process_group = dist.ProcessGroupNCCL(store, self.rank,
                                              self.world_size)

        def hook(flags, bucket):
            flags.append(bucket.is_last())
            fut = torch.futures.Future()
            fut.set_result(bucket.buffer())
            return fut

        flags = []
        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        model = nn.Sequential(
            nn.Linear(2, 4000, bias=False),
            *[nn.Linear(4000, 4000, bias=False) for _ in range(10)])
        gpu_model = DistributedDataParallel(
            model.to(device_id),
            device_ids=[device_id],
            process_group=process_group,
        )
        gpu_model.register_comm_hook(state=flags, hook=hook)
        input = torch.randn(10, 2)
        gpu_model(input).sum().backward()
        self.assertTrue(flags[-1])
        self.assertFalse(any(flags[:-1]))
Example #4
    def test_fp16(self):
        store = c10d.TCPStore('localhost', self.port, self.rank == 0)
        process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                              self.world_size)

        gpus = gpus_for_rank(self.world_size)[self.rank]
        model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half()
        nn.init.constant_(model.weight, 1)
        ddp_model = DistributedDataParallel(
            model,
            device_ids=[gpus[0]],
            process_group=process_group,
            bucket_cap_mb=1,
        )

        # Input 2**15, so that the gradients will overflow with a
        # world_size of 2, unless we normalize the gradient by the
        # world_size before the reduction
        input = torch.Tensor([[2**15]]).cuda(gpus[0]).half()

        # Step model
        ddp_model.train()
        output = ddp_model(input)
        loss = output.sum()
        loss.backward()

        self.assertFalse(
            any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
Example #5
    def train(self, model, data):
        torch.manual_seed(0)
        model = model.cuda(self.rank)
        for i in range(len(data)):
            data[i][0] = data[i][0].cuda(self.rank)
            data[i][1] = data[i][1].cuda(self.rank)
        torch.cuda.synchronize(self.rank)

        process_group_size = self.trainer_count

        store = c10d.FileStore("/tmp/tmpn_k_8so02", process_group_size)

        process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                              process_group_size)

        ddp_model = DDP(model,
                        device_ids=[self.rank],
                        process_group=process_group)

        hook_state = self.HookState(self, process_group)

        ddp_model.register_comm_hook(hook_state, DdpNcclTrainer.hook)

        criterion = nn.CrossEntropyLoss().cuda(self.rank)

        optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4)

        def epoch_key(epoch, index):
            return f"{epoch},{index}"

        for epoch in range(self.epochs):
            for index, batch in enumerate(data):
                hook_state.next_batch_state()
                input, target = batch[0], batch[1]

                self.record_batch_start(epoch_key(epoch, index))

                optimizer.zero_grad()

                self.record_forward_start(epoch_key(epoch, index))

                out = ddp_model(input)

                self.record_forward_end(epoch_key(epoch, index))

                loss = criterion(out, target)

                self.record_backward_start(epoch_key(epoch, index))

                loss.backward()

                self.record_backward_end(epoch_key(epoch, index))

                optimizer.step()

                self.record_batch_end(epoch_key(epoch, index))

        torch.cuda.synchronize(self.rank)
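
The DdpNcclTrainer.hook registered in the snippet above is not included here.
For reference, a minimal allreduce-averaging hook with the same
(state, bucket) -> Future contract could look like the sketch below; it is
illustrative only (not the benchmark's actual hook) and assumes the hook state
exposes the process group as state.process_group:

def allreduce_average_hook(state, bucket):
    # Average the bucket's flattened gradients across all ranks and return a
    # Future that resolves to the reduced tensor, which is what DDP expects.
    pg = state.process_group
    tensor = bucket.buffer()
    tensor.div_(pg.size())
    fut = pg.allreduce([tensor]).get_future()
    return fut.then(lambda f: f.value()[0])
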
Example #6
 def test_nccl_backend(self):
     store = c10d.FileStore(self.file.name)
     process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                           self.world_size)
     gpus = gpus_for_rank(self.world_size)[self.rank]
     self._test_ddp_with_process_group(process_group, gpus)
     self._test_ddp_with_process_group(
         process_group,
         list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
Example #7
 def test_nccl_backend(self):
     store = c10d.TCPStore('localhost', self.port, self.is_master)
     process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                           self.world_size)
     gpus = gpus_for_rank(self.world_size)[self.rank]
     self._test_ddp_with_process_group(process_group, gpus)
     self._test_ddp_with_process_group(
         process_group,
         list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
Example #8
    def test_allreduce_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        def allreduce(tensors, op):
            opts = c10d.AllreduceOptions()
            opts.reduceOp = op
            work = pg.allreduce(tensors, opts)
            work.wait()

        # Sum
        tensors = []
        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i + 1]).cuda(i))

        allreduce(tensors, c10d.ReduceOp.SUM)

        for i in range(self.num_gpus):
            self.assertEqual(
                torch.Tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]),
                tensors[i])

        # Product
        tensors = []
        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i + 1]).cuda(i))

        allreduce(tensors, c10d.ReduceOp.PRODUCT)

        for i in range(self.num_gpus):
            self.assertEqual(
                torch.Tensor([float(math.factorial(self.num_gpus))]),
                tensors[i])

        # Min
        tensors = []
        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i + 1]).cuda(i))

        allreduce(tensors, c10d.ReduceOp.MIN)

        for i in range(self.num_gpus):
            self.assertEqual(torch.Tensor([1.0]), tensors[i])

        # Max
        tensors = []
        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i + 1]).cuda(i))

        allreduce(tensors, c10d.ReduceOp.MAX)

        for i in range(self.num_gpus):
            self.assertEqual(torch.Tensor([self.num_gpus]), tensors[i])
Example #9
 def _nccl_init(self, nccl_addr, nccl_ip, nccl_port):
     self.nccl_ip, self.nccl_addr, self.nccl_port = nccl_ip, nccl_addr, nccl_port
     print('Rank {} calling init_process_group. Addr: {}'.format(self.rank, nccl_addr))
     # from https://github.com/pytorch/pytorch/blob/master/test/simulate_nccl_errors.py
     store = dist.TCPStore(self.nccl_ip, self.nccl_port, self.nb_learners, self.rank == 0)
     process_group = dist.ProcessGroupNCCL(store, self.rank, self.nb_learners)
     print('Rank {} initialized process group.'.format(self.rank))
     process_group.barrier()
     print('Rank {} process group barrier finished.'.format(self.rank))
     self.process_group = process_group
     # set optimizer process_group
     self.optimizer.set_process_group(self.process_group)
Example #10
    def test_ddp_comm_hook_allreduce_hook(self):
        """
        This unit test verifies that the case with the ``allreduce`` hook registered
        gives the same result as the case with no hook registered.
        """
        store = c10d.FileStore(self.file_name, self.world_size)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # No hook registered case, get the reference grads.
        reference_grads = self._get_grads(process_group, None)
        # Register hook case, get the hook grads.
        hook_grads = self._get_grads(process_group, DDPCommHookType.ALLREDUCE)

        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=0)
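
The _get_grads helper used here (and by the following hook tests) is not part
of the listing. A hypothetical sketch of such a helper is shown below; the
model, sizes, and seeding are illustrative assumptions, and
register_ddp_comm_hook is the registration utility from
torch.distributed.algorithms.ddp_comm_hooks:

    def _get_grads(self, process_group, hook_type=None):
        # Build a small DDP model on this rank's GPU, optionally register one
        # of the built-in comm hooks, run one forward/backward pass, and
        # return the gradient of the model's only parameter.
        device_id = gpus_for_rank(self.world_size)[self.rank][0]
        torch.manual_seed(0)
        model = nn.Linear(20, 10, bias=False)
        gpu_model = DistributedDataParallel(
            model.to(device_id),
            device_ids=[device_id],
            process_group=process_group,
        )
        if hook_type is not None:
            register_ddp_comm_hook(hook_type, gpu_model, process_group)
        input = torch.randn(8, 20, device=device_id)
        gpu_model(input).sum().backward()
        return next(gpu_model.parameters()).grad
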
Example #11
    def test_ddp_comm_hook_quantize_per_tensor_hook(self):
        """
        This unit test verifies that the case with the ``quantize per tensor`` hook
        registered gives a result close to the case with no hook registered.
        """
        store = dist.FileStore(self.file_name, self.world_size)
        process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

        # No hook registered case, get the reference grads.
        reference_grads = self._get_grads(process_group, None)
        # Register hook case, get the hook grads.
        hook_grads = self._get_grads(process_group, DDPCommHookType.QUANTIZE_PER_TENSOR)

        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
Example #12
    def test_ddp_comm_hook_fp16compress_hook(self):
        """
        This unit test verifies that the case with the ``fp16 compress`` hook
        registered gives a result close to the case with no hook registered.
        """
        store = dist.FileStore(self.file_name, self.world_size)
        process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

        # No hook registered case, get the reference grads.
        reference_grads = self._get_grads(process_group, None)
        # Register hook case, get the hook grads.
        hook_grads = self._get_grads(process_group, DDPCommHookType.FP16_COMPRESS)

        np.testing.assert_allclose(hook_grads, reference_grads, rtol=1e-5, atol=1e-4)
Example #13
    def test_sync_reduction(self):
        # Set up process group.
        store = c10d.FileStore(self.file.name)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        # Get this process' split of devices.
        devices = gpus_for_rank(self.world_size)[self.rank]
        grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                       (self.rank + 1)).chunk(5)
                       for d in devices]
        work, local_grad_sum = c10d._queue_reduction(process_group,
                                                     grads_batch,
                                                     devices)
        c10d._sync_reduction(work, grads_batch[0], local_grad_sum)
        # The expected result of the allreduce should be the average
        self.assertEqual(grads_batch[0], (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
Example #14
def run_trainer(args, extra_args, model, data, rank, server_rref):
    trainer_class = get_benchmark_trainer_map()[str(args.trainer)]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(store, rank, trainer_count)
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(store, rank, trainer_count)
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(rank, args.ntrainer + args.ncudatrainer,
                            process_group, use_cuda_rpc, server_rref,
                            args.backend, args.epochs, *trainer_args)
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]
Example #15
    def test_broadcast_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        def broadcast(xs, rootRank, rootTensor):
            opts = c10d.BroadcastOptions()
            opts.rootRank = rootRank
            opts.rootTensor = rootTensor
            work = pg.broadcast(xs, opts)
            work.wait()

        # for every root tensor
        for rt in range(self.num_gpus):
            tensors = []
            for i in range(self.num_gpus):
                tensors.append(torch.Tensor([i]).cuda(i))

            broadcast(tensors, self.rank, rt)

            for i in range(self.num_gpus):
                self.assertEqual(tensors[i], tensors[rt])
Example #16
    def test_ddp_comm_hook_noop_hook(self):
        """
        This unit test verifies that the case with the ``noop`` hook registered,
        followed by a subsequent allreduce, gives the same result as the case
        with no hook registered.
        """
        store = dist.FileStore(self.file_name, self.world_size)
        process_group = dist.ProcessGroupNCCL(store, self.rank,
                                              self.world_size)

        # No hook registered case, get the reference grads.
        reference_grads = self._get_grads(process_group, None)
        # Register hook case, get the hook grads.
        hook_grads = self._get_grads(process_group, DDPCommHookType.NOOP)
        # Apply a subsequent allreduce to average grads.
        hook_grads.div_(self.world_size)
        dist.all_reduce(hook_grads, group=process_group)

        torch.testing.assert_allclose(hook_grads,
                                      reference_grads,
                                      rtol=1e-5,
                                      atol=0)
Example #17
    def test_reduce_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        def reduce(xs, rootRank, rootTensor):
            opts = c10d.ReduceOptions()
            opts.rootRank = rootRank
            opts.rootTensor = rootTensor
            work = pg.reduce(xs, opts)
            work.wait()

        # for every root tensor
        for rt in range(self.num_gpus):
            tensors = []
            for i in range(self.num_gpus):
                tensors.append(torch.Tensor([i + 1]).cuda(i))

            reduce(tensors, self.rank, rt)

            self.assertEqual(
                torch.Tensor([float(self.num_gpus * (self.num_gpus + 1) / 2)]),
                tensors[rt])
Example #18
    def test_allgather_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

        def allgather(output_ts, input_ts):
            work = pg.allgather(output_ts, input_ts)
            work.wait()

        tensors = []
        output_ts = [[] for _ in range(self.num_gpus)]

        for idx, ls in enumerate(output_ts):
            for _ in range(self.world_size * self.num_gpus):
                ls.append(torch.Tensor([0]).cuda(idx))

        for i in range(self.num_gpus):
            tensors.append(torch.Tensor([i]).cuda(i))

        allgather(output_ts, tensors)

        # Verification
        for device_ts in output_ts:
            for s_idx, t in enumerate(device_ts):
                self.assertEqual(torch.Tensor([s_idx]), t)
Example #19
 def test_nccl_backend(self):
     store = c10d.TCPStore('localhost', self.port, self.is_master)
     process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
     self._test_ddp_with_process_group(process_group)
Example #20
    parser = argparse.ArgumentParser(
        description='Simple script to simulate NCCL errors. The script is '
        'supposed to be run on multiple different nodes simultaneously with '
        'appropriate rank and world_size. The script runs an allreduce() on '
        'the rank 0 node and aborts all the other nodes to simulate an error '
        'in NCCL')
    parser.add_argument('addr',
                        help='address of the master node to connect to.')
    parser.add_argument('port', help='port of the master node to connect to.')
    parser.add_argument('rank', help='rank of this node')
    parser.add_argument('world_size', help='number of nodes in process group')
    args = parser.parse_args()
    rank = int(args.rank)
    world_size = int(args.world_size)
    port = int(args.port)

    store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
    logging.info('Running first allreduce')
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()
    if rank == 0:
        logging.info('Running second allreduce only on rank 0')
        work = process_group.allreduce(torch.rand(10).cuda(rank))
        logging.info('Waiting for allreduce to complete...')
        work.wait()
        logging.info('Second allreduce successful: {}'.format(
            work.is_success()))
    else:
        logging.info('Aborting all other ranks.')
        os.abort()
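
Assuming the script above is saved as simulate_nccl_errors.py (the filename the
comment in an earlier snippet points to), it is launched on every node with
matching world_size and a distinct rank, e.g.
python simulate_nccl_errors.py <master_addr> 29500 0 2 on the first node and
python simulate_nccl_errors.py <master_addr> 29500 1 2 on the second. All ranks
complete the first allreduce; rank 0 then issues a second allreduce alone while
the other ranks abort, which is the NCCL failure the script simulates.
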
Example #21
 def _init_pg_nccl(cls, rank, filename, world_size):
     store = c10d.FileStore(filename, world_size)
     return c10d.ProcessGroupNCCL(store, rank, world_size)
Example #22
def run_trainer(
    args, extra_args, data, rank, server_rref
):
    r"""
    A function that obtains a trainer instance and calls
    the train method.
    Args:
        args (parser): benchmark configurations
        extra_args (dict): configurations added by the user
        data (list): training samples
        rank (int): process number in the world
        server_rref (dict): a dictionary containing server RRefs
    """
    trainer_class = trainer_map[args.trainer]
    if extra_args is not None:
        trainer_args = extra_args.values()
    else:
        trainer_args = []
    trainer_count = args.ntrainer + args.ncudatrainer
    store = c10d.FileStore(args.filestore, trainer_count)
    if args.backend == "gloo":
        process_group = c10d.ProcessGroupGloo(
            store, rank, trainer_count
        )
    elif args.backend == "nccl":
        process_group = c10d.ProcessGroupNCCL(
            store, rank, trainer_count
        )
    elif args.backend == "multi":
        process_group = c10d.ProcessGroupNCCL(
            store, rank, trainer_count
        )
        if not c10d.is_initialized():
            c10d.init_process_group(backend="gloo", rank=rank, world_size=trainer_count)

    model = load_model(args)
    preprocess_data = preprocess_data_map[args.preprocess_data]
    create_criterion = criterion_map[args.create_criterion]
    create_ddp_model = ddp_model_map[args.create_ddp_model]
    iteration_step = iteration_step_map[args.iteration_step]
    hook_state_class = hook_state_map[args.hook_state]
    hook = ddp_hook_map[args.ddp_hook]
    # check if this a cudatrainer
    use_cuda_rpc = rank >= args.ntrainer
    trainer = trainer_class(
        process_group,
        use_cuda_rpc,
        server_rref,
        args.backend,
        args.epochs,
        preprocess_data,
        create_criterion,
        create_ddp_model,
        hook_state_class,
        hook,
        iteration_step,
        *trainer_args
    )
    trainer.train(model, data)
    metrics = trainer.get_metrics()
    return [rank, metrics]