Example #1
    def test_broadcast_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupGloo(store, self.rank, self.size)

        def broadcast(xs, rootRank, rootTensor):
            opts = c10d.BroadcastOptions()
            opts.rootRank = rootRank
            opts.rootTensor = rootTensor
            work = pg.broadcast(xs, opts)
            work.wait()

        # Every rank is root once, every tensor index is root once
        for i in range(self.size):
            for j in range(2):
                xs = [
                    torch.Tensor([self.rank * self.size + 0.0]),
                    torch.Tensor([self.rank * self.size + 1.0]),
                ]

                broadcast(xs, i, j)
                self.assertEqual(torch.Tensor([i * self.size + j]), xs[0])
                self.assertEqual(torch.Tensor([i * self.size + j]), xs[1])

        # Test overloaded convenience function
        x = torch.Tensor([self.rank + 1.0])
        work = pg.broadcast(x, root=0)
        work.wait()
        self.assertEqual(torch.Tensor([1.0]), x)
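
The assertions above follow directly from how the test seeds its tensors: every rank starts with [rank * size + 0, rank * size + 1], so after broadcasting root rank i's tensor at index j, every tensor on every rank equals i * size + j. A plain-Python sketch of that expectation (no process group needed; simulate_broadcast is a hypothetical stand-in and the world size of 4 is an arbitrary assumption for illustration):

# Minimal sketch of the value the assertions above check; not a c10d API.
def simulate_broadcast(size, root_rank, root_tensor):
    ranks = [[rank * size + 0.0, rank * size + 1.0] for rank in range(size)]
    value = ranks[root_rank][root_tensor]         # tensor owned by the root
    return [[value, value] for _ in range(size)]  # every slot on every rank

size = 4  # assumed world size, for illustration only
for i in range(size):
    for j in range(2):
        assert all(pair == [i * size + j] * 2
                   for pair in simulate_broadcast(size, i, j))
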
Example #2
    def test_allreduce_ops(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupGloo(store, self.rank, self.size)

        def allreduce(x, op):
            opts = c10d.AllreduceOptions()
            opts.reduceOp = op
            work = pg.allreduce([x], opts)
            work.wait()

        # Sum
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.SUM)
        self.assertEqual(torch.Tensor([float(self.size * (self.size + 1) / 2)]), x)

        # Product
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.PRODUCT)
        self.assertEqual(torch.Tensor([float(math.factorial(self.size))]), x)

        # Min
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.MIN)
        self.assertEqual(torch.Tensor([1.0]), x)

        # Max
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, c10d.ReduceOp.MAX)
        self.assertEqual(torch.Tensor([self.size]), x)

        # Test overloaded convenience function (defaults to using sum)
        x = torch.Tensor([self.rank + 1.0])
        work = pg.allreduce(x)
        work.wait()
        self.assertEqual(torch.Tensor([float(self.size * (self.size + 1) / 2)]), x)
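
The expected values are plain arithmetic over the per-rank contribution of rank + 1: summing 1..size gives size * (size + 1) / 2, the product is size!, the minimum is 1, and the maximum is size. A quick plain-Python check, assuming a hypothetical world size of 4:

import math

# Verify the arithmetic behind the allreduce assertions above; world size 4
# is an arbitrary choice for illustration.
size = 4
contributions = [rank + 1.0 for rank in range(size)]  # what each rank holds

assert sum(contributions) == size * (size + 1) / 2        # SUM
assert math.prod(contributions) == math.factorial(size)   # PRODUCT
assert min(contributions) == 1.0                           # MIN
assert max(contributions) == float(size)                   # MAX
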
Example #3
    def test_sync_params_no_buffers(self):
        # Set up process group.
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

        # Use all available devices on every process here (data is small, so should be fine).
        devices = gpus_for_rank(self.world_size)[self.rank]
        target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
        parameter_data = [target]
        parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]]
        buffer_data = [[]] * len(parameter_data)

        c10d._sync_params(
            process_group,
            parameter_data=parameter_data,
            buffer_data=buffer_data,
            devices=devices,
            broadcast_bucket_size=10,
            broadcast_buffers=False)

        for device_data in parameter_data:
            for i, parameter in enumerate(device_data):
                self.assertEqual(parameter, target[i])
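
c10d._sync_params is an internal helper, so only its observable effect is checked here: after the call, every per-device replica in parameter_data matches target. A rough sketch of that effect, reusing the broadcast primitive from Example #1 and the test's own variables (an illustration only, not c10d's actual implementation):

# Hypothetical sketch of the effect asserted above, not the real _sync_params:
# broadcast rank 0's tensors across processes, then mirror them onto the
# remaining local device replicas.
opts = c10d.BroadcastOptions()
opts.rootRank = 0
opts.rootTensor = 0
for i, tensor in enumerate(parameter_data[0]):
    process_group.broadcast([tensor], opts).wait()  # sync across processes
    for replica in parameter_data[1:]:
        replica[i].copy_(tensor)                     # mirror across devices
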
Example #4
    def test_dist_broadcast_coalesced(self):
        # Set up process group.
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

        device = torch.device('cuda')

        target = torch.arange(10, dtype=torch.float64, device=device).chunk(5)

        if self.is_master:
            # All processes should have these tensors in the end.
            tensors = target
        else:
            # Non-master processes start with zero-filled tensors and should
            # be filled with the tensors from the master.
            tensors = torch.zeros(10, device=device).chunk(5)

        c10d._dist_broadcast_coalesced(
            tensors,
            buffer_size=10,
            process_group=process_group)

        if not self.is_master:
            self.assertEqual(tensors, target)
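
"Coalesced" here means the small tensors are packed into flat buckets of at most buffer_size elements, each bucket is broadcast from the master in a single collective call, and the slices are copied back out afterwards. A simplified single-bucket sketch of that idea, using the test's tensors and process_group and assuming the master is rank 0 (the real helper additionally splits the data into multiple buckets; this is not c10d's implementation):

# Single-bucket sketch of coalesced broadcast, for illustration only.
flat = torch.cat([t.reshape(-1) for t in tensors])  # pack into one buffer
opts = c10d.BroadcastOptions()
opts.rootRank = 0                                    # assumed master rank
opts.rootTensor = 0
process_group.broadcast([flat], opts).wait()         # one collective call
offset = 0
for t in tensors:                                    # unpack the slices
    t.copy_(flat[offset:offset + t.numel()].reshape(t.shape))
    offset += t.numel()
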
Example #5
    def test_send_recv_all_to_all(self):
        store = c10d.FileStore(self.file.name)
        pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

        # Preallocate tensors for input/output
        inputs = [torch.Tensor([self.rank]) for _ in range(self.world_size)]
        outputs = [torch.Tensor([-1]) for _ in range(self.world_size)]

        # Issue sends
        send_work = []
        for i in range(self.world_size):
            if i == self.rank:
                continue
            send_work.append(pg.send([inputs[i]], i))

        # Issue recvs
        recv_work = []
        for i in range(self.world_size):
            if i == self.rank:
                continue
            recv_work.append(pg.recv([outputs[i]], i))

        # Wait for sends to complete
        for work in send_work:
            work.wait()

        # Wait for recvs to complete
        for work in recv_work:
            work.wait()

        # Test that every output other than our own contains the respective rank
        for i in range(self.world_size):
            if i == self.rank:
                continue
            self.assertEqual(torch.Tensor([i]), outputs[i])
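
Both pg.send and pg.recv return work handles immediately, which is why the test can post every send and every recv before waiting on any of them. The same all-to-all exchange can be sketched with the public torch.distributed point-to-point API, assuming a default process group has already been initialized with dist.init_process_group:

import torch
import torch.distributed as dist

# Sketch of the same exchange via the public API; assumes an initialized
# default process group (e.g. dist.init_process_group("gloo", ...)).
rank, world_size = dist.get_rank(), dist.get_world_size()
inputs = [torch.tensor([float(rank)]) for _ in range(world_size)]
outputs = [torch.tensor([-1.0]) for _ in range(world_size)]

pending = []
for peer in range(world_size):
    if peer == rank:
        continue
    pending.append(dist.isend(inputs[peer], dst=peer))   # non-blocking send
    pending.append(dist.irecv(outputs[peer], src=peer))  # non-blocking recv
for work in pending:
    work.wait()
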
Example #6
    def test_gloo_backend(self):
        store = c10d.TCPStore('localhost', self.port, self.rank == 0)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [
            c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
        ]
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size, options)
        self._test_ddp_with_process_group(process_group)
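
_test_ddp_with_process_group is a helper defined elsewhere in the test suite; the point of this example is that DistributedDataParallel accepts an explicitly constructed process group through its process_group argument. A minimal sketch of that wrapping step (the model and device id are placeholders; only the process_group argument mirrors the snippet above):

import torch.nn as nn

# Illustrative only: wrap a toy model in DDP using the ProcessGroupGloo
# instance built above instead of the globally initialized default group.
model = nn.Linear(10, 10).cuda(0)
ddp_model = nn.parallel.DistributedDataParallel(
    model,
    device_ids=[0],
    process_group=process_group,
)
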
Example #7
    def test_sync_params_with_buffers(self):
        # Set up process group.
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [
            c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
        ]
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size, options)

        devices = gpus_for_rank(self.world_size)[self.rank]
        target = torch.arange(10, dtype=torch.float64,
                              device='cuda:0').chunk(5)
        parameter_data = [target]
        parameter_data += [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices[1:]
        ]

        # sync_params should do a dist_broadcast for buffers, so we only populate the master buffers and
        # then check that other processes' tensors end up matching.

        if self.is_master:
            buffer_data = [target]
            buffer_data += [
                torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                for d in devices[1:]
            ]
        else:
            buffer_data = [
                torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                for d in devices
            ]

        c10d._sync_params(process_group,
                          parameter_data=parameter_data,
                          buffer_data=buffer_data,
                          devices=devices,
                          broadcast_bucket_size=10,
                          broadcast_buffers=True)

        for device_data in parameter_data:
            for i, parameter in enumerate(device_data):
                self.assertEqual(parameter, target[i])

        for device_data in buffer_data:
            for i, buffer in enumerate(device_data):
                self.assertEqual(buffer, target[i])