def test_broadcast_ops(self):
    # Checks ProcessGroupGloo.broadcast for every combination of root rank
    # and root tensor index, then the single-tensor convenience overload.
    file_store = c10d.FileStore(self.file.name)
    group = c10d.ProcessGroupGloo(file_store, self.rank, self.size)

    def broadcast(tensors, root_rank, root_tensor):
        # Issue one broadcast with explicit options and block until done.
        options = c10d.BroadcastOptions()
        options.rootRank = root_rank
        options.rootTensor = root_tensor
        group.broadcast(tensors, options).wait()

    # Each rank takes a turn as root, and so does each of the two tensor
    # slots on that rank.
    for root_rank in range(self.size):
        for root_tensor in range(2):
            # Slot k on this rank starts out holding rank * size + k, so
            # the value that arrives identifies both the root rank and the
            # root tensor slot.
            tensors = [
                torch.Tensor([self.rank * self.size + offset])
                for offset in (0.0, 1.0)
            ]
            broadcast(tensors, root_rank, root_tensor)
            for received in tensors:
                self.assertEqual(
                    torch.Tensor([root_rank * self.size + root_tensor]),
                    received,
                )

    # The overload taking a bare tensor and a `root` keyword.
    single = torch.Tensor([self.rank + 1.0])
    pending = group.broadcast(single, root=0)
    pending.wait()
    self.assertEqual(torch.Tensor([1.0]), single)
def test_allreduce_ops(self):
    # Checks ProcessGroupGloo.allreduce for SUM, PRODUCT, MIN and MAX, then
    # the single-tensor convenience overload. Every rank contributes the
    # one-element tensor [rank + 1].
    file_store = c10d.FileStore(self.file.name)
    group = c10d.ProcessGroupGloo(file_store, self.rank, self.size)

    def allreduce(tensor, op):
        # Issue one allreduce with an explicit reduce op and block until done.
        options = c10d.AllreduceOptions()
        options.reduceOp = op
        group.allreduce([tensor], options).wait()

    # 1 + 2 + ... + size, shared by the explicit SUM case and the overload.
    series_sum = float(self.size * (self.size + 1) / 2)

    # (reduce op, expected result); the order is the order in which the
    # collectives are issued, so it must be the same on every rank.
    cases = [
        (c10d.ReduceOp.SUM, torch.Tensor([series_sum])),
        (c10d.ReduceOp.PRODUCT, torch.Tensor([float(math.factorial(self.size))])),
        (c10d.ReduceOp.MIN, torch.Tensor([1.0])),
        (c10d.ReduceOp.MAX, torch.Tensor([self.size])),
    ]
    for op, expected in cases:
        value = torch.Tensor([self.rank + 1.0])
        allreduce(value, op)
        self.assertEqual(expected, value)

    # The overload taking a bare tensor (defaults to using sum).
    value = torch.Tensor([self.rank + 1.0])
    pending = group.allreduce(value)
    pending.wait()
    self.assertEqual(torch.Tensor([series_sum]), value)
def test_sync_params_no_buffers(self):
    # Checks that c10d._sync_params copies the parameters held in
    # parameter_data[0] to every other per-device replica when buffer
    # broadcasting is disabled.

    # Set up process group.
    store = c10d.TCPStore('localhost', self.port, self.is_master)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    # Use every device that gpus_for_rank assigns to this rank (data is
    # small, so should be fine).
    devices = gpus_for_rank(self.world_size)[self.rank]

    # The source parameters go on this rank's first device, so that
    # parameter_data[k] lives on devices[k] for every k. Hard-coding
    # 'cuda:0' here is only correct on a rank whose first device is GPU 0.
    # NOTE(review): presumably _sync_params treats parameter_data[0] as the
    # source on devices[0] -- confirm against its implementation.
    target = torch.arange(
        10, dtype=torch.float64, device=torch.device('cuda', devices[0])).chunk(5)
    parameter_data = [target]
    parameter_data += [
        torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
        for d in devices[1:]
    ]

    # One independent empty buffer list per replica; `[[]] * n` would make
    # every entry the same list object.
    buffer_data = [[] for _ in parameter_data]

    c10d._sync_params(
        process_group,
        parameter_data=parameter_data,
        buffer_data=buffer_data,
        devices=devices,
        broadcast_bucket_size=10,
        broadcast_buffers=False)

    # Every replica must now match the source chunk by chunk.
    for device_data in parameter_data:
        for i, parameter in enumerate(device_data):
            self.assertEqual(parameter, target[i])
def test_dist_broadcast_coalesced(self):
    # Checks that c10d._dist_broadcast_coalesced leaves the non-master
    # processes holding the master's tensors.

    # Process group over a TCP store, using Gloo on the loopback interface.
    tcp_store = c10d.TCPStore('localhost', self.port, self.is_master)
    gloo_options = c10d.ProcessGroupGloo.Options()
    gloo_options.devices = [
        c10d.ProcessGroupGloo.create_tcp_device(interface="lo"),
    ]
    process_group = c10d.ProcessGroupGloo(
        tcp_store, self.rank, self.world_size, gloo_options)

    device = torch.device('cuda')
    target = torch.arange(10, dtype=torch.float64, device=device).chunk(5)

    # The master starts with the tensors everyone should end up with; every
    # other process starts with zeros that the broadcast should overwrite.
    tensors = target if self.is_master else torch.zeros(10, device=device).chunk(5)

    c10d._dist_broadcast_coalesced(
        tensors,
        buffer_size=10,
        process_group=process_group)

    # Only receivers are checked; the master's tensors are the target itself.
    if self.is_master:
        return
    self.assertEqual(tensors, target)
def test_send_recv_all_to_all(self):
    # Every rank sends its own rank number to every other rank and receives
    # one value from each of them; all transfers are issued before any wait.
    file_store = c10d.FileStore(self.file.name)
    group = c10d.ProcessGroupGloo(file_store, self.rank, self.world_size, self.opts())

    # Preallocate one input and one output tensor per rank. The slot for our
    # own rank is never used; outputs start at -1 so an unfilled slot shows.
    all_ranks = range(self.world_size)
    inputs = [torch.Tensor([self.rank]) for _ in all_ranks]
    outputs = [torch.Tensor([-1]) for _ in all_ranks]

    # Everyone except ourselves.
    peers = [peer for peer in all_ranks if peer != self.rank]

    # Issue all sends, then all recvs.
    send_work = [group.send([inputs[peer]], peer) for peer in peers]
    recv_work = [group.recv([outputs[peer]], peer) for peer in peers]

    # Wait for the sends to complete first, then the recvs.
    for pending in send_work + recv_work:
        pending.wait()

    # Every output other than our own must contain the sender's rank.
    for peer in peers:
        self.assertEqual(torch.Tensor([peer]), outputs[peer])
def test_gloo_backend(self):
    # Runs the shared DDP check (self._test_ddp_with_process_group) on a
    # Gloo process group that uses the loopback interface.

    # The third TCPStore argument is true only on rank 0.
    is_rank_zero = self.rank == 0
    tcp_store = c10d.TCPStore('localhost', self.port, is_rank_zero)

    loopback = c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
    gloo_options = c10d.ProcessGroupGloo.Options()
    gloo_options.devices = [loopback]

    gloo_group = c10d.ProcessGroupGloo(
        tcp_store, self.rank, self.world_size, gloo_options)
    self._test_ddp_with_process_group(gloo_group)
def test_sync_params_with_buffers(self):
    # Checks that c10d._sync_params, with buffer broadcasting enabled,
    # leaves every per-device parameter replica and every per-device buffer
    # replica on every process equal to the target chunks.

    # Set up process group.
    store = c10d.TCPStore('localhost', self.port, self.is_master)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [
        c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
    ]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    devices = gpus_for_rank(self.world_size)[self.rank]

    # The source tensors go on this rank's first device, so that entry k of
    # parameter_data / buffer_data lives on devices[k] for every k.
    # Hard-coding 'cuda:0' here is only correct on a rank whose first device
    # is GPU 0.
    # NOTE(review): presumably _sync_params treats entry 0 as the source on
    # devices[0] -- confirm against its implementation.
    target = torch.arange(
        10, dtype=torch.float64, device=torch.device('cuda', devices[0])).chunk(5)
    parameter_data = [target]
    parameter_data += [
        torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
        for d in devices[1:]
    ]

    # sync_params should do a dist_broadcast for buffers, so we only populate
    # the master buffers and then check that other processes' tensors end up
    # matching.
    if self.is_master:
        buffer_data = [target]
        buffer_data += [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices[1:]
        ]
    else:
        # Non-master processes start with zeros on all of their devices.
        buffer_data = [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices
        ]

    c10d._sync_params(
        process_group,
        parameter_data=parameter_data,
        buffer_data=buffer_data,
        devices=devices,
        broadcast_bucket_size=10,
        broadcast_buffers=True)

    # Every parameter replica must match the source chunk by chunk.
    for device_data in parameter_data:
        for i, parameter in enumerate(device_data):
            self.assertEqual(parameter, target[i])

    # Every buffer replica, on master and non-master alike, must match too.
    for device_data in buffer_data:
        for i, buffer in enumerate(device_data):
            self.assertEqual(buffer, target[i])