def test_allreduce_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    def allreduce(x, op):
        opts = c10d.AllreduceOptions()
        opts.reduceOp = op
        work = pg.allreduce([x], opts)
        work.wait()

    # Sum
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.SUM)
    self.assertEqual(torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x)

    # Product
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.PRODUCT)
    self.assertEqual(torch.Tensor([float(math.factorial(self.world_size))]), x)

    # Min
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.MIN)
    self.assertEqual(torch.Tensor([1.0]), x)

    # Max
    x = torch.Tensor([self.rank + 1.0])
    allreduce(x, c10d.ReduceOp.MAX)
    self.assertEqual(torch.Tensor([self.world_size]), x)

    # Test overloaded convenience function (defaults to using sum)
    x = torch.Tensor([self.rank + 1.0])
    work = pg.allreduce(x)
    work.wait()
    self.assertEqual(torch.Tensor([float(self.world_size * (self.world_size + 1) / 2)]), x)
def test_duplicated_names(self):
    store = dist.FileStore(self.file.name, self.world_size)
    dist.init_process_group(backend="gloo", rank=self.rank,
                            world_size=self.world_size, store=store)
    with self.assertRaisesRegex(RuntimeError, "is not unique"):
        dist.init_model_parallel("duplicated_name")
    dist.join_rpc()
def wrapper(self):
    store = dist.FileStore(self.file.name, self.world_size)
    dist.init_process_group(backend='gloo', rank=self.rank,
                            world_size=self.world_size, store=store)
    dist.init_rpc('worker{}'.format(self.rank))
    func(self)
    dist.join_rpc()
def test_all_gather_base(self):
    store = c10d.FileStore(self.file_name, self.world_size)
    c10d.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend='nccl')
    device = torch.device(f"cuda:{self.rank}")
    x = torch.ones(5, 5, device=device) + self.rank
    x.requires_grad = True
    output = torch.empty(5 * self.world_size, 5, device=device)
    output = torch.distributed.nn.functional._all_gather_base(output, x)
    self.assertEqual(output.size(), torch.Size((5 * self.world_size, 5)))
    for idx in range(self.world_size):
        self.assertEqual(output[5 * idx:5 * (idx + 1)],
                         torch.ones(5, 5, device=device) + idx)
    y = torch.sum(output.view(self.world_size, 5, 5), axis=0)
    z = y.sin().sum()
    z.backward()
    x_s = 2 * (3 * torch.ones(5, 5, device=device)).cos()
    self.assertEqual(x.grad, x_s)
def test_send_recv_all_to_all(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    # Preallocate tensors for input/output
    inputs = [torch.Tensor([self.rank]) for _ in range(self.world_size)]
    outputs = [torch.Tensor([-1]) for _ in range(self.world_size)]

    # Issue sends
    send_work = []
    for i in range(self.world_size):
        if i == self.rank:
            continue
        send_work.append(pg.send([inputs[i]], i, 0))

    # Issue recvs
    recv_work = []
    for i in range(self.world_size):
        if i == self.rank:
            continue
        recv_work.append(pg.recv([outputs[i]], i, 0))

    # Wait for sends to complete
    for work in send_work:
        work.wait()

    # Wait for recvs to complete
    for work in recv_work:
        work.wait()

    # Test that every output other than our own contains the respective rank
    for i in range(self.world_size):
        if i == self.rank:
            continue
        self.assertEqual(torch.Tensor([i]), outputs[i])
def _create_wrapper_pg(self, with_new_group=False, timeout=10.0):
    store = c10d.FileStore(self.file_name, self.world_size)
    c10d.init_process_group(
        backend="nccl",
        rank=self.rank,
        world_size=self.world_size,
        store=store,
        timeout=timedelta(seconds=timeout),
    )
    if with_new_group:
        pg = c10d.new_group(backend="nccl", timeout=timedelta(seconds=timeout))
    else:
        _pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size,
                                    timeout=timedelta(seconds=timeout))
        pg = c10d._create_process_group_wrapper(
            _pg,
            "unused",
            store,
            self.rank,
            self.world_size,
            timeout=timeout,
        )
    return pg
def test_invalid_names(self):
    store = dist.FileStore(self.file.name, self.world_size)
    dist.init_process_group(backend="gloo", rank=self.rank,
                            world_size=self.world_size, store=store)

    with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
        dist.init_model_parallel(self_name="abc*")

    with self.assertRaisesRegex(RuntimeError, "Worker name must match"):
        dist.init_model_parallel(self_name=" ")

    with self.assertRaisesRegex(RuntimeError, "must be non-empty"):
        dist.init_model_parallel(self_name="")

    # If the number in the message does not match, it is likely that the
    # value of MAX_NAME_LEN in RPC WorkerId has changed.
    with self.assertRaisesRegex(RuntimeError, "shorter than 128"):
        dist.init_model_parallel(self_name="".join(["a" for _ in range(500)]),
                                 backend=BACKEND,
                                 self_rank=self.rank,
                                 init_method=RPC_INIT_URL)

    dist.join_rpc()
def test_queue_reduction(self):
    # Set up process group.
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # Get this process' split of devices.
    devices = gpus_for_rank(self.world_size)[self.rank]
    grads_batch = [(torch.ones(10, device=torch.device('cuda', d)) *
                    (self.rank + 1)).chunk(5)
                   for d in devices]

    work, local_grad_sum = c10d._queue_reduction(process_group,
                                                 grads_batch,
                                                 devices)
    # The first return value should be the allreduce work item.
    self.assertTrue(isinstance(work, c10d.Work))
    # The second return value will be the finished allreduced gradients.
    self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

    # Wait for the allreduce to finish.
    work.wait()

    # The expected result of the allreduce should be the average.
    self.assertEqual(local_grad_sum,
                     torch.ones(10) * (self.world_size + 1) / 2.0)
def test_scatter(self):
    store = c10d.FileStore(self.file_name, self.world_size)
    # This is required because these functions call directly into dist and
    # need the world to be initialized.
    c10d.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend='gloo')
    device = torch.device(f"cuda:{self.rank}")
    x0 = torch.ones(5, 5, device=device)
    x1 = torch.ones(5, 5, device=device) + 1
    x0.requires_grad = True
    x1.requires_grad = True

    y = torch.distributed.nn.scatter([x0, x1], 1)
    if self.rank == 1:
        self.assertEqual(y, 1 + torch.ones(5, 5, device=device))
    elif self.rank == 0:
        self.assertEqual(y, torch.ones(5, 5, device=device))

    z = y.sin().sum()
    z.backward()

    # Test gradient
    if self.rank == 1:
        x0_s = torch.ones(5, 5, device=device).cos()
        x1_s = (2 * torch.ones(5, 5, device=device)).cos()
        self.assertEqual(x0.grad, x0_s)
        self.assertEqual(x1.grad, x1_s)
    if self.rank == 0:
        self.assertEqual(x0.grad, torch.zeros(5, 5, device=device))
def test_reduce_scatter_non_contiguous(self):
    store = c10d.FileStore(self.file_name, self.world_size)
    # This is required because these functions call directly into dist and
    # need the world to be initialized.
    c10d.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend='nccl')
    device = torch.device(f"cuda:{self.rank}")

    class NonContiguousGrad(torch.autograd.Function):
        @staticmethod
        def forward(ctx, input):
            return input

        @staticmethod
        def backward(ctx, grad_output):
            # Make grad non-contiguous
            return grad_output.clone().transpose(0, 1)

    x0 = torch.rand(5, 5, device=device, requires_grad=True)
    x1 = torch.rand(5, 5, device=device, requires_grad=True)
    y = torch.empty(5, 5, device=device)
    y = torch.distributed.nn.reduce_scatter(y, [x0, x1])
    NonContiguousGrad.apply(y).sum().backward()
def test_fp16(self):
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    gpus = gpus_for_rank(self.world_size)[self.rank]
    model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half()
    nn.init.constant_(model.weight, 1)
    ddp_model = DistributedDataParallel(
        model,
        device_ids=[gpus[0]],
        process_group=process_group,
        bucket_cap_mb=1,
    )

    # Input 2**15, so that the gradients will overflow with a
    # world_size of 2, unless we normalize the gradient by the
    # world_size before the reduction
    input = torch.Tensor([[2**15]]).cuda(gpus[0]).half()

    # Step model
    ddp_model.train()
    output = ddp_model(input)
    loss = output.sum()
    loss.backward()

    self.assertFalse(
        any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
def test_dist_broadcast_coalesced(self):
    store = c10d.FileStore(self.file.name)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [
        c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
    ]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    device = torch.device('cuda')
    target = torch.arange(10, dtype=torch.float64, device=device).chunk(5)

    if self.is_master:
        # All processes should have these tensors in the end.
        tensors = target
    else:
        # Non-master processes start with empty tensors and should be
        # filled with the tensors from the master.
        tensors = torch.zeros(10, device=device).chunk(5)

    c10d._dist_broadcast_coalesced(process_group, tensors, buffer_size=10)

    if not self.is_master:
        self.assertEqual(tensors, target)
def _test_base(self, net, inp, check_allclose=True):
    store = c10d.FileStore(self.file.name, self.world_size)
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size)
    ddp = nn.parallel.DistributedDataParallel(copy.deepcopy(net),
                                              process_group=process_group)

    net_opt = torch.optim.Adam(net.parameters(), lr=0.001)
    ddp_opt = torch.optim.Adam(ddp.parameters(), lr=0.001)

    for i, j in zip(ddp.parameters(), net.parameters()):
        self.assertTrue(i.allclose(j))

    for _ in range(10):
        net_out = net(*inp)
        ddp_out = ddp(*inp)

        net_out.sum().backward()
        ddp_out.sum().backward()

        net_opt.step()
        ddp_opt.step()

    if check_allclose:
        for i, j in zip(ddp.parameters(), net.parameters()):
            self.assertTrue(i.allclose(j))
def test_reduce_scatter(self):
    store = c10d.FileStore(self.file_name, self.world_size)
    # This is required because these functions call directly into dist and
    # need the world to be initialized.
    c10d.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend='nccl')
    device = torch.device(f"cuda:{self.rank}")
    x0 = torch.ones(5, 5, device=device) + self.rank
    x1 = torch.ones(5, 5, device=device) + self.rank + 1
    x0.requires_grad = True
    x1.requires_grad = True
    y = torch.empty_like(x0)

    expected = ((1 + self.world_size) * self.world_size / 2
                + self.world_size * self.rank)
    y = torch.distributed.nn.reduce_scatter(y, [x0, x1])
    self.assertEqual(y, torch.ones(5, 5, device=device) * expected)

    z = y.sin().sum()
    z.backward()
    expected_0 = (1 + self.world_size) * self.world_size / 2
    expected_1 = expected_0 + self.world_size
    x_s_0 = (expected_0 * torch.ones(5, 5, device=device)).cos()
    x_s_1 = (expected_1 * torch.ones(5, 5, device=device)).cos()
    self.assertEqual(x0.grad, x_s_0)
    self.assertEqual(x1.grad, x_s_1)
def test_gather(self):
    store = c10d.FileStore(self.file_name, self.world_size)
    # This is required because these functions call directly into dist and
    # need the world to be initialized.
    c10d.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend='gloo')
    device = torch.device(f"cuda:{self.rank}")
    x = torch.ones(5, 5, device=device) + self.rank
    x.requires_grad = True

    tensors = torch.distributed.nn.gather(x, 1)
    if self.rank == 1:
        for i, t in enumerate(tensors):
            self.assertEqual(t, torch.ones(5, 5, device=device) + i)
    elif self.rank == 0:
        for i, t in enumerate(tensors):
            zeros = torch.zeros(5, 5, device=device)
            self.assertEqual(t, zeros)

    y = torch.sum(torch.stack(tensors), axis=0)
    z = y.sin().sum()
    z.backward()

    # Test gradient
    x_s = 3 * torch.ones(5, 5, device=device)
    self.assertEqual(x.grad, x_s.cos())
def test_broadcast_ops(self):
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    def broadcast(xs, rootRank, rootTensor):
        opts = c10d.BroadcastOptions()
        opts.rootRank = rootRank
        opts.rootTensor = rootTensor
        work = pg.broadcast(xs, opts)
        work.wait()

    # Every rank is root once, every tensor index is root once
    for i in range(self.world_size):
        for j in range(2):
            xs = [
                torch.Tensor([self.rank * self.world_size + 0.0]),
                torch.Tensor([self.rank * self.world_size + 1.0]),
            ]

            broadcast(xs, i, j)
            self.assertEqual(torch.Tensor([i * self.world_size + j]), xs[0])
            self.assertEqual(torch.Tensor([i * self.world_size + j]), xs[1])

    # Test overloaded convenience function
    x = torch.Tensor([self.rank + 1.0])
    work = pg.broadcast(x, root=0)
    work.wait()
    self.assertEqual(torch.Tensor([1.0]), x)
def test_sync_params_with_buffers(self):
    store = c10d.FileStore(self.file.name)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    devices = gpus_for_rank(self.world_size)[self.rank]
    target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
    parameter_data = [target]
    parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                       for d in devices[1:]]

    # sync_params should do a dist_broadcast for buffers, so we only
    # populate the master buffers and then check that other processes'
    # tensors end up matching.
    if self.is_master:
        buffer_data = [target]
        buffer_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                        for d in devices[1:]]
    else:
        buffer_data = [torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                       for d in devices]

    c10d._sync_params(
        process_group,
        parameter_data=parameter_data,
        buffer_data=buffer_data,
        devices=devices,
        broadcast_bucket_size=10,
        broadcast_buffers=True)

    for device_data in parameter_data:
        for i, parameter in enumerate(device_data):
            self.assertEqual(parameter, target[i])

    for device_data in buffer_data:
        for i, buffer in enumerate(device_data):
            self.assertEqual(buffer, target[i])
def test_is_last_hook(self):
    store = dist.FileStore(self.file_name, self.world_size)
    process_group = dist.ProcessGroupNCCL(store, self.rank, self.world_size)

    def hook(flags, bucket):
        flags.append(bucket.is_last())
        fut = torch.futures.Future()
        fut.set_result(bucket.buffer())
        return fut

    flags = []
    device_id = gpus_for_rank(self.world_size)[self.rank][0]
    model = nn.Sequential(
        nn.Linear(2, 4000, bias=False),
        *[nn.Linear(4000, 4000, bias=False) for _ in range(10)])
    gpu_model = DistributedDataParallel(
        model.to(device_id),
        device_ids=[device_id],
        process_group=process_group,
    )
    gpu_model.register_comm_hook(state=flags, hook=hook)
    input = torch.randn(10, 2)
    gpu_model(input).sum().backward()
    self.assertTrue(flags[-1])
    self.assertFalse(any(flags[:-1]))
def dist_init(self, rank, world_size=-1):
    world_size = self.world_size if world_size < 1 else world_size
    store = dist.FileStore(self.file_name, world_size)
    return dist.init_process_group(backend=BACKEND, store=store,
                                   rank=rank, world_size=world_size)
def test_all_gather_bfp16(self):
    store = dist.FileStore(self.file_name, self.world_size)
    dist.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend='gloo')
    device = torch.device(f"cuda:{self.rank}")
    group = list(range(0, self.world_size))
    group_id = dist.group.WORLD
    self._test_all_gather(group, group_id, self.rank,
                          dtype=torch.float32, qtype=DQuantType.BFP16)
def _test_warn_not_in_group(self, backend):
    store = dist.FileStore(self.file_name, self.world_size)
    dist.init_process_group(
        backend,
        world_size=self.world_size,
        rank=self.rank,
        store=store,
    )
    in_group_ranks = list(filter(lambda x: x % 2 == 0, range(self.world_size)))
    group = dist.new_group(in_group_ranks)

    x = torch.zeros(2, 2).cuda(self.rank)
    xs = [torch.zeros(2, 2).cuda(self.rank) for _ in range(len(in_group_ranks))]
    if self.rank not in in_group_ranks:
        msg = ".*{}.*does not belong to.*"
        with self.assertWarnsOnceRegex(UserWarning, msg.format("all_gather")):
            dist.all_gather(xs, x, group=group)
        with self.assertWarnsOnceRegex(UserWarning, msg.format("all_reduce")):
            dist.all_reduce(x, group=group)
        with self.assertWarnsOnceRegex(UserWarning, msg.format("barrier")):
            dist.barrier(group=group)
        with self.assertWarnsOnceRegex(UserWarning, msg.format("broadcast")):
            dist.broadcast(x, src=0, group=group)
    else:
        dist.all_gather(xs, x, group=group)
        dist.all_reduce(x, group=group)
        dist.barrier(group=group)
        dist.broadcast(x, src=0, group=group)
def _test_all_to_all_single(self, backend):
    store = c10d.FileStore(self.file_name, self.world_size)
    # This is required because these functions call directly into dist and
    # need the world to be initialized.
    c10d.init_process_group(store=store, rank=self.rank,
                            world_size=self.world_size, backend=backend)
    device = torch.device(f"cuda:{self.rank}")
    row = self.world_size * (self.rank + 1) * (self.world_size + 1) / 2
    x = torch.ones(int(row), 5, device=device) * (self.rank + 1)
    x.requires_grad = True
    y = torch.empty_like(x)
    split_sizes = [(i + 1) * (self.rank + 1) for i in range(self.world_size)]
    y = torch.distributed.nn.all_to_all_single(
        y, x, output_split_sizes=split_sizes, input_split_sizes=split_sizes)
    expected = []
    for idx, tensor in enumerate(torch.split(x, split_sizes)):
        expected.append(torch.full_like(tensor, (idx + 1)))
    expected = torch.cat(expected)
    self.assertEqual(y, expected)
    z = y.sin().sum()
    z.backward()
    x_s = ((self.rank + 1) * torch.ones(int(row), 5, device=device)).cos()
    self.assertEqual(x.grad, x_s)
def test_sync_params_no_buffers(self):
    store = c10d.FileStore(self.file.name)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [
        c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
    ]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

    # Use all available devices on every process here (data is small, so should be fine).
    devices = gpus_for_rank(self.world_size)[self.rank]
    target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
    parameter_data = [target]
    parameter_data += [
        torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
        for d in devices[1:]
    ]
    buffer_data = [[]] * len(parameter_data)

    c10d._sync_params(process_group,
                      parameter_data=parameter_data,
                      buffer_data=buffer_data,
                      devices=devices,
                      broadcast_bucket_size=10,
                      broadcast_buffers=False)

    for device_data in parameter_data:
        for i, parameter in enumerate(device_data):
            self.assertEqual(parameter, target[i])
def wrapper(self):
    store = dist.FileStore(self.file.name, self.world_size)
    dist.init_process_group(backend='gloo', rank=self.rank,
                            world_size=self.world_size, store=store)
    dist.init_model_parallel('worker%d' % self.rank)
    func(self)
    dist.join_rpc()
def dist_init(self, rank, world_size=-1, backend=BACKEND):
    if world_size < 1:
        world_size = self.world_size
    store = dist.FileStore(self.file_name, world_size)
    return dist.init_process_group(backend=backend, store=store,
                                   rank=rank, world_size=world_size)
def test_gloo_backend(self):
    store = c10d.FileStore(self.file.name)
    options = c10d.ProcessGroupGloo.Options()
    options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
    process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)
    gpus = gpus_for_rank(self.world_size)[self.rank]
    self._test_ddp_with_process_group(process_group, gpus)
    self._test_ddp_with_process_group(
        process_group,
        list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
def train(self, model, data):
    torch.manual_seed(0)
    model = model.cuda(self.rank)
    for i in range(len(data)):
        data[i][0] = data[i][0].cuda(self.rank)
        data[i][1] = data[i][1].cuda(self.rank)
    torch.cuda.synchronize(self.rank)

    process_group_size = self.trainer_count
    store = c10d.FileStore("/tmp/tmpn_k_8so02", process_group_size)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, process_group_size)

    ddp_model = DDP(model, device_ids=[self.rank], process_group=process_group)
    hook_state = self.HookState(self, process_group)
    ddp_model.register_comm_hook(hook_state, DdpNcclTrainer.hook)
    criterion = nn.CrossEntropyLoss().cuda(self.rank)
    optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4)

    def epoch_key(epoch, index):
        return f"{epoch},{index}"

    for epoch in range(self.epochs):
        for index, batch in enumerate(data):
            hook_state.next_batch_state()
            input, target = batch[0], batch[1]

            self.record_batch_start(epoch_key(epoch, index))
            optimizer.zero_grad()

            self.record_forward_start(epoch_key(epoch, index))
            out = ddp_model(input)
            self.record_forward_end(epoch_key(epoch, index))

            loss = criterion(out, target)

            self.record_backward_start(epoch_key(epoch, index))
            loss.backward()
            self.record_backward_end(epoch_key(epoch, index))

            optimizer.step()
            self.record_batch_end(epoch_key(epoch, index))

    torch.cuda.synchronize(self.rank)
def test_nccl_backend(self):
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
    gpus = gpus_for_rank(self.world_size)[self.rank]
    self._test_ddp_with_process_group(process_group, gpus)
    self._test_ddp_with_process_group(
        process_group,
        list(map(lambda i: torch.device('cuda:' + str(i)), gpus)))
def test_reinit(self):
    store = dist.FileStore(self.file.name, self.world_size)
    dist.init_process_group(backend="gloo", rank=self.rank,
                            world_size=self.world_size, store=store)
    with self.assertRaisesRegex(RuntimeError, "is not unique"):
        dist.init_model_parallel(self_name="duplicate_name",
                                 backend=BACKEND,
                                 self_rank=self.rank,
                                 init_method=RPC_INIT_URL)
    dist.join_rpc()
def wrapper(self):
    store = dist.FileStore(self.file.name, self.world_size)
    dist.init_process_group(backend='gloo', rank=self.rank,
                            world_size=self.world_size, store=store)
    dist.init_model_parallel(self_name='worker%d' % self.rank,
                             backend=BACKEND,
                             self_rank=self.rank,
                             init_method=RPC_INIT_URL)
    func(self)
    dist.join_rpc()