def test_queue_reduction(self):
    """Check that c10d._queue_reduction kicks off an allreduce and returns
    the pending work handle plus the coalesced gradient tensor."""
    # Set up the NCCL process group over a shared file store.
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # Build one bucket of 5 gradient chunks per device owned by this rank,
    # filled with (rank + 1) so the reduced value is easy to predict.
    devices = gpus_for_rank(self.world_size)[self.rank]
    grads_batch = [
        (torch.ones(10, device=torch.device('cuda', dev)) * (self.rank + 1)).chunk(5)
        for dev in devices
    ]

    work, local_grad_sum = c10d._queue_reduction(process_group,
                                                 grads_batch,
                                                 devices)
    # The first return value should be the allreduce work item.
    self.assertTrue(isinstance(work, c10d.Work))
    # The second return value will be the finished allreduced gradients.
    self.assertTrue(isinstance(local_grad_sum, torch.Tensor))

    # Wait for the allreduce to finish.
    work.wait()

    # Ranks contribute 1..world_size, so the element-wise average is
    # (world_size + 1) / 2.
    self.assertEqual(local_grad_sum,
                     torch.ones(10) * (self.world_size + 1) / 2.0)
def _queue_reduction(self, bucket_idx):
    """Launch an async reduction of the bucket at ``bucket_idx`` and record
    the pending work handle and the coalesced tensor for a later sync."""
    # dist._queue_reduction coalesces the bucket's small tensors on a
    # separate CUDA stream (to achieve more parallelism) before passing
    # the coalesced tensor into the c10d CUDA stream for reduction.
    work, coalesced = dist._queue_reduction(self.process_group,
                                            self.buckets[bucket_idx],
                                            self.device_ids)
    self.reduction_works[bucket_idx] = work
    self.buckets_coalesced[bucket_idx] = coalesced
def test_sync_reduction(self):
    """Check that c10d._sync_reduction waits on the queued allreduce and
    writes the averaged result back into the original gradient chunks."""
    # Set up the NCCL process group over a shared file store.
    store = c10d.FileStore(self.file.name)
    process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    # Build one bucket of 5 gradient chunks per device owned by this rank,
    # filled with (rank + 1) so the reduced value is easy to predict.
    devices = gpus_for_rank(self.world_size)[self.rank]
    grads_batch = [
        (torch.ones(10, device=torch.device('cuda', dev)) * (self.rank + 1)).chunk(5)
        for dev in devices
    ]

    work, local_grad_sum = c10d._queue_reduction(process_group,
                                                 grads_batch,
                                                 devices)
    # Synchronize and scatter the reduced result back into the first
    # device's chunks.
    c10d._sync_reduction(work, grads_batch[0], local_grad_sum)

    # Ranks contribute 1..world_size, so the element-wise average is
    # (world_size + 1) / 2.
    self.assertEqual(grads_batch[0],
                     (torch.ones(10) * (self.world_size + 1) / 2.0).chunk(5))
def _queue_reduction(self, bucket_idx):
    # Kick off an asynchronous reduction of the bucket at bucket_idx and
    # stash what dist._queue_reduction hands back so a later sync step can
    # wait on / unpack it.
    result = dist._queue_reduction(self.process_group, self.buckets[bucket_idx], self.device_ids)
    # result[0] is the pending allreduce work handle; result[1] is the
    # coalesced tensor for this bucket (presumably what the sync step
    # scatters back — TODO confirm against the sync path).
    self.reduction_works[bucket_idx] = result[0]
    self.buckets_coalesced[bucket_idx] = result[1]