def test_all_reduce(self):
    tensors = [torch.FloatTensor(128).uniform_() for i in range(nGPUs)]
    expected = torch.FloatTensor(128).zero_()
    for t in tensors:
        expected.add_(t)

    tensors = [tensors[i].cuda(i) for i in range(nGPUs)]
    nccl.all_reduce(tensors)

    for tensor in tensors:
        self.assertEqual(tensor, expected)
def nccl_all_reduce(inputs):
    # TODO: figure out why nccl all_reduce doesn't work for gradcheck
    input_size = inputs[0].size()
    # if nccl.is_available(inputs):
    for i, inp in enumerate(inputs):
        assert inp.is_cuda, \
            "reduce_add expects all inputs to be on GPUs"
        if inp.size() != input_size:
            got = 'x'.join(str(x) for x in inp.size())
            expected = 'x'.join(str(x) for x in input_size)
            raise ValueError("input {} has invalid size: got {}, "
                             "but expected {}".format(i, got, expected))
    nccl.all_reduce(inputs)
    return inputs
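# A minimal usage sketch for the helper above (an illustration, not part of the
# original source): it assumes `torch.cuda.nccl` is imported as `nccl`, at least
# two GPUs are visible, and each input tensor already lives on its own device.
import torch
import torch.cuda.nccl as nccl

if torch.cuda.device_count() >= 2:
    # One same-shaped tensor per GPU; all_reduce sums them in place on each device.
    inputs = [torch.randn(128).cuda(i) for i in range(torch.cuda.device_count())]
    outputs = nccl_all_reduce(inputs)
    # After the call, every tensor in `outputs` holds the elementwise sum.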
def test_all_reduce(self, device, dtype):
    if TEST_WITH_ROCM and HIP_VERSION < 3.5 and dtype == torch.bfloat16:
        raise unittest.SkipTest("Skip bfloat16 test for ROCm < 3.5")
    tensors = [torch.zeros(128).uniform_().to(dtype=dtype) for i in range(nGPUs)]
    expected = torch.zeros(128, dtype=dtype)
    for t in tensors:
        expected.add_(t)

    tensors = [tensors[i].cuda(i) for i in range(nGPUs)]
    nccl.all_reduce(tensors)

    for tensor in tensors:
        self.assertEqual(tensor, expected)
def test_collective_errors(self, device):
    t = torch.rand(10).cuda(0)
    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.all_reduce(t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.reduce(t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.broadcast(t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.all_gather(t, t)

    with self.assertRaisesRegex(
            TypeError, "Inputs should be a collection of tensors"):
        nccl.reduce_scatter(t, t)
def test_all_reduce(self, device, dtype):
    cpu_tensors = [
        torch.zeros(128).uniform_().to(dtype=dtype) for i in range(nGPUs)
    ]
    expected = torch.zeros(128, dtype=dtype)
    for t in cpu_tensors:
        expected.add_(t)

    tensors = [cpu_tensors[i].cuda(i) for i in range(nGPUs)]
    nccl.all_reduce(tensors)

    for tensor in tensors:
        self.assertEqual(tensor, expected)

    # Test with tuple.
    tensors = tuple(cpu_tensors[i].cuda(i) for i in range(nGPUs))
    nccl.all_reduce(tensors)

    for tensor in tensors:
        self.assertEqual(tensor, expected)

    # Test with set.
    tensors = {cpu_tensors[i].cuda(i) for i in range(nGPUs)}
    nccl.all_reduce(tensors)

    for tensor in tensors:
        self.assertEqual(tensor, expected)