def test_allreduce_ops(self):
    """Run allreduce with each reduction op over a Gloo process group.

    Each rank contributes the one-element tensor ``[rank + 1]`` and checks
    the reduced value it gets back against the value expected for ``size``
    participants.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.size)

    def allreduce(x, op):
        # Reduce ``x`` with ``op`` and block until the work completes.
        opts = c10d.AllreduceOptions()
        opts.reduceOp = op
        pg.allreduce([x], opts).wait()

    size = self.size
    expected_sum = torch.Tensor([float(size * (size + 1) / 2)])

    # (reduction op, expected result) pairs, checked in this order.
    cases = (
        (c10d.ReduceOp.SUM, expected_sum),
        (c10d.ReduceOp.PRODUCT, torch.Tensor([float(math.factorial(size))])),
        (c10d.ReduceOp.MIN, torch.Tensor([1.0])),
        (c10d.ReduceOp.MAX, torch.Tensor([size])),
    )
    for op, expected in cases:
        x = torch.Tensor([self.rank + 1.0])
        allreduce(x, op)
        self.assertEqual(expected, x)

    # Test overloaded convenience function (defaults to using sum)
    x = torch.Tensor([self.rank + 1.0])
    pg.allreduce(x).wait()
    self.assertEqual(expected_sum, x)
def test_send_recv_all_to_all(self):
    """Exchange one tensor with every other rank via send/recv.

    This rank sends a tensor holding its own rank to each peer and receives
    one tensor from each peer, then checks that the tensor received from
    peer ``i`` holds ``i``.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.world_size, self.opts())

    # Preallocate one input and one output tensor per rank; the slots at
    # index ``self.rank`` are never sent or received into.
    inputs = [torch.Tensor([self.rank]) for _ in range(self.world_size)]
    outputs = [torch.Tensor([-1]) for _ in range(self.world_size)]

    # Every rank except ourselves.
    peers = [r for r in range(self.world_size) if r != self.rank]

    # Issue all sends first, then all recvs.
    send_work = [pg.send([inputs[peer]], peer) for peer in peers]
    recv_work = [pg.recv([outputs[peer]], peer) for peer in peers]

    # Wait for the sends to complete, then for the recvs.
    for pending in send_work + recv_work:
        pending.wait()

    # Every output other than our own must contain the respective rank.
    for peer in peers:
        self.assertEqual(torch.Tensor([peer]), outputs[peer])
def test_broadcast_ops(self):
    """Broadcast from every (root rank, root tensor) pair over Gloo.

    Each rank supplies two tensors holding ``rank * size + 0`` and
    ``rank * size + 1``; after the broadcast both must hold the value of
    the root rank's root tensor.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupGloo(store, self.rank, self.size)

    def broadcast(xs, rootRank, rootTensor):
        # Broadcast ``xs`` from the given root and block until done.
        opts = c10d.BroadcastOptions()
        opts.rootRank = rootRank
        opts.rootTensor = rootTensor
        pg.broadcast(xs, opts).wait()

    # Every rank is root once, every tensor index is root once
    for root_rank in range(self.size):
        for root_tensor in range(2):
            xs = [
                torch.Tensor([self.rank * self.size + offset])
                for offset in (0.0, 1.0)
            ]
            broadcast(xs, root_rank, root_tensor)
            expected = torch.Tensor([root_rank * self.size + root_tensor])
            for received in xs:
                self.assertEqual(expected, received)

    # Test overloaded convenience function
    x = torch.Tensor([self.rank + 1.0])
    pg.broadcast(x, root=0).wait()
    self.assertEqual(torch.Tensor([1.0]), x)
def test_allreduce_ops(self):
    """Run allreduce with each reduction op over an NCCL process group.

    For every op, one tensor ``[i + 1]`` is placed on GPU ``i`` for each of
    the ``num_gpus`` devices, reduced, and every tensor is then compared
    against the expected result.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def allreduce(tensors, op):
        # Reduce ``tensors`` with ``op`` and block until completion.
        opts = c10d.AllreduceOptions()
        opts.reduceOp = op
        pg.allreduce(tensors, opts).wait()

    def fresh_inputs():
        # One single-element tensor per GPU, holding the 1-based GPU index.
        return [torch.Tensor([i + 1]).cuda(i) for i in range(self.num_gpus)]

    gpus = self.num_gpus

    # (reduction op, expected result) pairs, checked in this order.
    cases = (
        (c10d.ReduceOp.SUM, torch.Tensor([float(gpus * (gpus + 1) / 2)])),
        (c10d.ReduceOp.PRODUCT, torch.Tensor([float(math.factorial(gpus))])),
        (c10d.ReduceOp.MIN, torch.Tensor([1.0])),
        (c10d.ReduceOp.MAX, torch.Tensor([gpus])),
    )
    for op, expected in cases:
        tensors = fresh_inputs()
        allreduce(tensors, op)
        for reduced in tensors:
            self.assertEqual(expected, reduced)
def test_broadcast_ops(self):
    """Broadcast from each local GPU tensor in turn over NCCL.

    For every root tensor index, one tensor ``[i]`` is placed on GPU ``i``,
    broadcast with this rank as root, and all tensors are then compared
    against the root tensor.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def broadcast(xs, rootRank, rootTensor):
        # Broadcast ``xs`` from the given root and block until done.
        opts = c10d.BroadcastOptions()
        opts.rootRank = rootRank
        opts.rootTensor = rootTensor
        pg.broadcast(xs, opts).wait()

    # for every root tensor
    for rt in range(self.num_gpus):
        tensors = [torch.Tensor([i]).cuda(i) for i in range(self.num_gpus)]
        broadcast(tensors, self.rank, rt)
        for received in tensors:
            self.assertEqual(received, tensors[rt])
def test_reduce_ops(self):
    """Reduce onto each local GPU tensor in turn over NCCL.

    For every root tensor index, one tensor ``[i + 1]`` is placed on GPU
    ``i``, reduced with this rank as root, and the root tensor is compared
    against ``num_gpus * (num_gpus + 1) / 2``.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def reduce(xs, rootRank, rootTensor):
        # Reduce ``xs`` onto the given root and block until done.
        opts = c10d.ReduceOptions()
        opts.rootRank = rootRank
        opts.rootTensor = rootTensor
        pg.reduce(xs, opts).wait()

    gpus = self.num_gpus
    expected = torch.Tensor([float(gpus * (gpus + 1) / 2)])

    # for every root tensor
    for rt in range(self.num_gpus):
        tensors = [torch.Tensor([i + 1]).cuda(i) for i in range(self.num_gpus)]
        reduce(tensors, self.rank, rt)
        self.assertEqual(expected, tensors[rt])
def test_allgather_ops(self):
    """Allgather one tensor per GPU over NCCL and verify the gathered lists.

    Each GPU ``i`` contributes ``[i]`` and owns an output list of
    ``world_size * num_gpus`` zero tensors; after the gather, slot ``s`` of
    every output list is expected to hold ``s``.
    """
    store = c10d.FileStore(self.file.name)
    pg = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)

    def allgather(output_ts, input_ts):
        # Gather ``input_ts`` into ``output_ts`` and block until done.
        pg.allgather(output_ts, input_ts).wait()

    slots = self.world_size * self.num_gpus

    # One output list per GPU, each with ``slots`` placeholders on that GPU.
    output_ts = [
        [torch.Tensor([0]).cuda(device) for _ in range(slots)]
        for device in range(self.num_gpus)
    ]
    tensors = [
        torch.Tensor([device]).cuda(device) for device in range(self.num_gpus)
    ]

    allgather(output_ts, tensors)

    # Verification
    for device_ts in output_ts:
        for s_idx, gathered in enumerate(device_ts):
            self.assertEqual(torch.Tensor([s_idx]), gathered)
def _create_store(self):
    """Return a new ``c10d.FileStore`` built from ``self.file.name``."""
    path = self.file.name
    store = c10d.FileStore(path)
    return store
def setUp(self):
    """Prepare the fixture: temp file, file store with timeout, and prefix.

    Sets ``self.file`` to a new named temporary file, ``self.filestore`` to
    a ``c10d.FileStore`` built from that file's name with a 300-second
    timeout, and ``self.prefix`` to ``"test_prefix"``.
    """
    self.prefix = "test_prefix"
    self.file = tempfile.NamedTemporaryFile()

    filestore = c10d.FileStore(self.file.name)
    filestore.set_timeout(timedelta(seconds=300))
    self.filestore = filestore
def _create_store(self):
    """Return a new ``c10d.FileStore`` for ``self.file.name`` with a 300 s timeout."""
    timeout = timedelta(seconds=300)
    path = self.file.name
    store = c10d.FileStore(path)
    store.set_timeout(timeout)
    return store
def setUp(self):
    """Prepare the fixture: temp file, file store, and key prefix.

    Sets ``self.file`` to a new named temporary file, ``self.filestore`` to
    a ``c10d.FileStore`` built from that file's name, and ``self.prefix``
    to ``"test_prefix"``.
    """
    self.prefix = "test_prefix"

    tmp = tempfile.NamedTemporaryFile()
    self.file = tmp
    self.filestore = c10d.FileStore(tmp.name)