def _test_wait_stream(self, source, target, cuda_sleep=None):
    with use_stream(target):
        if is_cuda(target):
            cuda_sleep(0.5)
        x = torch.ones(100, 100, device=get_device(target))

    wait_stream(source, target)

    with use_stream(source):
        assert x.sum().item() == 10000
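# A minimal sketch of the wait_stream(source, target) contract the test above
# relies on. This is an assumption about the helper's behavior, not a copy of
# its implementation: `source` must not run further work until everything
# already queued on `target` has finished. For two CUDA streams this maps onto
# torch.cuda.Stream.wait_stream (a device-side wait); when `source` is the
# CPU, the target stream is synchronized on the host instead.
def _wait_stream_sketch(source, target):
    import torch  # mirrors the module-level import used by these tests

    if isinstance(target, torch.cuda.Stream):
        if isinstance(source, torch.cuda.Stream):
            source.wait_stream(target)  # device-side wait; the host is not blocked
        else:
            target.synchronize()  # host blocks until `target` drains
    # A CPU "stream" is synchronous, so waiting on it is a no-op.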
def _test_copy_wait(prev_stream, next_stream, cuda_sleep=None):
    device = get_device(prev_stream)

    with use_stream(prev_stream):
        if is_cuda(prev_stream):
            cuda_sleep(0.5)
        x = torch.ones(100, device=device, requires_grad=True)

    (y,) = Copy.apply(prev_stream, next_stream, x)
    (y,) = Wait.apply(prev_stream, next_stream, y)

    with use_stream(next_stream):
        assert torch.allclose(y.sum(), torch.tensor(100.0, device=device))
        y.norm().backward()

    with use_stream(prev_stream):
        assert torch.allclose(x.grad.sum(), torch.tensor(10.0, device=device))
def test_record_stream_cuda(self, cuda_sleep):
    # This test detects unexpected block reallocation. For a reliable test,
    # the stream used to allocate tensors is isolated. The allocator will not
    # reuse free blocks which were allocated from another stream.
    stream_alloc = new_stream(torch.device("cuda"))
    with torch.cuda.stream(stream_alloc):
        x = torch.rand(1, device=torch.device("cuda"))

    stream = new_stream(torch.device("cuda"))
    record_stream(x, stream)
    with use_stream(stream):
        cuda_sleep(0.5)

    # 'x' is deleted from Python's perspective, but its block is still
    # required by 'stream', so 'y' shouldn't be allocated to that block.
    data_ptr = x.data_ptr()
    del x
    stream_alloc.synchronize()
    with torch.cuda.stream(stream_alloc):
        y = torch.rand(1, device=torch.device("cuda"))
    assert y.data_ptr() != data_ptr

    # Pause Python until 'stream' finishes its queued tasks. Now the block of
    # 'x' is free to be reallocated.
    wait_stream(CPUStream, stream)
    with torch.cuda.stream(stream_alloc):
        z = torch.rand(1, device=torch.device("cuda"))
    assert z.data_ptr() == data_ptr
def run(self, pipeline_record: DistributedPipelineRecord) -> None:
    """Runs pipeline parallelism. It modifies the given batches in place."""
    m = len(pipeline_record.batches)

    self.stream = current_stream(self.device)

    for chunk in range(m):
        with record_function("feed"):
            pipeline_record.wait_for(chunk)
        pipeline_record.fence(chunk)
        self.compute(pipeline_record, chunk)
        with use_stream(self.stream):
            pipeline_record.forward_results(chunk)
def test_record_stream_shifted_view(self, cuda_sleep):
    # Issue: https://github.com/pytorch/pytorch/issues/27366
    stream_alloc = new_stream(torch.device("cuda"))
    with torch.cuda.stream(stream_alloc):
        x = torch.rand(2, device=torch.device("cuda"))

    y = x[1:]
    assert y.data_ptr() > x.data_ptr()

    stream = new_stream(torch.device("cuda"))
    with use_stream(stream):
        cuda_sleep(0.5)
    record_stream(y, stream)

    data_ptr = x.data_ptr()
    del x, y

    stream_alloc.synchronize()
    with torch.cuda.stream(stream_alloc):
        z = torch.rand(2, device=torch.device("cuda"))
    assert z.data_ptr() != data_ptr
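# A hedged sketch of how a record_stream helper can handle the shifted-view
# case exercised above. This is an assumption based on the workaround
# described in https://github.com/pytorch/pytorch/issues/27366, not
# necessarily this repo's exact code: record the stream on a temporary
# tensor that aliases the whole storage, so the caching allocator protects
# the full block rather than only the view's offset region.
def _record_stream_sketch(tensor, stream):
    import torch  # mirrors the module-level import used by these tests

    if isinstance(stream, torch.cuda.Stream):
        # Alias the entire underlying storage, then record the stream on it.
        storage_alias = tensor.new_empty([0]).set_(tensor.storage())
        storage_alias.record_stream(stream)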
def test_use_stream_cuda(self):
    stream = new_stream(torch.device("cuda"))
    with use_stream(stream):
        assert current_stream(torch.device("cuda")) == stream
def test_use_stream_cpu(self):
    with use_stream(CPUStream):
        pass