Example No. 1
    def _test_wait_stream(self, source, target, cuda_sleep=None):
        with use_stream(target):
            if is_cuda(target):
                cuda_sleep(0.5)
            x = torch.ones(100, 100, device=get_device(target))

        wait_stream(source, target)

        with use_stream(source):
            assert x.sum().item() == 10000
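In this test, `wait_stream(source, target)` makes work queued on `source` after the call wait until everything already queued on `target` has finished, which is why the sum is guaranteed to see the completed fill. A minimal sketch of that behaviour, reusing the `is_cuda` helper from these examples together with the standard `torch.cuda.Stream.wait_stream`/`synchronize` calls (the name `wait_stream_sketch` is illustrative, not the library's code):

import torch

def wait_stream_sketch(source, target):
    # Make kernels queued on `source` after this call wait for `target`.
    if is_cuda(target):
        if is_cuda(source):
            # Device-side ordering only; the host thread is not blocked.
            source.wait_stream(target)
        else:
            # A CPU "stream" can only wait by blocking the host.
            target.synchronize()
    # If `target` is the CPU stream, its work has already run eagerly.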
Example No. 2
def _test_copy_wait(prev_stream, next_stream, cuda_sleep=None):
    device = get_device(prev_stream)

    with use_stream(prev_stream):
        if is_cuda(prev_stream):
            cuda_sleep(0.5)
        x = torch.ones(100, device=device, requires_grad=True)

    y, = Copy.apply(prev_stream, next_stream, x)
    y, = Wait.apply(prev_stream, next_stream, y)

    with use_stream(next_stream):
        assert torch.allclose(y.sum(), torch.tensor(100.0, device=device))
        y.norm().backward()
    with use_stream(prev_stream):
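        # ||x|| = 10 for 100 ones and d||x||/dx_i = x_i/||x|| = 0.1,
        # so the 100 gradient entries sum to 10.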
        assert torch.allclose(x.grad.sum(), torch.tensor(10.0, device=device))
Example No. 3
def get_phony(device: torch.device, *, requires_grad: bool) -> Tensor:
    """Gets a phony. A phony is a zero-sized tensor that occupies no storage.
    It is useful for creating arbitrary dependencies in an autograd graph
    because it doesn't require any gradient accumulation.

    .. note::

        Phonies are cached per device and ``requires_grad`` flag. If an
        autograd function obtains a phony internally, it must detach the phony
        before returning it; otherwise the autograd engine mutates the cached
        phony in place::

            class Phonify(torch.autograd.Function):
                @staticmethod
                def forward(ctx, input):
                    phony = get_phony(input.device, requires_grad=False)
                    return phony.detach()  # detach() is necessary.

    """
    key = (device, requires_grad)

    try:
        phony = _phonies[key]
    except KeyError:
        with use_stream(default_stream(device)):
            phony = torch.empty(0, device=device, requires_grad=requires_grad)

        _phonies[key] = phony

    return phony
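A hedged usage sketch of how a phony is typically threaded through an autograd function as an extra input, so the graph records an edge through the phony without moving any data; `JoinSketch` and the usage lines below are illustrative, not part of the library:

import torch
from torch import Tensor

class JoinSketch(torch.autograd.Function):
    """Passes `input` through while taking `phony` as an extra autograd input."""

    @staticmethod
    def forward(ctx, input: Tensor, phony: Tensor) -> Tensor:
        return input.detach()

    @staticmethod
    def backward(ctx, grad_input: Tensor):
        # `input` receives its gradient unchanged; the phony branch gets no
        # gradient, it exists only to create the dependency edge.
        return grad_input, None

x = torch.randn(4, requires_grad=True)
phony = get_phony(x.device, requires_grad=True)
x = JoinSketch.apply(x, phony)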
Example No. 4
    def test_record_stream_cuda(self, cuda_sleep):
        # This test detects unexpected block reallocation. To make it reliable,
        # tensors are allocated on an isolated stream: the caching allocator
        # does not reuse free blocks that were allocated on another stream.
        stream_alloc = new_stream(torch.device('cuda'))
        with torch.cuda.stream(stream_alloc):
            x = torch.rand(1, device=torch.device('cuda'))

        stream = new_stream(torch.device('cuda'))
        record_stream(x, stream)
        with use_stream(stream):
            cuda_sleep(0.5)

        # From Python's perspective 'x' is deleted, but its block is still
        # needed by 'stream', so 'y' must not be allocated into that block.
        data_ptr = x.data_ptr()
        del x
        stream_alloc.synchronize()
        with torch.cuda.stream(stream_alloc):
            y = torch.rand(1, device=torch.device('cuda'))
        assert y.data_ptr() != data_ptr

        # Block Python until 'stream' finishes its queued work. Only then is
        # the block of 'x' free to be reallocated.
        wait_stream(CPUStream, stream)
        with torch.cuda.stream(stream_alloc):
            z = torch.rand(1, device=torch.device('cuda'))
        assert z.data_ptr() == data_ptr
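The same discipline with the plain PyTorch API: whenever a tensor allocated on one stream is consumed on another, `Tensor.record_stream` tells the caching allocator not to reuse the block until that stream has caught up. A small illustrative sketch (stream names are arbitrary):

import torch

side = torch.cuda.Stream()
x = torch.empty(1 << 20, device='cuda')   # allocated on the current (default) stream

with torch.cuda.stream(side):
    side.wait_stream(torch.cuda.default_stream())
    y = x * 2                             # 'x' is read by a kernel queued on 'side'

x.record_stream(side)                     # keep the block reserved for 'side'
del x                                     # safe: the allocator waits for 'side' to finish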
Example No. 5
    def __getitem__(self, device: torch.device) -> Tensor:
        try:
            return self.phonies[device]
        except KeyError:
            with use_stream(default_stream(device)):
                phony = torch.empty(0, device=device, requires_grad=True)
            self.phonies[device] = phony
            return phony
Example No. 6
    def backward(ctx: Context,
                 *grad_output: Tensor,
                 ) -> Tuple[Optional[Tensor], ...]:
        prev_stream = ctx.prev_stream
        next_stream = ctx.next_stream

        grad_input: Deque[Tensor] = deque(maxlen=len(grad_output))
        input_stream = current_stream(get_device(prev_stream))

        with use_stream(prev_stream), use_stream(next_stream):
            for x in reversed(grad_output):
                y = x.to(get_device(prev_stream))
                grad_input.appendleft(y)

                # 'next_stream' is not where 'x' has been allocated.
                record_stream(x, next_stream)
                # 'y' has been allocated on 'prev_stream'.
                # It might be used on the current stream captured as 'input_stream'.
                record_stream(y, input_stream)

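        # Autograd expects one gradient per forward() argument; the two stream
        # arguments are not tensors, so they get None.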
        grad_streams: Tuple[Optional[Tensor], ...] = (None, None)
        return grad_streams + tuple(grad_input)
Example No. 7
    def forward(ctx: Context,  # type: ignore
                prev_stream: AbstractStream,
                next_stream: AbstractStream,
                *input: Tensor,
                ) -> Tensors:
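        # Stash both streams on ctx so that backward() can route gradient
        # copies back from 'next_stream' to 'prev_stream'.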
        ctx.prev_stream = prev_stream
        ctx.next_stream = next_stream

        output = []
        output_stream = current_stream(get_device(next_stream))

        with use_stream(prev_stream), use_stream(next_stream):
            for x in input:
                y = x.to(get_device(next_stream))
                output.append(y)

                # 'prev_stream' is not where 'x' has been allocated.
                record_stream(x, prev_stream)
                # 'y' has been allocated on 'next_stream'.
                # It might be used on the current stream captured as 'output_stream'.
                record_stream(y, output_stream)

        return tuple(output)
Example No. 8
    def test_record_stream_shifted_view(self, cuda_sleep):
        # Issue: https://github.com/pytorch/pytorch/issues/27366
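        # A shifted view's data_ptr() points into the middle of the allocation;
        # record_stream() must still protect the whole underlying block.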
        stream_alloc = new_stream(torch.device('cuda'))
        with torch.cuda.stream(stream_alloc):
            x = torch.rand(2, device=torch.device('cuda'))

        y = x[1:]
        assert y.data_ptr() > x.data_ptr()

        stream = new_stream(torch.device('cuda'))
        with use_stream(stream):
            cuda_sleep(0.5)
        record_stream(y, stream)

        data_ptr = x.data_ptr()
        del x, y

        stream_alloc.synchronize()
        with torch.cuda.stream(stream_alloc):
            z = torch.rand(2, device=torch.device('cuda'))
        assert z.data_ptr() != data_ptr
Example No. 9
    def finalize(self, batch: Batch) -> None:
        if self._finalize is None:
            return
        with use_stream(self.stream):
            self._finalize(batch)
Example No. 10
    def compute(self) -> Batch:
        with use_stream(self.stream):
            return self._compute()
Example No. 11
    def test_use_stream_cuda(self):
        stream = new_stream(torch.device('cuda'))
        with use_stream(stream):
            assert current_stream(torch.device('cuda')) == stream
Example No. 12
    def test_use_stream_cpu(self):
        with use_stream(CPUStream):
            pass
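The semantics these two tests rely on can be sketched as a small context manager; this is a hedged sketch assuming the `is_cuda` helper from the earlier examples, not necessarily the library's exact implementation:

from contextlib import contextmanager

import torch

@contextmanager
def use_stream_sketch(stream):
    if not is_cuda(stream):
        # CPUStream carries no device state to switch, so just run the block.
        yield
        return
    with torch.cuda.stream(stream):
        # Kernels launched inside this block are queued on `stream`, which is
        # what the current_stream() assertion above relies on.
        yield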