Example #1
# NCReLU backward: launches a CUDA kernel that maps the gradient of the
# (n, 2*c, h, w) output back onto an (n, c, h, w) input gradient, using the
# mask recorded by the forward pass.
def ncrelu_backward(grad_output, mask):
    assert grad_output.get_device() == mask.get_device()
    assert grad_output.is_contiguous()
    n, c, h, w = mask.size()

    with torch.cuda.device_of(grad_output):
        grad_input = grad_output.new(mask.size())
        f = load_kernel('ncrelu_backward', kernels, Dtype=Dtype(grad_output))
        f(args=[
            grad_input.data_ptr(),
            mask.data_ptr(),
            grad_output.data_ptr(), c * h * w,
            mask.numel()
        ],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(mask.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return grad_input
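
The launch helpers used throughout these examples (CUDA_NUM_THREADS, GET_BLOCKS) are defined elsewhere in the source module; a minimal Caffe-style sketch of what they are assumed to look like:

# Assumed (Caffe-style) launch helpers; the originals are not shown in these examples.
CUDA_NUM_THREADS = 1024

def GET_BLOCKS(n, num_threads=CUDA_NUM_THREADS):
    # Enough 1-D blocks so that n elements get one thread each.
    return (n + num_threads - 1) // num_threads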
Example #2
# NCReLU forward: allocates an output with twice the input channels and a byte
# mask of the input's shape, then fills both with a single CUDA kernel launch.
def ncrelu_forward(input):
    assert input.dim() == 4 and input.is_contiguous()
    n, c, h, w = input.size()

    with torch.cuda.device_of(input):
        output = input.new(n, 2 * c, h, w)
        mask = torch.cuda.ByteTensor(input.size())
        f = load_kernel('ncrelu_forward', kernels, Dtype=Dtype(input))
        f(args=[
            output.data_ptr(),
            mask.data_ptr(),
            input.data_ptr(), c * h * w,
            input.numel()
        ],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(input.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return output, mask
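
The kernel source string `kernels` is not shown here. For reference, NCReLU (negative concatenated ReLU) can be written in plain PyTorch; a minimal sketch of the computation the forward above is assumed to perform:

import torch

def ncrelu_reference(input):
    # Concatenate positive and negative parts along the channel axis, which
    # matches the (n, 2 * c, h, w) output allocated by ncrelu_forward.
    return torch.cat([input.clamp(min=0), input.clamp(max=0)], dim=1)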
Example #3
    # Naive convolution forward: computes the output spatial size from
    # stride/padding/dilation and launches one CUDA thread per output element.
    def forward(self, input, weight):
        assert input.dim() == 4 and input.is_cuda and weight.is_cuda
        batch_size, in_channels, bottom_height, bottom_width = input.size()
        out_channels, _, kernel_h, kernel_w = weight.size()
        print(in_channels, out_channels, batch_size)
        output_h = int((bottom_height + 2 * self.padding[0] -
                        (self.dilation[0] *
                         (kernel_h - 1) + 1)) / self.stride[0] + 1)
        output_w = int((bottom_width + 2 * self.padding[1] -
                        (self.dilation[1] *
                         (kernel_w - 1) + 1)) / self.stride[1] + 1)

        output = input.new(batch_size, out_channels, output_h, output_w)
        n = output.numel()

        with torch.cuda.device_of(input):
            f = load_kernel('conv2d_naive_forward_kernel',
                            _conv2d_naive_kernel,
                            Dtype=Dtype(input),
                            nthreads=n,
                            batch_size=batch_size,
                            in_channels=in_channels,
                            out_channels=out_channels,
                            bottom_height=bottom_height,
                            bottom_width=bottom_width,
                            top_height=output_h,
                            top_width=output_w,
                            kernel_h=kernel_h,
                            kernel_w=kernel_w,
                            stride_h=self.stride[0],
                            stride_w=self.stride[1],
                            dilation_h=self.dilation[0],
                            dilation_w=self.dilation[1],
                            pad_h=self.padding[0],
                            pad_w=self.padding[1])
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        self.save_for_backward(input, weight)
        return output
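
A straightforward way to sanity-check this forward pass is against torch.nn.functional.conv2d. The sketch below is hypothetical: Conv2dNaive stands in for the Function class this method belongs to, and a CUDA device plus the surrounding module are assumed to be available.

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 8, 8, device='cuda')
w = torch.randn(4, 3, 3, 3, device='cuda')

# Conv2dNaive is a placeholder name for the Function this forward() belongs to.
out = Conv2dNaive(stride=(1, 1), padding=(1, 1), dilation=(1, 1))(x, w)
ref = F.conv2d(x, w, stride=1, padding=1, dilation=1)
print((out - ref).abs().max())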
Example #4
# col2im: folds a (C, kh, kw, out_h, out_w) column tensor back into a
# (C, H, W) image on the GPU, the inverse layout transform of _im2col below.
def _col2im(data_col, kernel_size, stride, padding, out=None, input_size=None):
    assert data_col.dim() == 5
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, ksize_h, ksize_w, height_col, width_col = data_col.size()
    if input_size is not None:
        height, width = input_size
    else:
        height = (height_col - 1) * stride_h - 2 * pad_h + ksize_h
        width = (width_col - 1) * stride_w - 2 * pad_w + ksize_w
    n = n_input_plane * height * width

    if out is not None:
        assert tuple(out.size()) == (n_input_plane, height, width)
        data = out
    else:
        data = data_col.new(n_input_plane, height, width)

    with torch.cuda.device_of(data_col):
        f = load_kernel('col2im_kernel',
                        _col2im_kernel,
                        Dtype=Dtype(data),
                        n=n,
                        height_col=height_col,
                        width_col=width_col,
                        height=height,
                        width=width,
                        ksize_h=ksize_h,
                        ksize_w=ksize_w,
                        pad_h=pad_h,
                        pad_w=pad_w,
                        stride_h=stride_h,
                        stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data_col.data_ptr(), data.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data
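
As a reference for the same operation, torch.nn.functional.fold reconstructs the image from the column tensor, summing overlapping patch contributions; a minimal sketch assuming unit dilation, as in the kernel parameters above:

import torch
import torch.nn.functional as F

def col2im_reference(data_col, kernel_size, stride, padding, output_size):
    # data_col: (C, kh, kw, out_h, out_w) -> image of shape (C, H, W),
    # with overlapping patch contributions summed, like col2im.
    c, kh, kw, out_h, out_w = data_col.shape
    cols = data_col.reshape(1, c * kh * kw, out_h * out_w)
    return F.fold(cols, output_size, kernel_size,
                  padding=padding, stride=stride).squeeze(0)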
Example #5
# im2col: unfolds a (C, H, W) CUDA tensor into a (C, kh, kw, out_h, out_w)
# tensor of sliding patches, one CUDA thread per (channel, output position).
def _im2col(data, kernel_size, stride, padding, out=None):
    assert data.dim() == 3 and data.is_cuda
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, height, width = data.size()
    height_col = (height + 2 * pad_h - ksize_h) // stride_h + 1
    width_col = (width + 2 * pad_w - ksize_w) // stride_w + 1
    n = n_input_plane * height_col * width_col

    shape = torch.Size(
        (n_input_plane, ksize_h, ksize_w, height_col, width_col))
    if out is not None:
        assert out.size() == shape
        data_col = out
    else:
        data_col = data.new(*shape)

    with torch.cuda.device_of(data):
        f = load_kernel('im2col_kernel',
                        _im2col_kernel,
                        Dtype=Dtype(data),
                        n=n,
                        height_col=height_col,
                        width_col=width_col,
                        height=height,
                        width=width,
                        ksize_h=ksize_h,
                        ksize_w=ksize_w,
                        pad_h=pad_h,
                        pad_w=pad_w,
                        stride_h=stride_h,
                        stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data.data_ptr(), data_col.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data_col
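
The same unfold can be expressed with torch.nn.functional.unfold, which makes a convenient reference for testing the kernel; a minimal sketch assuming unit dilation, matching the output-size formula above:

import torch
import torch.nn.functional as F

def im2col_reference(data, kernel_size, stride, padding):
    # data: (C, H, W) -> (C, kh, kw, out_h, out_w), like _im2col above.
    c, h, w = data.shape
    kh, kw = kernel_size
    sh, sw = stride
    ph, pw = padding
    out_h = (h + 2 * ph - kh) // sh + 1
    out_w = (w + 2 * pw - kw) // sw + 1
    cols = F.unfold(data.unsqueeze(0), kernel_size,
                    padding=padding, stride=stride)
    return cols.reshape(c, kh, kw, out_h, out_w)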
Example #6
    # Depthwise convolution backward: one kernel fills grad_input, another fills
    # a per-(batch, output position) weight buffer that is reduced to grad_weight.
    def backward(self, grad_output):
        assert grad_output.is_cuda and grad_output.is_contiguous()
        input, weight = self.saved_tensors

        batch_size, channels, height, width = input.size()
        kernel_h, kernel_w = weight.size()[2:]
        output_h, output_w = grad_output.size()[2:]

        grad_input, grad_weight = None, None

        opt = dict(Dtype=Dtype(grad_output),
                   num=batch_size,
                   channels=channels,
                   bottom_height=height,
                   bottom_width=width,
                   top_height=output_h,
                   top_width=output_w,
                   kernel_h=kernel_h,
                   kernel_w=kernel_w,
                   stride_h=self.stride[0],
                   stride_w=self.stride[1],
                   dilation_h=self.dilation[0],
                   dilation_w=self.dilation[1],
                   pad_h=self.padding[0],
                   pad_w=self.padding[1])

        with torch.cuda.device_of(input):
            if self.needs_input_grad[0]:
                grad_input = input.new(input.size())

                n = grad_input.numel()
                opt['nthreads'] = n

                f = load_kernel('conv2d_dw_backward_grad_input_kernel',
                                _conv2d_depthwise_kernel_backward_grad_input,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      weight.data_ptr(),
                      grad_input.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            if self.needs_input_grad[1]:
                weight_buffer = weight.new(channels, kernel_h, kernel_w,
                                           batch_size, output_h, output_w)

                n = weight_buffer.numel()
                opt['nthreads'] = n

                f = load_kernel('conv2d_dw_backward_grad_weight_kernel',
                                _conv2d_depthwise_kernel_backward_grad_weight,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      input.data_ptr(),
                      weight_buffer.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
                grad_weight = weight_buffer.view(weight.size() +
                                                 (-1, )).sum(-1)

        return grad_input, grad_weight
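
PyTorch's built-in grouped convolution covers the depthwise case, so its autograd gradients make a convenient reference for both kernels above. A hypothetical check, assuming a CUDA device:

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 8, 8, device='cuda', requires_grad=True)
w = torch.randn(3, 1, 3, 3, device='cuda', requires_grad=True)

# Depthwise convolution = grouped convolution with groups == channels.
ref = F.conv2d(x, w, stride=1, padding=1, groups=3)
ref.backward(torch.ones_like(ref))
# x.grad and w.grad can now be compared against grad_input and grad_weight
# produced by the backward() above.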
Example #7
    # Naive convolution backward: mirrors the depthwise version above, but uses a
    # transposed weight for grad_input and a full (out, in) channel weight buffer.
    def backward(self, grad_output):
        assert grad_output.is_cuda and grad_output.is_contiguous()
        input, weight = self.saved_tensors

        batch_size, in_channels, bottom_height, bottom_width = input.size()
        out_channels, _, kernel_h, kernel_w = weight.size()
        top_height, top_width = grad_output.size()[2:]

        grad_input, grad_weight = None, None

        opt = dict(Dtype=Dtype(grad_output),
                   batch_size=batch_size,
                   in_channels=in_channels,
                   out_channels=out_channels,
                   bottom_height=bottom_height,
                   bottom_width=bottom_width,
                   top_height=top_height,
                   top_width=top_width,
                   kernel_h=kernel_h,
                   kernel_w=kernel_w,
                   stride_h=self.stride[0],
                   stride_w=self.stride[1],
                   dilation_h=self.dilation[0],
                   dilation_w=self.dilation[1],
                   pad_h=self.padding[0],
                   pad_w=self.padding[1])

        with torch.cuda.device_of(input):
            if self.needs_input_grad[0]:
                grad_input = input.new(input.size())
                n = grad_input.numel()
                opt['nthreads'] = n
                weight_transposed = weight.permute(1, 0, 2, 3).contiguous()
                f = load_kernel('conv2d_naive_backward_grad_input_kernel',
                                _conv2d_naive_kernel_backward_grad_input,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      weight_transposed.data_ptr(),
                      grad_input.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
            else:
                grad_input = None

            if self.needs_input_grad[1]:
                weight_buffer = weight.new(out_channels, in_channels, kernel_h,
                                           kernel_w, batch_size, top_height,
                                           top_width)

                n = weight_buffer.numel()
                opt['nthreads'] = n

                f = load_kernel('conv2d_naive_backward_grad_weight_kernel',
                                _conv2d_naive_kernel_backward_grad_weight,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      input.data_ptr(),
                      weight_buffer.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
                grad_weight = weight_buffer.view(weight.size() +
                                                 (-1, )).sum(-1)
            else:
                grad_weight = None

        return grad_input, grad_weight