def ncrelu_backward(grad_output, mask):
    assert grad_output.get_device() == mask.get_device()
    assert grad_output.is_contiguous()
    n, c, h, w = mask.size()
    with torch.cuda.device_of(grad_output):
        grad_input = grad_output.new(mask.size())
        f = load_kernel('ncrelu_backward', kernels, Dtype=Dtype(grad_output))
        f(args=[grad_input.data_ptr(), mask.data_ptr(), grad_output.data_ptr(),
                c * h * w, mask.numel()],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(mask.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return grad_input
def ncrelu_forward(input):
    assert input.dim() == 4 and input.is_contiguous()
    n, c, h, w = input.size()
    with torch.cuda.device_of(input):
        output = input.new(n, 2 * c, h, w)
        mask = torch.cuda.ByteTensor(input.size())
        f = load_kernel('ncrelu_forward', kernels, Dtype=Dtype(input))
        f(args=[output.data_ptr(), mask.data_ptr(), input.data_ptr(),
                c * h * w, input.numel()],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(input.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return output, mask
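# Hedged usage sketch for the two NCReLU entry points above (not part of the
# library itself). It assumes a CUDA device and the same module-level helpers
# used above (load_kernel, kernels, CUDA_NUM_THREADS, GET_BLOCKS, Stream,
# Dtype). The forward doubles the channel dimension and returns a byte mask
# that the backward reuses to route gradients back to the input shape.
def _ncrelu_example():
    x = torch.randn(2, 3, 8, 8).cuda()
    out, mask = ncrelu_forward(x)               # out: (2, 6, 8, 8), mask: (2, 3, 8, 8) bytes
    grad_out = torch.randn(*out.size()).cuda()  # gradient w.r.t. the doubled output
    grad_in = ncrelu_backward(grad_out.contiguous(), mask)
    return out.size(), grad_in.size()           # grad_in has the input's shape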
def forward(self, input, weight):
    assert input.dim() == 4 and input.is_cuda and weight.is_cuda
    batch_size, in_channels, bottom_height, bottom_width = input.size()
    out_channels, _, kernel_h, kernel_w = weight.size()
    output_h = (bottom_height + 2 * self.padding[0]
                - (self.dilation[0] * (kernel_h - 1) + 1)) // self.stride[0] + 1
    output_w = (bottom_width + 2 * self.padding[1]
                - (self.dilation[1] * (kernel_w - 1) + 1)) // self.stride[1] + 1
    output = input.new(batch_size, out_channels, output_h, output_w)
    n = output.numel()
    with torch.cuda.device_of(input):
        f = load_kernel('conv2d_naive_forward_kernel', _conv2d_naive_kernel,
                        Dtype=Dtype(input), nthreads=n,
                        batch_size=batch_size, in_channels=in_channels,
                        out_channels=out_channels,
                        bottom_height=bottom_height, bottom_width=bottom_width,
                        top_height=output_h, top_width=output_w,
                        kernel_h=kernel_h, kernel_w=kernel_w,
                        stride_h=self.stride[0], stride_w=self.stride[1],
                        dilation_h=self.dilation[0], dilation_w=self.dilation[1],
                        pad_h=self.padding[0], pad_w=self.padding[1])
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[input.data_ptr(), weight.data_ptr(), output.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    self.save_for_backward(input, weight)
    return output
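# Worked example of the output-size arithmetic in forward() above (the same
# shape rule F.conv2d uses): for a 32x32 input with a 3x3 kernel, stride 1,
# padding 1 and dilation 1,
#   output_h = (32 + 2*1 - (1*(3 - 1) + 1)) // 1 + 1 = 32,
# so "same" padding preserves the spatial size.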
def _col2im(data_col, kernel_size, stride, padding, out=None, input_size=None):
    assert data_col.dim() == 5
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, ksize_h, ksize_w, height_col, width_col = data_col.size()
    if input_size is not None:
        height, width = input_size
    else:
        height = (height_col - 1) * stride_h - 2 * pad_h + ksize_h
        width = (width_col - 1) * stride_w - 2 * pad_w + ksize_w
    n = n_input_plane * height * width
    if out is not None:
        assert tuple(out.size()) == (n_input_plane, height, width)
        data = out
    else:
        data = data_col.new(n_input_plane, height, width)
    with torch.cuda.device_of(data_col):
        f = load_kernel('col2im_kernel', _col2im_kernel, Dtype=Dtype(data), n=n,
                        height_col=height_col, width_col=width_col,
                        height=height, width=width,
                        ksize_h=ksize_h, ksize_w=ksize_w,
                        pad_h=pad_h, pad_w=pad_w,
                        stride_h=stride_h, stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data_col.data_ptr(), data.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data
def _im2col(data, kernel_size, stride, padding, out=None):
    assert data.dim() == 3 and data.is_cuda
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, height, width = data.size()
    height_col = (height + 2 * pad_h - ksize_h) // stride_h + 1
    width_col = (width + 2 * pad_w - ksize_w) // stride_w + 1
    n = n_input_plane * height_col * width_col
    shape = torch.Size((n_input_plane, ksize_h, ksize_w, height_col, width_col))
    if out is not None:
        assert out.size() == shape
        data_col = out
    else:
        data_col = data.new(*shape)
    with torch.cuda.device_of(data):
        f = load_kernel('im2col_kernel', _im2col_kernel, Dtype=Dtype(data), n=n,
                        height_col=height_col, width_col=width_col,
                        height=height, width=width,
                        ksize_h=ksize_h, ksize_w=ksize_w,
                        pad_h=pad_h, pad_w=pad_w,
                        stride_h=stride_h, stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data.data_ptr(), data_col.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data_col
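# Hedged sketch of how _im2col and _col2im above fit together (assumes CUDA and
# the same module-level kernel helpers). If these follow the usual Caffe-style
# semantics that their kernel names suggest, col2im sums overlapping patch
# contributions, so the round trip is a patch-sum of the input rather than an
# identity.
def _im2col_roundtrip_example():
    x = torch.randn(3, 16, 16).cuda()                       # (C, H, W), no batch dim
    cols = _im2col(x, kernel_size=3, stride=1, padding=1)   # (C, 3, 3, 16, 16)
    back = _col2im(cols, kernel_size=3, stride=1, padding=1,
                   input_size=(16, 16))                     # (C, 16, 16)
    return cols.size(), back.size()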
def backward(self, grad_output):
    assert grad_output.is_cuda and grad_output.is_contiguous()
    input, weight = self.saved_tensors
    batch_size, channels, height, width = input.size()
    kernel_h, kernel_w = weight.size()[2:]
    output_h, output_w = grad_output.size()[2:]
    grad_input, grad_weight = None, None
    opt = dict(Dtype=Dtype(grad_output),
               num=batch_size, channels=channels,
               bottom_height=height, bottom_width=width,
               top_height=output_h, top_width=output_w,
               kernel_h=kernel_h, kernel_w=kernel_w,
               stride_h=self.stride[0], stride_w=self.stride[1],
               dilation_h=self.dilation[0], dilation_w=self.dilation[1],
               pad_h=self.padding[0], pad_w=self.padding[1])
    with torch.cuda.device_of(input):
        if self.needs_input_grad[0]:
            grad_input = input.new(input.size())
            n = grad_input.numel()
            opt['nthreads'] = n
            f = load_kernel('conv2d_dw_backward_grad_input_kernel',
                            _conv2d_depthwise_kernel_backward_grad_input, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), weight.data_ptr(),
                    grad_input.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        if self.needs_input_grad[1]:
            # grad_weight is accumulated into a per-(batch, output position)
            # buffer and then reduced over those trailing dimensions.
            weight_buffer = weight.new(channels, kernel_h, kernel_w,
                                       batch_size, output_h, output_w)
            n = weight_buffer.numel()
            opt['nthreads'] = n
            f = load_kernel('conv2d_dw_backward_grad_weight_kernel',
                            _conv2d_depthwise_kernel_backward_grad_weight, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), input.data_ptr(),
                    weight_buffer.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
            grad_weight = weight_buffer.view(weight.size() + (-1,)).sum(-1)
    return grad_input, grad_weight
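# Sizing note for the grad_weight path above, assuming the depthwise weight has
# a singleton second dimension, i.e. (channels, 1, kernel_h, kernel_w), which
# the final view implies: with channels=4, a 3x3 kernel, batch_size=2 and an
# 8x8 output, weight_buffer holds 4*3*3*2*8*8 = 4608 values, and
# view(weight.size() + (-1,)).sum(-1) collapses the trailing 2*8*8 = 128
# per-sample, per-position partials into the (4, 1, 3, 3) weight gradient.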
def backward(self, grad_output):
    assert grad_output.is_cuda and grad_output.is_contiguous()
    input, weight = self.saved_tensors
    batch_size, in_channels, bottom_height, bottom_width = input.size()
    out_channels, _, kernel_h, kernel_w = weight.size()
    top_height, top_width = grad_output.size()[2:]
    grad_input, grad_weight = None, None
    opt = dict(Dtype=Dtype(grad_output),
               batch_size=batch_size, in_channels=in_channels,
               out_channels=out_channels,
               bottom_height=bottom_height, bottom_width=bottom_width,
               top_height=top_height, top_width=top_width,
               kernel_h=kernel_h, kernel_w=kernel_w,
               stride_h=self.stride[0], stride_w=self.stride[1],
               dilation_h=self.dilation[0], dilation_w=self.dilation[1],
               pad_h=self.padding[0], pad_w=self.padding[1])
    with torch.cuda.device_of(input):
        if self.needs_input_grad[0]:
            grad_input = input.new(input.size())
            n = grad_input.numel()
            opt['nthreads'] = n
            # The weight is passed to the grad-input kernel permuted to
            # (in_channels, out_channels, kernel_h, kernel_w).
            weight_transposed = weight.permute(1, 0, 2, 3).contiguous()
            f = load_kernel('conv2d_naive_backward_grad_input_kernel',
                            _conv2d_naive_kernel_backward_grad_input, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), weight_transposed.data_ptr(),
                    grad_input.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        if self.needs_input_grad[1]:
            # As in the depthwise case, per-sample, per-position partials are
            # accumulated into a buffer and reduced into the weight shape.
            weight_buffer = weight.new(out_channels, in_channels,
                                       kernel_h, kernel_w,
                                       batch_size, top_height, top_width)
            n = weight_buffer.numel()
            opt['nthreads'] = n
            f = load_kernel('conv2d_naive_backward_grad_weight_kernel',
                            _conv2d_naive_kernel_backward_grad_weight, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), input.data_ptr(),
                    weight_buffer.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
            grad_weight = weight_buffer.view(weight.size() + (-1,)).sum(-1)
    return grad_input, grad_weight
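# Hedged verification sketch for the naive forward/backward pair above. The
# class name Conv2dNaive and its (stride, padding, dilation) constructor
# arguments are assumptions about the enclosing Function (only forward() and
# backward() are shown here); the comparison also assumes the naive kernel
# follows the same cross-correlation convention as F.conv2d.
def _naive_conv2d_check_example():
    import torch.nn.functional as F
    from torch.autograd import Variable
    x = Variable(torch.randn(2, 3, 8, 8).cuda(), requires_grad=True)
    w = Variable(torch.randn(5, 3, 3, 3).cuda(), requires_grad=True)
    y = Conv2dNaive(stride=(1, 1), padding=(1, 1), dilation=(1, 1))(x, w)
    y.sum().backward()                      # exercises both backward branches
    y_ref = F.conv2d(x, w, stride=1, padding=1, dilation=1)
    return (y - y_ref).data.abs().max()     # forward discrepancy vs. the builtin conv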