def _forward_cudnn(self, x, gy): _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape iC = c iCg = int(iC / self.group) gW = cuda.cupy.empty((out_c, iCg, self.kh, self.kw), dtype=self.W_dtype) x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype dilation = (self.dy, self.dx) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=dilation, use_tensor_core=use_tensor_core, group=self.group) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 elif configuration.config.autotune and _cudnn_version >= 5000: algo = _get_algorithm_bwd_filter(x, gy, gW, conv_param + (dilation, ), handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports # Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 libcudnn.convolutionBackwardFilter_v3(handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def _forward_cudnn(self, x, gy): _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape iC = c iCg = int(iC / self.group) gW = cuda.cupy.empty((out_c, iCg, self.kh, self.kw), dtype=self.W_dtype) x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) # cuDNN 7 supports dilation only in *_BWD_FILTER_ALGO_0, but # it supports Tensor Cores only in *_BWD_FILTER_ALGO_1. if use_tensor_core and (self.dx > 1 or self.dy > 1): use_tensor_core = False handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype dilation = (self.dy, self.dx) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=dilation, use_tensor_core=use_tensor_core, group=self.group) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 elif configuration.config.autotune and _cudnn_version >= 5000: algo = _get_algorithm_bwd_filter( x, gy, gW, conv_param + (dilation,), handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) if use_tensor_core: algo = self._tensor_core_adjust_algo() libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def _forward_cudnn(self, x, W, b): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) n = x.shape[0] out_c = W.shape[1] use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, out_c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype dilation = (self.dy, self.dx) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=dilation, use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 elif configuration.config.autotune and _cudnn_version_ >= 5000: algo = get_algorithm(W, x, y, conv_param + (dilation, ), handle, filter_desc, x_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, filter_desc.value, x_desc.value, conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports # Tensor-Core in cuDNN7 algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 libcudnn.convolutionBackwardData_v3( handle, one.data, filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def _forward_cudnn(self, x, W, b, y): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. if use_tensor_core and (self.dx > 1 or self.dy > 1): use_tensor_core = False handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype) dilation = (self.dy, self.dx) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=dilation, use_tensor_core=use_tensor_core, group=self.group) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.autotune and _cudnn_version >= 5000: algo = _get_algorithm_fwd( x, W, y, conv_param + (dilation,), handle, x_desc, filter_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, filter_desc.value, conv_desc.value, y_desc.value, _fwd_pref, workspace_size) if use_tensor_core: algo = self._tensor_core_adjust_algo() oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, filter_desc.value, W.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def _forward_cudnn(self, x, W, b, y): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. if use_tensor_core and (self.dx > 1 or self.dy > 1): use_tensor_core = False handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype) dilation = (self.dy, self.dx) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=dilation, use_tensor_core=use_tensor_core, group=self.group) if b is not None: bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') if configuration.config.autotune and _cudnn_version >= 5000: algo = _get_algorithm_fwd(x, W, y, conv_param + (dilation, ), handle, x_desc, filter_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, filter_desc.value, conv_desc.value, y_desc.value, _fwd_pref, workspace_size) if use_tensor_core: algo = self._tensor_core_adjust_algo() oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr, filter_desc.value, W.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) x, gy = inputs _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape if (self.cover_all or not chainer.should_use_cudnn('>=auto') or x.dtype != self.W_dtype): col = conv.im2col_gpu( x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all) gW = cuda.cupy.tensordot( gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype, copy=False) return gW, gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype) x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype, use_tensor_core=use_tensor_core) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports # Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def _forward_cudnn(self, x, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) # Make empty arrays for result. out_c = gy.shape[1] in_c = x.shape[1] gW = cuda.cupy.empty((out_c, in_c) + self.ksize, dtype=self.W_dtype) # Get cuDNN handler and descriptors. use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.pad, self.stride, self.W_dtype) conv_desc = cudnn.create_convolution_descriptor( *conv_param, use_tensor_core=use_tensor_core) # Compute gradients. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') # Compute filter weight gradient. if configuration.config.autotune and _cudnn_version_ >= 5000: algo = convolution_2d._get_algorithm_bwd_filter( x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3(handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def _forward_cudnn(self, x, gy): # Convert to C-contiguous arrays. x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) # Make empty arrays for result. out_c = gy.shape[1] in_c = x.shape[1] gW = cuda.cupy.empty( (out_c, in_c) + self.ksize, dtype=self.W_dtype) # Get cuDNN handler and descriptors. use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.pad, self.stride, self.W_dtype) conv_desc = cudnn.create_convolution_descriptor( *conv_param, use_tensor_core=use_tensor_core) # Compute gradients. oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') # Compute filter weight gradient. if configuration.config.autotune and _cudnn_version_ >= 5000: algo = convolution_2d._get_algorithm_bwd_filter( x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # only retain x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph, d=self.dy) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw, d=self.dx) assert self.outw > 0, 'Width in the output should be positive.' self._set_cover_all(x, W) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, filter_desc.value, x_desc.value, conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports # Tensor-Core in cuDNN7 algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 libcudnn.convolutionBackwardData_v3( handle, one.data, filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False) # - k, m, n: shape of out_channel # - b: number of inputs # - h, w: height and width of kernels # k, m, n, b, h, w -> b, k, m, n, h, w gcol = cuda.cupy.rollaxis(gcol, 3) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw, dy=self.dy, dx=self.dx) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # retain only x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError( 'numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}'.format( type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}'.format( type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) assert out_h > 0, 'Height in the output should be positive.' out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) assert out_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_desc = cudnn.create_convolution_descriptor( (self.ph, self.pw), (self.sy, self.sx), x.dtype, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor(b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size, ), dtype='b') algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, filter_desc.value, conv_desc.value, y_desc.value, _fwd_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # supports Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # NOQA oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward(handle, one.data, x_desc.value, x.data.ptr, filter_desc.value, W.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor(handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu(x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) y = cuda.cupy.tensordot(col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,
def _forward_cudnn(self, x, W, b): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) n = x.shape[0] # out_c = W.shape[1] yCg = W.shape[1] yC = yCg * self.group use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) # cuDNN 7 supports dilation only in *_BWD_DATA_ALGO_0, but # it supports Tensor Cores only in *_BWD_DATA_ALGO_1. if use_tensor_core and (self.dx > 1 or self.dy > 1): use_tensor_core = False handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, yC, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype dilation = (self.dy, self.dx) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=dilation, use_tensor_core=use_tensor_core, group=self.group) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 elif configuration.config.autotune and _cudnn_version_ >= 5000: algo = get_algorithm( W, x, y, conv_param + (dilation,), handle, filter_desc, x_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, filter_desc.value, x_desc.value, conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) if use_tensor_core: algo = self._tensor_core_adjust_algo() libcudnn.convolutionBackwardData_v3( handle, one.data, filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # only retain x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) kh, kw = W.shape[2:] n, in_c, in_h, in_w = x.shape c = W.shape[1] # out_c if self.outh is None: self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph, d=self.dy) assert self.outh > 0, 'Height in the output should be positive.' if self.outw is None: self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw, d=self.dx) assert self.outw > 0, 'Width in the output should be positive.' self._set_cover_all(x, W) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version_ >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 elif configuration.config.autotune and _cudnn_version_ >= 5000: algo = get_algorithm(W, x, y, conv_param, handle, filter_desc, x_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionBackwardDataAlgorithm( handle, filter_desc.value, x_desc.value, conv_desc.value, y_desc.value, _bwd_data_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports # Tensor-Core in cuDNN7 algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 libcudnn.convolutionBackwardData_v3( handle, one.data, filter_desc.value, W.data.ptr, x_desc.value, x.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False) # - k, m, n: shape of out_channel # - b: number of inputs # - h, w: height and width of kernels # k, m, n, b, h, w -> b, k, m, n, h, w gcol = cuda.cupy.rollaxis(gcol, 3) y = conv.col2im_gpu( gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw, dy=self.dy, dx=self.dx) if b is not None: y += b.reshape(1, b.size, 1, 1) return y,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) x, gy = inputs _, out_c, out_h, out_w = gy.shape n, c, h, w = x.shape if (self.cover_all or not chainer.should_use_cudnn('>=auto') or x.dtype != self.W_dtype or ((self.dy > 1 or self.dx > 1) and _cudnn_version < 6000)): col = conv.im2col_gpu( x, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) gW = cuda.cupy.tensordot( gy, col, ((0, 2, 3), (0, 4, 5))).astype(self.W_dtype, copy=False) return gW, gW = cuda.cupy.empty((out_c, c, self.kh, self.kw), dtype=self.W_dtype) x = cuda.cupy.ascontiguousarray(x) gy = cuda.cupy.ascontiguousarray(gy) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) gy_desc = cudnn.create_tensor_descriptor(gy) filter_desc = cudnn.create_filter_descriptor(gW) conv_param = (self.ph, self.pw), (self.sy, self.sx), x.dtype conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.cudnn_deterministic: algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 elif configuration.config.autotune and _cudnn_version >= 5000: algo = get_algorithm_bwd_filter( x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, filter_desc, workspace) else: algo = libcudnn.getConvolutionBackwardFilterAlgorithm( handle, x_desc.value, gy_desc.value, conv_desc.value, filter_desc.value, _bwd_filter_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 supports # Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 libcudnn.convolutionBackwardFilter_v3( handle, one.data, x_desc.value, x.data.ptr, gy_desc.value, gy.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, filter_desc.value, gW.data.ptr) return gW,
def forward_gpu(self, inputs): self.retain_inputs((0, 1)) # retain only x and W x, W = inputs[:2] b = inputs[2] if len(inputs) == 3 else None if not all([isinstance(i, cuda.ndarray) for i in inputs]): if b is not None: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}, type(b): {2}' .format(type(W), type(x), type(b))) else: raise ValueError('numpy and cupy must not be used together\n' 'type(W): {0}, type(x): {1}' .format(type(W), type(x))) out_c, _, kh, kw = W.shape n, c, h, w = x.shape out_h = conv.get_conv_outsize(h, kh, self.sy, self.ph, cover_all=self.cover_all, d=self.dy) assert out_h > 0, 'Height in the output should be positive.' out_w = conv.get_conv_outsize(w, kw, self.sx, self.pw, cover_all=self.cover_all, d=self.dx) assert out_w > 0, 'Width in the output should be positive.' y = cuda.cupy.empty((n, out_c, out_h, out_w), dtype=x.dtype) if (not self.cover_all and chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype and ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)): x = cuda.cupy.ascontiguousarray(x) W = cuda.cupy.ascontiguousarray(W) if b is not None: b = cuda.cupy.ascontiguousarray(b) use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype) handle = cudnn.get_handle() x_desc = cudnn.create_tensor_descriptor(x) y_desc = cudnn.create_tensor_descriptor(y) filter_desc = cudnn.create_filter_descriptor(W) conv_param = ((self.ph, self.pw), (self.sy, self.sx), x.dtype) conv_desc = cudnn.create_convolution_descriptor( *conv_param, dilation=(self.dy, self.dx), use_tensor_core=use_tensor_core) if b is not None: bias_desc = cudnn.create_tensor_descriptor( b[None, :, None, None]) workspace_size = cuda.get_max_workspace_size() workspace = cuda.cupy.empty((workspace_size,), dtype='b') if configuration.config.autotune and _cudnn_version >= 5000: algo = get_algorithm_fwd( x, W, y, conv_param, handle, x_desc, filter_desc, conv_desc, y_desc, workspace) else: algo = libcudnn.getConvolutionForwardAlgorithm( handle, x_desc.value, filter_desc.value, conv_desc.value, y_desc.value, _fwd_pref, workspace_size) if use_tensor_core: # Only CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # supports Tensor-Core in cuDNN7. algo = libcudnn.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM # NOQA oz_dtype = 'd' if x.dtype == 'd' else 'f' one = numpy.array(1, dtype=oz_dtype).ctypes zero = numpy.array(0, dtype=oz_dtype).ctypes libcudnn.convolutionForward( handle, one.data, x_desc.value, x.data.ptr, filter_desc.value, W.data.ptr, conv_desc.value, algo, workspace.data.ptr, workspace_size, zero.data, y_desc.value, y.data.ptr) # TODO(beam2d): Support unshared bias if b is not None: cudnn.add_tensor( handle, one.data, bias_desc.value, b.data.ptr, one.data, y_desc.value, y.data.ptr) else: # Implementation using im2col col = conv.im2col_gpu( x, kh, kw, self.sy, self.sx, self.ph, self.pw, cover_all=self.cover_all, dy=self.dy, dx=self.dx) y = cuda.cupy.tensordot( col, W, ((1, 2, 3), (1, 2, 3))).astype(x.dtype, copy=False) # TODO(beam2d): Support unshared bias if b is not None: y += b y = cuda.cupy.rollaxis(y, 3, 1) return y,