def test_col2im_consistency(self):
    col = conv.im2col_cpu(self.x, 3, 3, 2, 2, 2, 2, dy=2, dx=2)
    h, w = self.x.shape[2:]
    im_cpu = conv.col2im_cpu(col, 2, 2, 2, 2, h, w, dy=2, dx=2)
    im_gpu = conv.col2im_gpu(
        cuda.to_gpu(col), 2, 2, 2, 2, h, w, dy=2, dx=2)
    testing.assert_allclose(im_cpu, im_gpu.get())
def forward_gpu(self, x):
    xp = cuda.cupy
    n, c, h, w = x[0].shape
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(h, self.kh, self.sy, self.ph,
                                            cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(w, self.kw, self.sx, self.pw,
                                            cover_all=self.cover_all)
    up_y = xp.zeros((n, c, self.outh, self.outw), dtype=numpy.float32)
    up_y = conv.im2col_gpu(up_y, self.kh, self.kw, self.sy, self.sx,
                           self.ph, self.pw, cover_all=self.cover_all)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = up_y.shape
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    xp.ElementwiseKernel(
        'int32 index, float32 x, int32 n, int32 c, int32 oy, int32 ox,'
        'int32 ky, int32 kx',
        'raw float32 up_y',
        '''
        int yn = i / c / oy / ox;
        int yc = (i / oy / ox) % c;
        int yoy = (i / ox) % oy;
        int yox = i % ox;
        up_y[yn * c * oy * ox * ky * kx +
             yc * oy * ox * ky * kx +
             yoy * ox * ky * kx +
             yox * ky * kx +
             index] = x;
        ''',
        'upsampling_2d_fwd',
    )(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                           self.outh, self.outw)
    return (up_y,)
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        if not gy.flags.c_contiguous:
            gy = cuda.cupy.ascontiguousarray(gy)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        dtype = x.dtype
        one = numpy.array(1, dtype=dtype).ctypes
        zero = numpy.array(0, dtype=dtype).ctypes
        libcudnn.convolutionBackwardFilter(
            handle, one.data, x_desc.value, x.data.ptr,
            gy_desc.value, gy.data.ptr, self.conv_desc.value,
            zero.data, self.filter_desc.value, gW.data.ptr)

        gx = cuda.cupy.empty_like(x)
        libcudnn.convolutionBackwardData(
            handle, one.data, self.filter_desc.value, W.data.ptr,
            gy_desc.value, gy.data.ptr, self.conv_desc.value,
            zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(inputs[2])
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW_mat = gW.reshape(out_c, c * kh * kw)
        col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w)
        gy_mats = gy.reshape(n, out_c, out_h * out_w)
        # TODO(beam2d): Use streams or batch gemm
        gW_mat[...] = 0
        for i in moves.range(n):
            gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T)

        W_mat = W.reshape(out_c, -1)
        gcol = cuda.cupy.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w)
        for i in moves.range(n):
            cuda.cupy.dot(W_mat.T, gy_mats[i], gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def forward(self, x):
    self.retain_inputs(())
    h, w = x[0].shape[2:]
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(h, self.kh, self.sy, self.ph,
                                            cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(w, self.kw, self.sx, self.pw,
                                            cover_all=self.cover_all)
    xp = cuda.get_array_module(*x)
    col = xp.tile(x[0][:, :, None, None],
                  (1, 1, self.kh, self.kw, 1, 1))
    if xp is numpy:
        y = conv.col2im_cpu(col, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    else:
        y = conv.col2im_gpu(col, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    return y,
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        gy_arr = gy[0]
        if not gy_arr.flags.c_contiguous:
            gy_arr = cuda.cupy.ascontiguousarray(gy_arr)
        gy_desc = cudnn.create_tensor_descriptor(gy_arr)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        if self.b is not None:
            libcudnn.convolutionBackwardBias(
                handle, one, gy_desc.value, gy_arr.data.ptr,
                one, self.bias_desc.value, self.gb.data.ptr)

        libcudnn.convolutionBackwardFilter(
            handle, one, x_desc.value, x[0].data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            one, self.filter_desc.value, self.gW.data.ptr)

        gx = cuda.empty_like(x[0])
        libcudnn.convolutionBackwardData(
            handle, one, self.filter_desc.value, self.W.data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            zero, x_desc.value, gx.data.ptr)
    else:
        if self.gb is not None:
            self.gb += gy[0].sum(axis=(0, 2, 3))

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(n, c * self.kh * self.kw,
                                    out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.cupy.dot(W_mat.T, gy_mats[i], gcol_mats[i])

        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)

    return gx,
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
    if cuda.cudnn_enabled and self.use_cudnn:
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw),
                            dtype=numpy.float32)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx))
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        one = numpy.array(1, dtype=x.dtype).ctypes
        zero = numpy.array(0, dtype=x.dtype).ctypes
        libcudnn.convolutionBackwardData_v2(
            handle, one.data, self.filter_desc.value, W.data.ptr,
            x_desc.value, x.data.ptr, self.conv_desc.value,
            zero.data, y_desc.value, y.data.ptr)
        if b is not None:
            libcudnn.addTensor_v2(
                handle, libcudnn.CUDNN_ADD_SAME_C, one.data,
                self.bias_desc.value, b.data.ptr, one.data,
                y_desc.value, y.data.ptr)
    else:
        W_mat = W.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        gcol = cuda.cupy.empty(
            (n, c, kh, kw, in_h, in_w), dtype=numpy.float32)
        gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w)
        for i in moves.range(n):
            cuda.cupy.dot(W_mat.T, x_mats[i], gcol_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], h, w)
        gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
        if self.b is not None:
            libcudnn.cudnnConvolutionBackwardBias(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                1, self.bias_desc.value, cudnn.get_ptr(self.gb))

        libcudnn.cudnnConvolutionBackwardFilter(
            handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            1, self.filter_desc.value, cudnn.get_ptr(self.gW))

        gx = cuda.empty_like(x[0])
        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
            0, x_desc.value, cudnn.get_ptr(gx))
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            # TODO(beam2d): Unify kernels
            with cuda.using_cumisc(handle):
                tmp = cuda.cumisc.sum(
                    gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                self.gb += tmp

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
def backward_gpu(self, x, gy):
    out_c, out_h, out_w = gy[0].shape[1:]
    n, c, h, w = x[0].shape

    if cuda.cudnn_enabled and self.use_cudnn:
        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x[0])
        gy_arr = gy[0]
        if not gy_arr.flags.c_contiguous:
            gy_arr = cuda.cupy.ascontiguousarray(gy_arr)
        gy_desc = cudnn.create_tensor_descriptor(gy_arr)
        one = ctypes.c_float(1)
        zero = ctypes.c_float(0)
        if self.b is not None:
            libcudnn.convolutionBackwardBias(
                handle, one, gy_desc.value, gy_arr.data.ptr,
                one, self.bias_desc.value, self.gb.data.ptr)

        libcudnn.convolutionBackwardFilter(
            handle, one, x_desc.value, x[0].data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            one, self.filter_desc.value, self.gW.data.ptr)

        gx = cuda.empty_like(x[0])
        libcudnn.convolutionBackwardData(
            handle, one, self.filter_desc.value, self.W.data.ptr,
            gy_desc.value, gy_arr.data.ptr, self.conv_desc.value,
            zero, x_desc.value, gx.data.ptr)
    else:
        handle = cuda.get_cublas_handle()
        if self.gb is not None:
            self.gb += gy[0].sum(axis=(0, 2, 3))

        # TODO(beam2d): Use streams
        gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
        col_mats = self.col.reshape(
            n, c * self.kh * self.kw, out_h * out_w)
        gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.add_dot(
                gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        gcol = cuda.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, gy_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

    return gx,
def forward(self, x):
    h, w = x[0].shape[2:]
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(h, self.kh, self.sy, self.ph,
                                            cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(w, self.kw, self.sx, self.pw,
                                            cover_all=self.cover_all)
    xp = cuda.get_array_module(*x)
    col = xp.tile(x[0][:, :, xp.newaxis, xp.newaxis],
                  (1, 1, self.kh, self.kw, 1, 1))
    if isinstance(x[0], cuda.ndarray):
        y = conv.col2im_gpu(col, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    else:
        y = conv.col2im_cpu(col, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    return (y,)
def forward(self, x):
    h, w = x[0].shape[2:]
    n = x[0].shape[0]
    c = x[0].shape[1]
    indexes = x[1]
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(h, self.kh, self.sy, self.ph,
                                            cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(w, self.kw, self.sx, self.pw,
                                            cover_all=self.cover_all)
    xp = cuda.get_array_module(*x)
    col = xp.tile(x[0][:, :, xp.newaxis, xp.newaxis],
                  (1, 1, self.kh, self.kw, 1, 1))
    # NOTE(hvy): Take indexes (switches) into account
    # TODO(hvy): Remove the loops and make it efficient
    y = xp.zeros_like(col)
    if isinstance(x[0], cuda.ndarray):
        indexes = cuda.cupy.asnumpy(indexes)
    # The column loop must not be named `c`, which already holds the
    # channel count; reusing it would clobber the `range(c)` bound after
    # the first batch iteration.
    for n_i in range(n):
        for c_i in range(c):
            for r in range(h):
                for w_i in range(w):
                    index = indexes[n_i][c_i][r][w_i]
                    if index < self.kw:
                        y[n_i][c_i].T[w_i][r][index][0] = \
                            col[n_i][c_i].T[w_i][r][index][0]
                    else:
                        y[n_i][c_i].T[w_i][r][index % self.kw][1] = \
                            col[n_i][c_i].T[w_i][r][index % self.kw][1]
    if isinstance(x[0], cuda.ndarray):
        y = conv.col2im_gpu(y, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    else:
        y = conv.col2im_cpu(y, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    return y,
def _forward_gpu_core(self, x, W, b):
    # Implementation using col2im
    gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype, copy=False)
    # - k, m, n: shape of out_channel
    # - b: number of inputs
    # - h, w: height and width of kernels
    # k, m, n, b, h, w -> b, k, m, n, h, w
    gcol = cuda.cupy.rollaxis(gcol, 3)
    y = conv.col2im_gpu(
        gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw,
        dy=self.dy, dx=self.dx)
    if b is not None:
        y += b.reshape(1, b.size, 1, 1)
    return y,
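# A minimal NumPy sketch (not part of the original code) verifying the
# tensordot + col2im formulation of transposed convolution used in
# _forward_gpu_core above. `_col2im_cpu` is a hypothetical helper that
# mirrors chainer.utils.conv.col2im_cpu without dilation; the result is
# checked against a naive scatter loop.
import numpy


def _col2im_cpu(col, sy, sx, ph, pw, h, w):
    # Scatter-add a (n, c, kh, kw, out_h, out_w) column tensor back into
    # a (n, c, h, w) image; this is the adjoint of im2col.
    n, c, kh, kw, out_h, out_w = col.shape
    img = numpy.zeros((n, c, h + 2 * ph + sy - 1, w + 2 * pw + sx - 1),
                      dtype=col.dtype)
    for j in range(kh):
        for i in range(kw):
            img[:, :, j:j + sy * out_h:sy, i:i + sx * out_w:sx] += \
                col[:, :, j, i]
    return img[:, :, ph:h + ph, pw:w + pw]


n, in_c, out_c, kh, kw = 2, 3, 4, 3, 3
sy = sx = 2
ph = pw = 1
in_h = in_w = 4
outh = sy * (in_h - 1) + kh - 2 * ph  # get_deconv_outsize, cover_all=False
outw = sx * (in_w - 1) + kw - 2 * pw

rng = numpy.random.RandomState(0)
x = rng.randn(n, in_c, in_h, in_w).astype(numpy.float32)
W = rng.randn(in_c, out_c, kh, kw).astype(numpy.float32)

# col2im formulation: contract over in_c, then scatter the kernel taps.
gcol = numpy.tensordot(W, x, (0, 1))  # (out_c, kh, kw, n, in_h, in_w)
gcol = numpy.rollaxis(gcol, 3)        # (n, out_c, kh, kw, in_h, in_w)
y = _col2im_cpu(gcol, sy, sx, ph, pw, outh, outw)

# Naive transposed convolution: each input pixel stamps a weighted kernel.
y_naive = numpy.zeros_like(y)
for b in range(n):
    for ic in range(in_c):
        for i in range(in_h):
            for j in range(in_w):
                for u in range(kh):
                    for v in range(kw):
                        oy, ox = i * sy + u - ph, j * sx + v - pw
                        if 0 <= oy < outh and 0 <= ox < outw:
                            y_naive[b, :, oy, ox] += \
                                W[ic, :, u, v] * x[b, ic, i, j]

assert numpy.allclose(y, y_naive, atol=1e-4)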
def backward(self, inputs, grad_outputs):
    x, = inputs
    xp = cuda.get_array_module(x)
    gy, = grad_outputs
    n, _, out_h, out_w = gy.shape
    _, c, h, w = x.shape
    gy = gy.reshape(n, c, self.kh, self.kw, out_h, out_w)
    if xp is numpy:
        gx = col2im_cpu(
            gy, self.sy, self.sx, self.ph, self.pw, h, w,
            self.dy, self.dx)
    else:
        gx = col2im_gpu(
            gy, self.sy, self.sx, self.ph, self.pw, h, w,
            self.dy, self.dx)
    return gx,
def forward_gpu(self, x):
    self.retain_inputs(())
    self._in_dtype = x[0].dtype

    xp = cuda.cupy
    n, c, h, w = x[0].shape
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(h, self.kh, self.sy, self.ph,
                                            cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(w, self.kw, self.sx, self.pw,
                                            cover_all=self.cover_all)
    up_y = xp.zeros((n, c, self.outh, self.outw), dtype=numpy.float32)
    up_y = conv.im2col_gpu(up_y, self.kh, self.kw, self.sy, self.sx,
                           self.ph, self.pw, cover_all=self.cover_all)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = up_y.shape
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    xp.ElementwiseKernel(
        'int32 index, float32 x, int32 n, int32 c, int32 oy, int32 ox,'
        'int32 ky, int32 kx',
        'raw float32 up_y',
        '''
        int yn = i / c / oy / ox;
        int yc = (i / oy / ox) % c;
        int yoy = (i / ox) % oy;
        int yox = i % ox;
        up_y[yn * c * oy * ox * ky * kx +
             yc * oy * ox * ky * kx +
             yoy * ox * ky * kx +
             yox * ky * kx +
             index] = x;
        ''',
        'upsampling_2d_fwd')(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                           self.outh, self.outw)
    return up_y,
def forward_gpu(self, x):
    n, out_c, out_h, out_w = x[0].shape
    c = self.W.shape[1]
    h = get_deconv_outsize(out_h, self.kh, self.sy, self.ph)
    w = get_deconv_outsize(out_w, self.kw, self.sx, self.pw)
    if cudnn.enabled and self.use_cudnn:
        handle = cudnn.get_default_handle()
        x_desc = cudnn.get_tensor_desc(x[0], out_h, out_w)
        y = cuda.empty((n, c, h, w), dtype=numpy.float32)
        y_desc = cudnn.get_tensor_desc(y, h, w)

        self.filter_desc = cudnn.get_filter4d_desc(self.W)
        self.conv_desc = cudnn.get_conv2d_desc(
            (self.ph, self.pw), (self.sy, self.sx))
        if self.b is not None:
            self.bias_desc = cudnn.get_conv_bias_desc(self.b)

        libcudnn.cudnnConvolutionBackwardData(
            handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
            x_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
            0, y_desc.value, cudnn.get_ptr(y))
        if self.b is not None:
            libcudnn.cudnnAddTensor(
                handle, libcudnn.cudnnAddMode['CUDNN_ADD_SAME_C'],
                1, self.bias_desc.value, cudnn.get_ptr(self.b),
                1, y_desc.value, cudnn.get_ptr(y))
    else:
        handle = cuda.get_cublas_handle()
        # TODO(beam2d): Use streams
        W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
        x_mats = x[0].reshape(n, out_c, out_h * out_w)
        gcol = cuda.empty((n, c, self.kh, self.kw, out_h, out_w),
                          dtype=numpy.float32)
        gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
        for i in moves.range(n):
            cuda.culinalg.dot(W_mat, x_mats[i], transa='T',
                              handle=handle, out=gcol_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)
        # TODO(beam2d): Support unshared bias
        if self.b is not None:
            cuda.elementwise(
                'float* y, const float* b, int c, int hw',
                'y[i] += b[i / hw % c]',
                'conv_bias_fwd')(y, self.b, c, h * w)
    return y,
def backward(self, inputs, grad_outputs):
    x, W = inputs[:2]
    xp = cuda.get_array_module(*x)
    W = xp.where(W >= 0, 1, -1).astype(numpy.float32, copy=False)
    W = self.M * W
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    h, w = x.shape[2:]

    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]

    # (B, C*D, IY, IX) -> (C, D, B*IY*IX)
    gy_ = gy.reshape((B, C, D, IY * IX)).transpose(1, 2, 0, 3) \
        .reshape((C, D, B * IY * IX))
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    # (C, D, B*IY*IX), (C, B*IY*IX, KY*KX) -> (C, D, KY*KX)
    gW_ = _matmul(gy_, c_, xp)
    gW = gW_.reshape((C, D, KY, KX)).transpose(1, 0, 2, 3)
    gW = gW.astype(W.dtype, copy=False)

    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))
    # (C, KY*KX, D), (C, D, B*IY*IX) -> (C, KY*KX, B*IY*IX)
    gcol = _matmul(w_, gy_, xp).reshape((C, KY, KX, B, IY, IX))
    gcol = gcol.astype(x.dtype, copy=False)
    gcol = xp.rollaxis(gcol, 3)

    if xp is numpy:
        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)
    else:
        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)

    if b is None:
        return gx, gW
    else:
        gy = xp.rollaxis(gy, 1, 4)
        gb = gy.sum(axis=(0, 1, 2))
        return gx, gW, gb
def forward(self, x):
    h, w = x[0].shape[2:]
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(
            h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(
            w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
    xp = cuda.get_array_module(*x)
    col = xp.tile(x[0][:, :, None, None],
                  (1, 1, self.kh, self.kw, 1, 1))
    if xp is numpy:
        y = conv.col2im_cpu(col, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    else:
        y = conv.col2im_gpu(col, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
    return y,
def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))  # only retain x and W
    if len(inputs) == 2:
        (x, W), b = inputs, None
    else:
        x, W, b = inputs

    self._calc_out_size(x, W)
    self._set_cover_all(x, W)

    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == W.dtype and
            ((self.dy == 1 and self.dx == 1) or
             (_cudnn_version_ >= 6000 and
              not configuration.config.cudnn_deterministic))):
        # cuDNN implementation
        return self._forward_cudnn(x, W, b)
    else:
        # Implementation using col2im
        gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                        copy=False)
        # - k, m, n: shape of out_channel
        # - b: number of inputs
        # - h, w: height and width of kernels
        # k, m, n, b, h, w -> b, k, m, n, h, w
        gcol = cuda.cupy.rollaxis(gcol, 3)
        y = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw, dy=self.dy, dx=self.dx)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
        return y,
def backward(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    h, w = x.shape[2:]
    xp = cuda.get_array_module(*x)

    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]

    # (B, C*D, IY, IX) -> (C, D, B*IY*IX)
    gy_ = gy.reshape((B, C, D, IY * IX)).transpose(1, 2, 0, 3) \
        .reshape((C, D, B * IY * IX))
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    # (C, D, B*IY*IX), (C, B*IY*IX, KY*KX) -> (C, D, KY*KX)
    gW_ = _matmul(gy_, c_, xp)
    gW = gW_.reshape((C, D, KY, KX)).transpose(1, 0, 2, 3)
    gW = gW.astype(W.dtype, copy=False)

    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))
    # (C, KY*KX, D), (C, D, B*IY*IX) -> (C, KY*KX, B*IY*IX)
    gcol = _matmul(w_, gy_, xp).reshape((C, KY, KX, B, IY, IX))
    gcol = gcol.astype(x.dtype, copy=False)
    gcol = xp.rollaxis(gcol, 3)

    if xp is numpy:
        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)
    else:
        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)

    if b is None:
        return gx, gW
    else:
        gy = xp.rollaxis(gy, 1, 4)
        gb = gy.sum(axis=(0, 1, 2))
        return gx, gW, gb
def forward_gpu(self, x):
    self.retain_inputs(())
    self._in_dtype = x[0].dtype

    xp = cuda.cupy
    n, c, h, w = x[0].shape
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(
            h, self.kh, self.sy, self.ph, cover_all=self.cover_all)
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(
            w, self.kw, self.sx, self.pw, cover_all=self.cover_all)
    up_y = xp.zeros((n, c, self.outh, self.outw), dtype=self._in_dtype)
    up_y = conv.im2col_gpu(
        up_y, self.kh, self.kw, self.sy, self.sx, self.ph, self.pw,
        cover_all=self.cover_all)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    n, c, oy, ox, ky, kx = up_y.shape
    indexes = xp.asarray(self.indexes, dtype=numpy.int32)
    xp.ElementwiseKernel(
        'int32 index, T x, int32 n, int32 c, int32 oy, int32 ox,'
        'int32 ky, int32 kx',
        'raw T up_y',
        '''
        int yn = i / c / oy / ox;
        int yc = (i / oy / ox) % c;
        int yoy = (i / ox) % oy;
        int yox = i % ox;
        up_y[yn * c * oy * ox * ky * kx +
             yc * oy * ox * ky * kx +
             yoy * ox * ky * kx +
             yox * ky * kx +
             index] = x;
        ''',
        'upsampling_2d_fwd')(indexes, x[0], n, c, oy, ox, ky, kx, up_y)
    up_y = up_y.transpose(0, 1, 4, 5, 2, 3)
    up_y = conv.col2im_gpu(up_y, self.sy, self.sx, self.ph, self.pw,
                           self.outh, self.outw)
    return up_y,
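# The flat-index arithmetic in the `upsampling_2d_fwd` kernel above decodes
# the element index `i` into (yn, yc, yoy, yox) and writes `x` at the kernel
# offset `index` in the transposed (n, c, oy, ox, ky, kx) layout. Below is a
# minimal NumPy sketch of the same scatter (not part of the original code;
# `upsample_scatter` is a hypothetical name, and it assumes `indexes` holds
# ky*kx-flattened offsets as produced by max pooling):
import numpy


def upsample_scatter(x, indexes, ky, kx):
    # x, indexes: (n, c, oy, ox); returns (n, c, oy, ox, ky, kx) with each
    # x value placed at its pooling offset and zeros elsewhere.
    n, c, oy, ox = x.shape
    up = numpy.zeros((n, c, oy, ox, ky * kx), dtype=x.dtype)
    flat = indexes.reshape(-1)
    up.reshape(-1, ky * kx)[numpy.arange(flat.size), flat] = x.reshape(-1)
    return up.reshape(n, c, oy, ox, ky, kx)


x = numpy.arange(4, dtype=numpy.float32).reshape(1, 1, 2, 2)
idx = numpy.array([[[[0, 3], [1, 2]]]], dtype=numpy.int32)
up = upsample_scatter(x, idx, 2, 2)
assert up[0, 0, 0, 1, 1, 1] == 1.0  # offset 3 decodes to (ky, kx) == (1, 1)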
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        gx = cuda.cupy.empty_like(x)

        if _cudnn_version >= 4000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value,
                self.conv_desc.value, self.filter_desc.value,
                _bwd_filter_pref, workspace_size)
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)

            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, self.filter_desc.value, gy_desc.value,
                self.conv_desc.value, x_desc.value, _bwd_data_pref,
                workspace_size)
            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, x_desc.value, gx.data.ptr)
        else:
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW_mat = gW.reshape(out_c, c * kh * kw)
        col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w)
        gy_mats = gy.reshape(n, out_c, out_h * out_w)
        # TODO(beam2d): Use streams or batch gemm
        gW_mat[...] = 0
        for i in moves.range(n):
            gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T)

        W_mat = W.reshape(out_c, -1)
        gcol = cuda.cupy.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w)
        for i in moves.range(n):
            gcol_mats[i] = cuda.cupy.dot(W_mat.T, gy_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
        assert self.outh > 0, 'Height in the output should be positive.'
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
        assert self.outw > 0, 'Width in the output should be positive.'
    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        if _cudnn_version >= 4000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, self.filter_desc.value, x_desc.value,
                self.conv_desc.value, y_desc.value, _bwd_data_pref,
                workspace_size)
            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, y_desc.value, y.data.ptr)
        else:
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        W_mat = W.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        gcol = cuda.cupy.empty(
            (n, c, kh, kw, in_h, in_w), dtype=x.dtype)
        gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w)
        for i in moves.range(n):
            gcol_mats[i] = cuda.cupy.dot(W_mat.T, x_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                    type(W), type(x), type(b)))
        else:
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(x): {1}'.format(type(W), type(x)))

    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
        assert self.outh > 0, 'Height in the output should be positive.'
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
        assert self.outw > 0, 'Width in the output should be positive.'

    if (chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        if _cudnn_version >= 3000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')
            if configuration.config.cudnn_deterministic:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA
            else:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, self.filter_desc.value, x_desc.value,
                    self.conv_desc.value, y_desc.value, _bwd_data_pref,
                    workspace_size)
            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, y_desc.value, y.data.ptr)
        else:
            if configuration.config.cudnn_deterministic:
                raise ValueError(
                    "`cudnn_deterministic` option must be False "
                    "if the forward propagation of "
                    "chainer.functions.Deconvolution2D "
                    "uses cuDNN and cuDNN versions < v3. "
                    "Turn off cudnn_deterministic option with "
                    "`chainer.using_config('cudnn_deterministic', False)` "
                    "context.")
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                        copy=False)
        # - k, m, n: shape of out_channel
        # - b: number of inputs
        # - h, w: height and width of kernels
        # k, m, n, b, h, w -> b, k, m, n, h, w
        gcol = cuda.cupy.rollaxis(gcol, 3)
        y = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                            self.outh, self.outw)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    gx = None

    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        if _cudnn_version >= 3000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            if configuration.config.cudnn_deterministic:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA
            else:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, x_desc.value, gy_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)

            if self.requires_x_grad:
                if configuration.config.cudnn_deterministic:
                    algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA
                else:
                    algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                        handle, self.filter_desc.value, gy_desc.value,
                        self.conv_desc.value, x_desc.value,
                        _bwd_data_pref, workspace_size)
                gx = cuda.cupy.empty_like(x)
                libcudnn.convolutionBackwardData_v3(
                    handle, one.data, self.filter_desc.value, W.data.ptr,
                    gy_desc.value, gy.data.ptr, self.conv_desc.value,
                    algo, workspace.data.ptr, workspace_size,
                    zero.data, x_desc.value, gx.data.ptr)
        else:
            if configuration.config.cudnn_deterministic:
                raise ValueError(
                    "`cudnn_deterministic` option must be False "
                    "if the backpropagation of "
                    "chainer.functions.Convolution2D "
                    "uses cuDNN and cuDNN versions < v3. "
                    "Turn off cudnn_deterministic option with "
                    "`chainer.using_config('cudnn_deterministic', False)` "
                    "context.")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
            if self.requires_x_grad:
                gx = cuda.cupy.empty_like(x)
                libcudnn.convolutionBackwardData_v2(
                    handle, one.data, self.filter_desc.value, W.data.ptr,
                    gy_desc.value, gy.data.ptr, self.conv_desc.value,
                    zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        if self.requires_x_grad:
            gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                             copy=False)
            gcol = cuda.cupy.rollaxis(gcol, 3)
            gx = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def backward(self, inputs, grad_outputs):
    x, W = inputs[:2]
    if self.bcoeffs is not None:
        xp = cuda.get_array_module(*x)
        olen, ilen, hlen, wlen = W.shape
        if self.coeffs is None:
            self.coeffs = numpy.ones(ilen)
        coeffs = numpy.copy(self.bcoeffs)
        coeffs = numpy.expand_dims(coeffs, 1)
        coeffs = numpy.expand_dims(coeffs, 1)
        coeffs = numpy.expand_dims(coeffs, 0)
        coeffs = numpy.broadcast_to(coeffs, W.shape)
        self.mW = xp.asarray(coeffs, numpy.float32).reshape(W.shape)
    if self.ocoeffs is not None:
        xp = cuda.get_array_module(*x)
        coeffs = numpy.copy(self.ocoeffs)
        self.mb = xp.asarray(coeffs, numpy.float32)

    W = self.M * W
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    h, w = x.shape[2:]
    xp = cuda.get_array_module(*x)

    B, C, KY, KX, IY, IX = self.col.shape
    D = W.shape[0]

    # (B, C*D, IY, IX) -> (C, D, B*IY*IX)
    gy_ = gy.reshape((B, C, D, IY * IX)).transpose(1, 2, 0, 3) \
        .reshape((C, D, B * IY * IX))
    c_ = self.col.transpose(1, 0, 4, 5, 2, 3) \
        .reshape((C, B * IY * IX, KY * KX))
    # (C, D, B*IY*IX), (C, B*IY*IX, KY*KX) -> (C, D, KY*KX)
    gW_ = _matmul(gy_, c_, xp)
    gW = gW_.reshape((C, D, KY, KX)).transpose(1, 0, 2, 3)
    gW = gW.astype(W.dtype, copy=False)

    w_ = W.transpose(1, 2, 3, 0).reshape((C, KY * KX, D))
    # (C, KY*KX, D), (C, D, B*IY*IX) -> (C, KY*KX, B*IY*IX)
    gcol = _matmul(w_, gy_, xp).reshape((C, KY, KX, B, IY, IX))
    gcol = gcol.astype(x.dtype, copy=False)
    gcol = xp.rollaxis(gcol, 3)

    if xp is numpy:
        gx = conv.col2im_cpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)
    else:
        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)

    if hasattr(self, 'mW'):
        gW = self.mW * gW
    if hasattr(self, 'mb'):
        xp = cuda.get_array_module(*x)
        gW = xp.broadcast_to(
            xp.expand_dims(
                xp.expand_dims(xp.expand_dims(self.mb, 1), 1), 0),
            gW.shape) * gW

    if b is None:
        return gx, gW
    else:
        gy = xp.rollaxis(gy, 1, 4)
        gb = gy.sum(axis=(0, 1, 2))
        if hasattr(self, 'mb'):
            gb = self.mb * gb
        return gx, gW, gb
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    # NOTE: the cuDNN path cannot handle cover_all, so it is taken only
    # when cover_all is False (the flattened source dropped the `not`).
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        gx = cuda.cupy.empty_like(x)

        if _cudnn_version >= 4000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value,
                self.conv_desc.value, self.filter_desc.value,
                _bwd_filter_pref, workspace_size)
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)

            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, self.filter_desc.value, gy_desc.value,
                self.conv_desc.value, x_desc.value, _bwd_data_pref,
                workspace_size)
            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, x_desc.value, gx.data.ptr)
        else:
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW_mat = gW.reshape(out_c, c * kh * kw)
        col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w)
        gy_mats = gy.reshape(n, out_c, out_h * out_w)
        # TODO(beam2d): Use streams or batch gemm
        gW_mat[...] = 0
        for i in moves.range(n):
            gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T)

        W_mat = W.reshape(out_c, -1)
        Wb_mat = _kern()(W_mat)
        gcol = cuda.cupy.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w)
        for i in moves.range(n):
            gcol_mats[i] = cuda.cupy.dot(Wb_mat.T, gy_mats[i])

        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def test_col2im_consistency(self):
    col = conv.im2col_cpu(self.x, 3, 3, 2, 2, 1, 1)
    h, w = self.x.shape[2:]
    im_cpu = conv.col2im_cpu(col, 2, 2, 1, 1, h, w)
    im_gpu = conv.col2im_gpu(cuda.to_gpu(col), 2, 2, 1, 1, h, w)
    gradient_check.assert_allclose(im_cpu, im_gpu.get())
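# Beyond CPU/GPU consistency, a useful extra assertion (a sketch, assuming
# chainer.utils.conv is importable with the signatures used above) is that
# col2im is the adjoint of im2col, so the two inner products below must
# agree for matching kernel/stride/pad parameters:
import numpy
from chainer.utils import conv

x = numpy.random.randn(2, 3, 7, 7).astype(numpy.float32)
col_x = conv.im2col_cpu(x, 3, 3, 2, 2, 1, 1)  # kh=kw=3, sy=sx=2, ph=pw=1
col = numpy.random.randn(*col_x.shape).astype(numpy.float32)
h, w = x.shape[2:]
img = conv.col2im_cpu(col, 2, 2, 1, 1, h, w)
# <im2col(x), col> == <x, col2im(col)> since col2im is the transpose map
assert numpy.allclose((col_x * col).sum(), (x * img).sum(),
                      rtol=1e-4, atol=1e-3)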
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]
    dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

    gW = cuda.cupy.empty_like(W)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        pad_x = cuda.cupy.zeros(
            (n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype)
        pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

        out_h_s1 = h + 2 * self.ph - dkh + 1
        out_w_s1 = w + 2 * self.pw - dkw + 1

        out_sh = out_h + (out_h - 1) * (self.sy - 1)
        out_sw = out_w + (out_w - 1) * (self.sx - 1)

        # Integer division: these values are used as slice indices.
        gy_ph = (h + dkh - out_sh - 1) // 2
        gy_pw = (w + dkw - out_sw - 1) // 2

        pad_gy = cuda.cupy.zeros(
            (n, out_c, h + dkh - 1, w + dkw - 1), dtype=x.dtype)
        pad_gy[:, :, gy_ph:gy_ph + out_sh:self.sy,
               gy_pw:gy_pw + out_sw:self.sx] = gy

        for j in moves.range(kh):
            for i in moves.range(kw):
                xji = cuda.cupy.ascontiguousarray(
                    pad_x[:, :, j * self.dy:j * self.dy + out_h_s1,
                          i * self.dx:i * self.dx + out_w_s1])
                gyji = cuda.cupy.ascontiguousarray(
                    pad_gy[:, :, j * self.dy:j * self.dy + h,
                           i * self.dx:i * self.dx + w])
                Wji = cuda.cupy.ascontiguousarray(
                    W[:, :, -1::-1, -1::-1][:, :, j:j + 1, i:i + 1])

                if i == 0 and j == 0:
                    x = cuda.cupy.ascontiguousarray(x)
                    gy = cuda.cupy.ascontiguousarray(gy)

                    handle = cudnn.get_handle()
                    x_desc = cudnn.create_tensor_descriptor(x)
                    xji_desc = cudnn.create_tensor_descriptor(xji)
                    gy_desc = cudnn.create_tensor_descriptor(gy)
                    gyji_desc = cudnn.create_tensor_descriptor(gyji)
                    conv_desc_data = cudnn.create_convolution_descriptor(
                        (0, 0), (1, 1), xji.dtype)

                    oz_dtype = 'd' if x.dtype == 'd' else 'f'
                    one = numpy.array(1, dtype=oz_dtype).ctypes
                    zero = numpy.array(0, dtype=oz_dtype).ctypes

                    gx = cuda.cupy.zeros_like(x)
                    gWji = cuda.cupy.empty((out_c, c, 1, 1), dtype=W.dtype)

                    if _cudnn_version >= 4000:
                        workspace_size = cuda.get_max_workspace_size()
                        workspace = cuda.cupy.empty(
                            (workspace_size,), dtype='b')

                        algo_filter = (
                            libcudnn.getConvolutionBackwardFilterAlgorithm(
                                handle, xji_desc.value, gy_desc.value,
                                self.conv_desc.value,
                                self.filter_desc.value,
                                _bwd_filter_pref, workspace_size))
                        algo_data = (
                            libcudnn.getConvolutionBackwardDataAlgorithm(
                                handle, self.filter_desc.value,
                                gyji_desc.value, conv_desc_data.value,
                                x_desc.value, _bwd_data_pref,
                                workspace_size))

                if _cudnn_version >= 4000:
                    libcudnn.convolutionBackwardFilter_v3(
                        handle, one.data, xji_desc.value, xji.data.ptr,
                        gy_desc.value, gy.data.ptr, self.conv_desc.value,
                        algo_filter, workspace.data.ptr, workspace_size,
                        zero.data, self.filter_desc.value, gWji.data.ptr)
                    libcudnn.convolutionBackwardData_v3(
                        handle, one.data, self.filter_desc.value,
                        Wji.data.ptr, gyji_desc.value, gyji.data.ptr,
                        conv_desc_data.value, algo_data,
                        workspace.data.ptr, workspace_size,
                        one.data, x_desc.value, gx.data.ptr)
                else:
                    libcudnn.convolutionBackwardFilter_v2(
                        handle, one.data, xji_desc.value, xji.data.ptr,
                        gy_desc.value, gy.data.ptr, self.conv_desc.value,
                        zero.data, self.filter_desc.value, gWji.data.ptr)
                    libcudnn.convolutionBackwardData_v2(
                        handle, one.data, self.filter_desc.value,
                        Wji.data.ptr, gyji_desc.value, gyji.data.ptr,
                        conv_desc_data.value, one.data,
                        x_desc.value, gx.data.ptr)

                gW[:, :, j:j + 1, i:i + 1] = gWji

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                         copy=False)
        gcol = cuda.cupy.rollaxis(gcol, 3)
        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w, dy=self.dy, dx=self.dx)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
        assert self.outh > 0, 'Height in the output should be positive.'
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
        assert self.outw > 0, 'Width in the output should be positive.'
    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        if _cudnn_version >= 4000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, self.filter_desc.value, x_desc.value,
                    self.conv_desc.value, y_desc.value, _bwd_data_pref,
                    workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA

            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, y_desc.value, y.data.ptr)
        else:
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                        copy=False)
        # - k, m, n: shape of out_channel
        # - b: number of inputs
        # - h, w: height and width of kernels
        # k, m, n, b, h, w -> b, k, m, n, h, w
        gcol = cuda.cupy.rollaxis(gcol, 3)
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
def _col2im(x, *args, **kwargs):
    if isinstance(x, numpy.ndarray):
        return col2im_cpu(x, *args, **kwargs)
    return col2im_gpu(x, *args, **kwargs)
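# Usage sketch for the `_col2im` dispatcher (hypothetical shapes; it assumes
# the `(col, sy, sx, ph, pw, h, w, ...)` signature used throughout this
# file):
import numpy

# A NumPy column tensor of shape (n, c, kh, kw, out_h, out_w) dispatches to
# col2im_cpu; a CuPy array would take the col2im_gpu branch instead.
col = numpy.ones((1, 3, 2, 2, 4, 4), dtype=numpy.float32)
img = _col2im(col, 2, 2, 0, 0, 8, 8)
assert img.shape == (1, 3, 8, 8)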
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(x): {1}, type(b): {2}'.format(
                    type(W), type(x), type(b)))
        else:
            raise ValueError(
                'numpy and cupy must not be used together\n'
                'type(W): {0}, type(x): {1}'.format(type(W), type(x)))

    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        gx = cuda.cupy.empty_like(x)

        if _cudnn_version >= 3000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size, ), dtype='b')

            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, x_desc.value, gy_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA

            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)

            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, self.filter_desc.value, gy_desc.value,
                    self.conv_desc.value, x_desc.value, _bwd_data_pref,
                    workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA

            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, x_desc.value, gx.data.ptr)
        else:
            if self.deterministic:
                raise ValueError("'deterministic' option not available "
                                 "for cuDNN versions < v3")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                         copy=False)
        gcol = cuda.cupy.rollaxis(gcol, 3)
        gx = conv.col2im_gpu(gcol, self.sy, self.sx, self.ph, self.pw,
                             h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
        assert self.outh > 0, 'Height in the output should be positive.'
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
        assert self.outw > 0, 'Width in the output should be positive.'
    if (cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw), dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        if _cudnn_version >= 4000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, self.filter_desc.value, x_desc.value,
                    self.conv_desc.value, y_desc.value, _bwd_data_pref,
                    workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA

            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, y_desc.value, y.data.ptr)
        else:
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                x_desc.value, x.data.ptr, self.conv_desc.value,
                zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        W_mat = W.reshape(in_c, c * kh * kw)
        x_mats = x.reshape(n, in_c, in_h * in_w)
        gcol = cuda.cupy.empty(
            (n, c, kh, kw, in_h, in_w), dtype=x.dtype)
        gcol_mats = gcol.reshape(n, c * kh * kw, in_h * in_w)
        for i in moves.range(n):
            gcol_mats[i] = cuda.cupy.dot(W_mat.T, x_mats[i])
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    if self.bcoeffs is not None:
        olen, ilen, hlen, wlen = W.shape
        if self.coeffs is None:
            self.coeffs = numpy.ones(ilen)
        coeffs = numpy.copy(self.bcoeffs)
        coeffs = numpy.expand_dims(coeffs, 1)
        coeffs = numpy.expand_dims(coeffs, 1)
        coeffs = numpy.expand_dims(coeffs, 0)
        coeffs = numpy.broadcast_to(coeffs, W.shape)
        self.mW = cuda.cupy.asarray(coeffs, numpy.float32).reshape(W.shape)
    if self.ocoeffs is not None:
        coeffs = numpy.copy(self.ocoeffs)
        self.mb = cuda.cupy.asarray(coeffs, numpy.float32)

    W = self.M * W
    b = inputs[2] if len(inputs) == 3 else None

    if not type_check.same_types(*inputs):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    gx = None

    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        if _cudnn_version >= 3000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')
            if configuration.config.cudnn_deterministic:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA
            else:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, x_desc.value, gy_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)

            if self.requires_x_grad:
                if configuration.config.cudnn_deterministic:
                    algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA
                else:
                    algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                        handle, self.filter_desc.value, gy_desc.value,
                        self.conv_desc.value, x_desc.value,
                        _bwd_data_pref, workspace_size)
                gx = cuda.cupy.empty_like(x)
                libcudnn.convolutionBackwardData_v3(
                    handle, one.data, self.filter_desc.value, W.data.ptr,
                    gy_desc.value, gy.data.ptr, self.conv_desc.value,
                    algo, workspace.data.ptr, workspace_size,
                    zero.data, x_desc.value, gx.data.ptr)
        else:
            if configuration.config.cudnn_deterministic:
                raise ValueError(
                    "`cudnn_deterministic` option must be False "
                    "if the backpropagation of "
                    "chainer.functions.Convolution2D "
                    "uses cuDNN and cuDNN versions < v3. "
                    "Turn off cudnn_deterministic option with "
                    "`chainer.using_config('cudnn_deterministic', False)` "
                    "context.")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
            if self.requires_x_grad:
                gx = cuda.cupy.empty_like(x)
                libcudnn.convolutionBackwardData_v2(
                    handle, one.data, self.filter_desc.value, W.data.ptr,
                    gy_desc.value, gy.data.ptr, self.conv_desc.value,
                    zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        if self.requires_x_grad:
            gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                             copy=False)
            gcol = cuda.cupy.rollaxis(gcol, 3)
            gx = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if hasattr(self, 'mW'):
        gW = self.mW * gW
    if hasattr(self, 'mb'):
        xp = cuda.get_array_module(*x)
        gW = xp.broadcast_to(
            xp.expand_dims(
                xp.expand_dims(xp.expand_dims(self.mb, 1), 1), 1),
            gW.shape) * gW

    if b is None:
        return gx, gW
    else:
        if hasattr(self, 'mb'):
            gb = self.mb * gb
        return gx, gW, gb
def forward_gpu(self, inputs):
    self.retain_inputs((0, 1))  # only retain x and W
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not all([isinstance(i, cuda.ndarray) for i in inputs]):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph,
                                            d=self.dy)
        assert self.outh > 0, 'Height in the output should be positive.'
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw,
                                            d=self.dx)
        assert self.outw > 0, 'Width in the output should be positive.'
    self._set_cover_all(x, W)

    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == W.dtype and
            ((self.dy == 1 and self.dx == 1) or _cudnn_version >= 6000)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        use_tensor_core = chainer.should_use_cudnn_tensor_core(x.dtype)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw),
                            dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        filter_desc = cudnn.create_filter_descriptor(W)
        conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype,
            dilation=(self.dy, self.dx),
            use_tensor_core=use_tensor_core)
        if b is not None:
            bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
        else:
            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, filter_desc.value, x_desc.value, conv_desc.value,
                y_desc.value, _bwd_data_pref, workspace_size)

        if use_tensor_core:
            # Only CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 supports
            # Tensor-Core in cuDNN7
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1

        libcudnn.convolutionBackwardData_v3(
            handle, one.data, filter_desc.value, W.data.ptr,
            x_desc.value, x.data.ptr, conv_desc.value,
            algo, workspace.data.ptr, workspace_size,
            zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                        copy=False)
        # - k, m, n: shape of out_channel
        # - b: number of inputs
        # - h, w: height and width of kernels
        # k, m, n, b, h, w -> b, k, m, n, h, w
        gcol = cuda.cupy.rollaxis(gcol, 3)
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw,
            dy=self.dy, dx=self.dx)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
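# The assertions above rely on the transposed-convolution output size. A
# hedged one-axis sketch of the relation (my own helper, mirroring what
# conv.get_deconv_outsize computes when cover_all=False; `d` is dilation):
def deconv_outsize(size, k, s, p, d=1):
    # Effective kernel extent grows with dilation: k + (k - 1) * (d - 1).
    dk = k + (k - 1) * (d - 1)
    return s * (size - 1) + dk - 2 * p

# e.g. a 4-pixel input, 3-tap kernel, stride 2, pad 1 -> 7-pixel output.
assert deconv_outsize(4, 3, 2, 1) == 7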
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    gx = None

    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            x.dtype == W.dtype):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
        else:
            algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                handle, x_desc.value, gy_desc.value, self.conv_desc.value,
                self.filter_desc.value, _bwd_filter_pref, workspace_size)

        libcudnn.convolutionBackwardFilter_v3(
            handle, one.data, x_desc.value, x.data.ptr,
            gy_desc.value, gy.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size,
            zero.data, self.filter_desc.value, gW.data.ptr)

        if self.requires_x_grad:
            if configuration.config.cudnn_deterministic:
                algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
            else:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, self.filter_desc.value, gy_desc.value,
                    self.conv_desc.value, x_desc.value,
                    _bwd_data_pref, workspace_size)

            gx = cuda.cupy.empty_like(x)
            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        if self.requires_x_grad:
            gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                             copy=False)
            gcol = cuda.cupy.rollaxis(gcol, 3)
            gx = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
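# In the non-cuDNN branch both gradients are plain tensor contractions
# over the im2col buffer. A NumPy shape walk with toy sizes (assuming
# `col` is laid out (n, c, kh, kw, out_h, out_w) as produced by im2col):
import numpy

n, c, out_c, kh, kw, out_h, out_w = 2, 3, 4, 3, 3, 5, 5
gy = numpy.random.rand(n, out_c, out_h, out_w).astype(numpy.float32)
col = numpy.random.rand(n, c, kh, kw, out_h, out_w).astype(numpy.float32)
W = numpy.random.rand(out_c, c, kh, kw).astype(numpy.float32)

# gW: contract the batch and output-spatial axes -> (out_c, c, kh, kw).
gW = numpy.tensordot(gy, col, ((0, 2, 3), (0, 4, 5)))
assert gW.shape == (out_c, c, kh, kw)

# gcol: contract out_c, then roll the batch axis to the front so that
# col2im can scatter the patches back into an (n, c, h, w) image.
gcol = numpy.rollaxis(numpy.tensordot(W, gy, (0, 1)), 3)
assert gcol.shape == (n, c, kh, kw, out_h, out_w)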
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    if cuda.cudnn_enabled and self.use_cudnn:
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        dtype = x.dtype
        one = numpy.array(1, dtype=dtype).ctypes
        zero = numpy.array(0, dtype=dtype).ctypes

        libcudnn.convolutionBackwardFilter_v2(
            handle, one.data, x_desc.value, x.data.ptr,
            gy_desc.value, gy.data.ptr, self.conv_desc.value,
            zero.data, self.filter_desc.value, gW.data.ptr)

        gx = cuda.cupy.empty_like(x)
        libcudnn.convolutionBackwardData_v2(
            handle, one.data, self.filter_desc.value, W.data.ptr,
            gy_desc.value, gy.data.ptr, self.conv_desc.value,
            zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW_mat = gW.reshape(out_c, c * kh * kw)
        col_mats = self.col.reshape(n, c * kh * kw, out_h * out_w)
        gy_mats = gy.reshape(n, out_c, out_h * out_w)
        # TODO(beam2d): Use streams or batch gemm
        gW_mat[...] = 0
        for i in moves.range(n):
            gW_mat += cuda.cupy.dot(gy_mats[i], col_mats[i].T)

        W_mat = W.reshape(out_c, -1)
        gcol = cuda.cupy.empty_like(self.col)
        gcol_mats = gcol.reshape(n, c * kh * kw, out_h * out_w)
        for i in moves.range(n):
            cuda.cupy.dot(W_mat.T, gy_mats[i], gcol_mats[i])

        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
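# The TODO(beam2d) loop above issues one GEMM per sample. The same
# accumulation can be written as a single batched contraction; a hedged
# NumPy einsum sketch of the equivalence (the real fix would use batched
# GEMM or streams on the GPU, as the TODO suggests):
import numpy

n, out_c, ckk, ohw = 2, 4, 27, 25
gy_mats = numpy.random.rand(n, out_c, ohw).astype(numpy.float32)
col_mats = numpy.random.rand(n, ckk, ohw).astype(numpy.float32)

gW_loop = numpy.zeros((out_c, ckk), dtype=numpy.float32)
for i in range(n):  # per-sample GEMM, as in the loop above
    gW_loop += gy_mats[i].dot(col_mats[i].T)

# One contraction summing over the batch and spatial axes.
gW_batched = numpy.einsum('nko,nco->kc', gy_mats, col_mats)
assert numpy.allclose(gW_loop, gW_batched, atol=1e-4)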
def forward_gpu(self, inputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None

    if not all([isinstance(i, cuda.ndarray) for i in inputs]):
        if b is not None:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}, type(b): {2}'
                             .format(type(W), type(x), type(b)))
        else:
            raise ValueError('numpy and cupy must not be used together\n'
                             'type(W): {0}, type(x): {1}'
                             .format(type(W), type(x)))

    kh, kw = W.shape[2:]
    n, in_c, in_h, in_w = x.shape
    c = W.shape[1]  # out_c
    if self.outh is None:
        self.outh = conv.get_deconv_outsize(in_h, kh, self.sy, self.ph)
        assert self.outh > 0, 'Height in the output should be positive.'
    if self.outw is None:
        self.outw = conv.get_deconv_outsize(in_w, kw, self.sx, self.pw)
        assert self.outw > 0, 'Width in the output should be positive.'

    if chainer.should_use_cudnn('>=auto') and x.dtype == W.dtype:
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        if b is not None:
            b = cuda.cupy.ascontiguousarray(b)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        y = cuda.cupy.empty((n, c, self.outh, self.outw),
                            dtype=x.dtype)
        y_desc = cudnn.create_tensor_descriptor(y)

        self.filter_desc = cudnn.create_filter_descriptor(W)
        self.conv_desc = cudnn.create_convolution_descriptor(
            (self.ph, self.pw), (self.sy, self.sx), x.dtype)
        if b is not None:
            self.bias_desc = cudnn.create_tensor_descriptor(
                b[None, :, None, None])

        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes

        workspace_size = cuda.get_max_workspace_size()
        workspace = cuda.cupy.empty((workspace_size,), dtype='b')

        if configuration.config.cudnn_deterministic:
            algo = libcudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
        else:
            algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                handle, self.filter_desc.value, x_desc.value,
                self.conv_desc.value, y_desc.value, _bwd_data_pref,
                workspace_size)

        libcudnn.convolutionBackwardData_v3(
            handle, one.data, self.filter_desc.value, W.data.ptr,
            x_desc.value, x.data.ptr, self.conv_desc.value,
            algo, workspace.data.ptr, workspace_size,
            zero.data, y_desc.value, y.data.ptr)

        if b is not None:
            cudnn.add_tensor(
                handle, one.data, self.bias_desc.value, b.data.ptr,
                one.data, y_desc.value, y.data.ptr)
    else:
        gcol = cuda.cupy.tensordot(W, x, (0, 1)).astype(x.dtype,
                                                        copy=False)
        # - k, m, n: shape of out_channel
        # - b: number of inputs
        # - h, w: height and width of kernels
        # k, m, n, b, h, w -> b, k, m, n, h, w
        gcol = cuda.cupy.rollaxis(gcol, 3)
        y = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, self.outh, self.outw)
        if b is not None:
            y += b.reshape(1, b.size, 1, 1)
    return y,
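# The fallback branch is the transpose of im2col-based convolution:
# contract over the shared input-channel axis, then let col2im scatter
# the kernel taps into the output plane. A NumPy shape walk (toy sizes;
# for deconvolution W is stored as (in_c, out_c, kh, kw)):
import numpy

n, in_c, out_c, kh, kw, in_h, in_w = 2, 4, 3, 3, 3, 5, 5
x = numpy.random.rand(n, in_c, in_h, in_w).astype(numpy.float32)
W = numpy.random.rand(in_c, out_c, kh, kw).astype(numpy.float32)

# (in_c, out_c, kh, kw) x (n, in_c, in_h, in_w)
#   -> (out_c, kh, kw, n, in_h, in_w); rollaxis brings n to the front.
gcol = numpy.rollaxis(numpy.tensordot(W, x, (0, 1)), 3)
assert gcol.shape == (n, out_c, kh, kw, in_h, in_w)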
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]
    dkh, dkw = kh + (kh - 1) * (self.dy - 1), kw + (kw - 1) * (self.dx - 1)

    gW = cuda.cupy.empty_like(W)
    if (not self.cover_all and chainer.should_use_cudnn('>=auto') and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        pad_x = cuda.cupy.zeros(
            (n, c, h + 2 * self.ph, w + 2 * self.pw), dtype=x.dtype)
        pad_x[:, :, self.ph:self.ph + h, self.pw:self.pw + w] = x

        out_h_s1 = h + 2 * self.ph - dkh + 1
        out_w_s1 = w + 2 * self.pw - dkw + 1

        out_sh = out_h + (out_h - 1) * (self.sy - 1)
        out_sw = out_w + (out_w - 1) * (self.sx - 1)

        gy_ph = (h + dkh - out_sh - 1) // 2
        gy_pw = (w + dkw - out_sw - 1) // 2

        pad_gy = cuda.cupy.zeros(
            (n, out_c, h + dkh - 1, w + dkw - 1), dtype=x.dtype)
        pad_gy[:, :, gy_ph:gy_ph + out_sh:self.sy,
               gy_pw:gy_pw + out_sw:self.sx] = gy

        gx = None

        for j in moves.range(kh):
            for i in moves.range(kw):
                xji = cuda.cupy.ascontiguousarray(
                    pad_x[:, :,
                          j * self.dy:j * self.dy + out_h_s1,
                          i * self.dx:i * self.dx + out_w_s1])
                gyji = cuda.cupy.ascontiguousarray(
                    pad_gy[:, :,
                           j * self.dy:j * self.dy + h,
                           i * self.dx:i * self.dx + w])
                Wji = cuda.cupy.ascontiguousarray(
                    W[:, :, -1::-1, -1::-1][:, :, j:j + 1, i:i + 1])

                if i == 0 and j == 0:
                    x = cuda.cupy.ascontiguousarray(x)
                    gy = cuda.cupy.ascontiguousarray(gy)

                    handle = cudnn.get_handle()
                    x_desc = cudnn.create_tensor_descriptor(x)
                    xji_desc = cudnn.create_tensor_descriptor(xji)
                    gy_desc = cudnn.create_tensor_descriptor(gy)
                    gyji_desc = cudnn.create_tensor_descriptor(gyji)
                    conv_desc_data = cudnn.create_convolution_descriptor(
                        (0, 0), (1, 1), xji.dtype)

                    oz_dtype = 'd' if x.dtype == 'd' else 'f'
                    one = numpy.array(1, dtype=oz_dtype).ctypes
                    zero = numpy.array(0, dtype=oz_dtype).ctypes

                    if self.requires_x_grad:
                        gx = cuda.cupy.zeros_like(x)
                    gWji = cuda.cupy.empty((out_c, c, 1, 1),
                                           dtype=W.dtype)

                    if _cudnn_version >= 4000:
                        workspace_size = cuda.get_max_workspace_size()
                        workspace = cuda.cupy.empty(
                            (workspace_size,), dtype='b')

                        algo_filter = (
                            libcudnn.getConvolutionBackwardFilterAlgorithm(
                                handle, xji_desc.value, gy_desc.value,
                                self.conv_desc.value,
                                self.filter_desc.value,
                                _bwd_filter_pref, workspace_size))
                        algo_data = (
                            libcudnn.getConvolutionBackwardDataAlgorithm(
                                handle, self.filter_desc.value,
                                gyji_desc.value, conv_desc_data.value,
                                x_desc.value, _bwd_data_pref,
                                workspace_size))

                if _cudnn_version >= 4000:
                    libcudnn.convolutionBackwardFilter_v3(
                        handle, one.data, xji_desc.value, xji.data.ptr,
                        gy_desc.value, gy.data.ptr, self.conv_desc.value,
                        algo_filter, workspace.data.ptr, workspace_size,
                        zero.data, self.filter_desc.value, gWji.data.ptr)
                else:
                    libcudnn.convolutionBackwardFilter_v2(
                        handle, one.data, xji_desc.value, xji.data.ptr,
                        gy_desc.value, gy.data.ptr, self.conv_desc.value,
                        zero.data, self.filter_desc.value, gWji.data.ptr)

                if self.requires_x_grad:
                    if _cudnn_version >= 4000:
                        libcudnn.convolutionBackwardData_v3(
                            handle, one.data, self.filter_desc.value,
                            Wji.data.ptr, gyji_desc.value, gyji.data.ptr,
                            conv_desc_data.value, algo_data,
                            workspace.data.ptr, workspace_size,
                            one.data, x_desc.value, gx.data.ptr)
                    else:
                        libcudnn.convolutionBackwardData_v2(
                            handle, one.data, self.filter_desc.value,
                            Wji.data.ptr, gyji_desc.value, gyji.data.ptr,
                            conv_desc_data.value,
                            one.data, x_desc.value, gx.data.ptr)

                gW[:, :, j:j + 1, i:i + 1] = gWji

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        if not self.requires_x_grad:
            gx = None
        else:
            gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                             copy=False)
            gcol = cuda.cupy.rollaxis(gcol, 3)
            gx = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w,
                dy=self.dy, dx=self.dx)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
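# The index arithmetic above hinges on the dilated kernel extent
# dk = k + (k - 1) * (d - 1): tap j lands at row j * dy of the padded
# input, and gy is zero-upsampled by the stride before correlation. A
# small standalone sketch of those extents (my own helper, for
# illustration only):
def dilated_extent(k, d):
    # Spatial span covered by a k-tap kernel with dilation d.
    return k + (k - 1) * (d - 1)

# A 3-tap kernel with dilation 2 spans 5 pixels: taps at offsets 0, 2, 4.
assert dilated_extent(3, 2) == 5

# The stride-upsampled gy has the analogous extent used for out_sh above.
out_h, sy = 4, 2
assert out_h + (out_h - 1) * (sy - 1) == 7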
def backward_gpu(self, inputs, grad_outputs):
    x, W = inputs[:2]
    b = inputs[2] if len(inputs) == 3 else None
    gy = grad_outputs[0]
    _, out_c, out_h, out_w = gy.shape
    n, c, h, w = x.shape
    kh, kw = W.shape[2:]

    gW = cuda.cupy.empty_like(W)
    if (not self.cover_all and cuda.cudnn_enabled and self.use_cudnn and
            _check_cudnn_acceptable_type(x.dtype, W.dtype)):
        x = cuda.cupy.ascontiguousarray(x)
        W = cuda.cupy.ascontiguousarray(W)
        gy = cuda.cupy.ascontiguousarray(gy)

        handle = cudnn.get_handle()
        x_desc = cudnn.create_tensor_descriptor(x)
        gy_desc = cudnn.create_tensor_descriptor(gy)
        oz_dtype = 'd' if x.dtype == 'd' else 'f'
        one = numpy.array(1, dtype=oz_dtype).ctypes
        zero = numpy.array(0, dtype=oz_dtype).ctypes
        gx = cuda.cupy.empty_like(x)

        if _cudnn_version >= 4000:
            workspace_size = cuda.get_max_workspace_size()
            workspace = cuda.cupy.empty((workspace_size,), dtype='b')

            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardFilterAlgorithm(
                    handle, x_desc.value, gy_desc.value,
                    self.conv_desc.value, self.filter_desc.value,
                    _bwd_filter_pref, workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1  # NOQA

            libcudnn.convolutionBackwardFilter_v3(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, self.filter_desc.value, gW.data.ptr)

            if not self.deterministic:
                algo = libcudnn.getConvolutionBackwardDataAlgorithm(
                    handle, self.filter_desc.value, gy_desc.value,
                    self.conv_desc.value, x_desc.value,
                    _bwd_data_pref, workspace_size)
            else:
                algo = cuda.cupy.cuda.cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1  # NOQA

            libcudnn.convolutionBackwardData_v3(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                algo, workspace.data.ptr, workspace_size,
                zero.data, x_desc.value, gx.data.ptr)
        else:
            if self.deterministic:
                raise ValueError("'deterministic' option not available "
                                 "for cuDNN versions < v4")
            libcudnn.convolutionBackwardFilter_v2(
                handle, one.data, x_desc.value, x.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, self.filter_desc.value, gW.data.ptr)
            libcudnn.convolutionBackwardData_v2(
                handle, one.data, self.filter_desc.value, W.data.ptr,
                gy_desc.value, gy.data.ptr, self.conv_desc.value,
                zero.data, x_desc.value, gx.data.ptr)

        if b is not None:
            gb = cuda.cupy.empty_like(b)
            libcudnn.convolutionBackwardBias(
                handle, one.data, gy_desc.value, gy.data.ptr,
                zero.data, self.bias_desc.value, gb.data.ptr)
    else:
        gW = cuda.cupy.tensordot(
            gy, self.col, ((0, 2, 3), (0, 4, 5))).astype(W.dtype,
                                                         copy=False)
        gcol = cuda.cupy.tensordot(W, gy, (0, 1)).astype(x.dtype,
                                                         copy=False)
        gcol = cuda.cupy.rollaxis(gcol, 3)
        gx = conv.col2im_gpu(
            gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        if b is not None:
            gb = gy.sum(axis=(0, 2, 3))

    if b is None:
        return gx, gW
    else:
        return gx, gW, gb
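# Both branches reduce gy over the batch and spatial axes to get the bias
# gradient, since the bias is added at every output position. A one-line
# NumPy check of that identity (toy data):
import numpy

gy = numpy.random.rand(2, 3, 4, 4).astype(numpy.float32)
gb = gy.sum(axis=(0, 2, 3))
assert gb.shape == (3,)  # one gradient value per output channel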