def _forward_xp_core(self, x, gy, xp):
    """Compute the filter weight gradient with ``xp.tensordot``.

    Args:
        x: Input array that is unfolded into patches by im2col.
        gy: Gradient array contracted against the patch array.
        xp: Array module; ``numpy`` selects the CPU im2col, anything else
            selects the GPU im2col.

    Returns:
        tuple: One-element tuple holding ``gW`` cast to ``self.W_dtype``.
    """
    ndim = self.ndim

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if xp is numpy and 1 in gy.shape:
        flags = gy.flags
        if not (flags.c_contiguous or flags.f_contiguous):
            gy = numpy.ascontiguousarray(gy)

    # Pick the im2col implementation that matches the array module.
    im2col = (conv_nd.im2col_nd_cpu if xp is numpy
              else conv_nd.im2col_nd_gpu)
    patches = im2col(
        x, self.ksize, self.stride, self.pad,
        cover_all=self.cover_all, dilate=self.dilate)

    # gy axes     : (n, _, out_1, out_2, ..., out_N)
    gy_axes = (0,) + tuple(moves.range(2, ndim + 2))
    # patch axes  : (n, _, _, ..., _, out_1, out_2, ..., out_N)
    patch_axes = (0,) + tuple(moves.range(ndim + 2, ndim * 2 + 2))

    grad_w = xp.tensordot(gy, patches, (gy_axes, patch_axes))
    return grad_w.astype(self.W_dtype, copy=False),
def _forward_xp(self, x, W, b, xp):
    """Compute the forward correlation via im2col and ``xp.tensordot``.

    The patch array is stored on ``self.col`` as a side effect.

    Args:
        x: Input array.
        W: Filter array; the kernel size is taken from ``W.shape[2:]``.
        b: Bias added in place to the result, or ``None`` to skip it.
        xp: Array module; ``numpy`` selects the CPU im2col, anything else
            selects the GPU im2col.

    Returns:
        tuple: One-element tuple holding the output with the last axis
        rolled to position 1.
    """
    ndim = self.ndim
    ksize = W.shape[2:]
    stride = self.stride
    pad = self.pad

    # Make patch array.
    if xp is numpy:
        self.col = conv_nd.im2col_nd_cpu(
            x, ksize, stride, pad, cover_all=self.cover_all)
    else:
        self.col = conv_nd.im2col_nd_gpu(
            x, ksize, stride, pad, cover_all=self.cover_all)

    # Compute correlation.
    axes = tuple(moves.range(1, ndim + 2))  # (1, 2, ..., N+1)
    # ``tensordot`` already returns a freshly allocated array, so
    # ``copy=False`` avoids a second full-size copy when the dtype already
    # matches while keeping the in-place bias addition below safe.
    y = xp.tensordot(self.col, W, (axes, axes)).astype(x.dtype, copy=False)

    # Apply bias if given.
    if b is not None:
        y += b

    # Roll c_O before the second in (n, y_1, y_2, ..., y_N, c_O).
    return xp.rollaxis(y, ndim + 1, 1),
def _backward_xp(self, x, W, b, gy, xp):
    """Compute input, filter and (optionally) bias gradients.

    Args:
        x: Input array of the forward pass.
        W: Filter array; the kernel size is taken from ``W.shape[2:]``.
        b: Bias array, or ``None`` when no bias gradient is wanted.
        gy: Upstream gradient, unfolded into patches by im2col.
        xp: Array module; ``numpy`` selects the CPU im2col, anything else
            selects the GPU im2col.

    Returns:
        tuple: ``(gx, gW)`` when ``b`` is ``None``, else ``(gx, gW, gb)``.
    """
    ndim = self.ndim
    kernel_shape = W.shape[2:]

    im2col = (conv_nd.im2col_nd_cpu if xp is numpy
              else conv_nd.im2col_nd_gpu)
    col = im2col(gy, kernel_shape, self.stride, self.pad)

    # x  : n, C_I, d_1, d_2, ..., d_N
    # col: n, C_I, k_1, k_2, ..., k_N, d_1, d_2, ..., d_N
    # Axis 0 plus axes 2..N+1; also reused for the bias reduction below.
    batch_and_spatial = (0,) + tuple(six.moves.range(2, ndim + 2))
    col_tail_axes = (0,) + tuple(six.moves.range(ndim + 2, ndim * 2 + 2))
    gW = xp.tensordot(x, col, (batch_and_spatial, col_tail_axes))
    gW = gW.astype(W.dtype, copy=False)

    # col: n, C_I, k_1, k_2, ..., k_N, d_1, d_2, ..., d_N
    # W  : C_I, C_O, k_1, k_2, ..., k_N
    # Contract axes 1..N+1 of both operands.
    contract = tuple(six.moves.range(1, ndim + 2))
    gx = xp.tensordot(col, W, (contract, contract))
    gx = xp.rollaxis(gx.astype(x.dtype, copy=False), ndim + 1, 1)

    if b is None:
        return gx, gW

    gb = gy.sum(axis=batch_and_spatial)
    return gx, gW, gb
def _backward_xp(self, x, W, b, gy, xp):
    """Return gradients w.r.t. the input, the filter and, if used, the bias.

    Args:
        x: Input array of the forward pass.
        W: Filter array; ``W.shape[2:]`` is used as the kernel size.
        b: Bias array or ``None``.
        gy: Upstream gradient; im2col is applied to it.
        xp: Array module (``numpy`` -> CPU im2col, otherwise GPU im2col).

    Returns:
        tuple: ``(gx, gW, gb)`` if a bias is given, otherwise ``(gx, gW)``.
    """
    ndim = self.ndim
    rng = six.moves.range

    if xp is numpy:
        unfold = conv_nd.im2col_nd_cpu
    else:
        unfold = conv_nd.im2col_nd_gpu
    col = unfold(gy, W.shape[2:], self.stride, self.pad)

    # x  : n, C_I, d_1, d_2, ..., d_N
    # col: n, C_I, k_1, k_2, ..., k_N, d_1, d_2, ..., d_N
    lead_axes = (0,) + tuple(rng(2, ndim + 2))
    tail_axes = (0,) + tuple(rng(ndim + 2, ndim * 2 + 2))
    gW = xp.tensordot(x, col, (lead_axes, tail_axes)).astype(
        W.dtype, copy=False)

    # col: n, C_I, k_1, k_2, ..., k_N, d_1, d_2, ..., d_N
    # W  : C_I, C_O, k_1, k_2, ..., k_N
    inner_axes = tuple(rng(1, ndim + 2))  # (1, 2, ..., N+1)
    gx_raw = xp.tensordot(col, W, (inner_axes, inner_axes))
    gx = xp.rollaxis(gx_raw.astype(x.dtype, copy=False), ndim + 1, 1)

    grads = (gx, gW)
    if b is not None:
        # The bias gradient reduces gy over axis 0 and axes 2..N+1.
        grads += (gy.sum(axis=lead_axes),)
    return grads
def _forward_xp_core(self, x, W, b, xp):
    """Compute the forward correlation via im2col and ``xp.tensordot``.

    Args:
        x: Input array.
        W: Filter array; ``W.shape[2:]`` is used as the kernel size.
        b: Bias added in place to the result, or ``None``.
        xp: Array module (``numpy`` -> CPU im2col, otherwise GPU im2col).

    Returns:
        tuple: One-element tuple holding the output with its last axis
        rolled to position 1.
    """
    ndim = self.ndim

    # Make patch array with the im2col matching the array module.
    im2col = (conv_nd.im2col_nd_cpu if xp is numpy
              else conv_nd.im2col_nd_gpu)
    patches = im2col(
        x, W.shape[2:], self.stride, self.pad,
        cover_all=self.cover_all, dilate=self.dilate)

    # Compute correlation over axes (1, 2, ..., N+1) of both operands.
    contract = tuple(moves.range(1, ndim + 2))
    out = xp.tensordot(patches, W, (contract, contract))
    out = out.astype(x.dtype, copy=False)

    # Apply bias if given.
    if b is not None:
        out += b

    # Roll c_O before the second in (n, y_1, y_2, ..., y_N, c_O).
    return xp.rollaxis(out, ndim + 1, 1),
def _forward_xp_core(self, x, gy, xp):
    """Return the filter weight gradient as a one-element tuple.

    Args:
        x: Input array; im2col is applied to it.
        gy: Gradient array contracted with the patch array.
        xp: Array module (``numpy`` -> CPU im2col, otherwise GPU im2col).

    Returns:
        tuple: ``(gW,)`` with ``gW`` cast to ``self.W_dtype``.
    """
    n_dims = self.ndim
    on_cpu = xp is numpy

    # NumPy raises an error when the array is not contiguous.
    # See: https://github.com/chainer/chainer/issues/2744
    # TODO(niboshi): Remove this code when NumPy is fixed.
    if (on_cpu and not gy.flags.c_contiguous
            and not gy.flags.f_contiguous and 1 in gy.shape):
        gy = numpy.ascontiguousarray(gy)

    im2col_options = dict(cover_all=self.cover_all, dilate=self.dilate)
    if on_cpu:
        col = conv_nd.im2col_nd_cpu(
            x, self.ksize, self.stride, self.pad, **im2col_options)
    else:
        col = conv_nd.im2col_nd_gpu(
            x, self.ksize, self.stride, self.pad, **im2col_options)

    # (n, _, out_1, out_2, ..., out_N)
    out_axes = (0,) + tuple(moves.range(2, n_dims + 2))
    # (n, _, _, ..., _, out_1, out_2, ..., out_N)
    col_axes = (0,) + tuple(moves.range(n_dims + 2, n_dims * 2 + 2))

    contracted = xp.tensordot(gy, col, (out_axes, col_axes))
    return contracted.astype(self.W_dtype, copy=False),
def test_im2col_nd_gpu_parameter_ranks(self):
    """``im2col_nd_gpu`` must assert on ksize/stride/pad of wrong length."""
    img_gpu = cuda.to_gpu(self.img)

    # Each row replaces exactly one parameter with a one-element tuple.
    invalid_calls = (
        ((2,), self.stride, self.pad),       # Invalid ksize length.
        (self.ksize, (1,), self.pad),        # Invalid stride length.
        (self.ksize, self.stride, (0,)),     # Invalid pad length.
    )
    for ksize, stride, pad in invalid_calls:
        with self.assertRaises(AssertionError):
            conv_nd.im2col_nd_gpu(img_gpu, ksize, stride, pad)
def forward_gpu(self, inputs):
    """Run the forward pass on the GPU.

    Stores ``self.col``, ``self.U`` and ``self.C`` as side effects.

    Args:
        inputs: Sequence of four or five arrays
            ``(X, W, B, initial_ct[, mask_x])``. ``X.shape`` is unpacked as
            ``(batchsize, feature_dimension, seq_length)``. When a fifth
            entry is present it is multiplied into ``X`` (broadcast over
            the last axis).

    Returns:
        tuple: ``(H, C, C[..., -1])`` where ``H`` and ``C`` are the buffers
        filled by the ``"forward"`` kernel, both shaped like ``X``.
    """
    X, W, B, initial_ct = _as_contiguous(inputs[:4])
    dtype = X.dtype
    xp = cuda.get_array_module(W)
    batchsize, feature_dimension, seq_length = X.shape

    mask_x = inputs[4] if len(inputs) == 5 else None
    if mask_x is not None:
        # Multiply out of place: ``X`` may be the very array the caller
        # passed in, and an in-place ``*=`` would silently overwrite it
        # (and make any later re-masking apply the mask twice).
        # The cast and ``_as_contiguous`` keep the dtype and memory layout
        # that the raw-pointer kernel below relies on.
        X = _as_contiguous(
            (X * mask_x[..., None]).astype(dtype, copy=False))

    self.col = conv_nd.im2col_nd_gpu(
        X, (1,), (1,), (0,), cover_all=False)
    # U = xp.matmul(W, X)
    projected = xp.tensordot(self.col, W[..., None], ((1, 2), (1, 2)))
    self.U = _as_contiguous(
        projected.astype(dtype, copy=False).transpose((0, 2, 1)))

    total_columns = feature_dimension * batchsize
    thread_per_block = min(512, total_columns)
    # Integer ceiling division: exact, always an ``int``, and independent
    # of how ``/`` behaves for integers.
    num_block = (total_columns + thread_per_block - 1) // thread_per_block
    assert thread_per_block * num_block >= total_columns

    out_shape = (batchsize, feature_dimension, seq_length)
    H = xp.empty(out_shape, dtype=dtype)
    self.C = xp.empty(out_shape, dtype=dtype)

    # The kernel receives raw device pointers, hence the contiguity
    # handling above.
    self._cuda_elementwise(
        "forward",
        args=[
            X.data.ptr, self.U.data.ptr, B.data.ptr, initial_ct.data.ptr,
            self.C.data.ptr, H.data.ptr,
            batchsize, feature_dimension, seq_length, self.use_tanh
        ],
        block=(thread_per_block, 1, 1),
        grid=(num_block, 1, 1))

    return H, self.C, self.C[..., -1]
def test_im2col_consistency(self):
    """CPU and GPU im2col must produce exactly equal arrays."""
    params = (self.ksize, self.stride, self.pad)
    expected = conv_nd.im2col_nd_cpu(self.x, *params)
    actual = conv_nd.im2col_nd_gpu(cuda.to_gpu(self.x), *params)
    # Zero tolerances: the results have to match bit for bit.
    testing.assert_allclose(expected, actual.get(), atol=0, rtol=0)
def test_im2col_consistency(self):
    """Compare ``im2col_nd_cpu`` and ``im2col_nd_gpu`` on the same input."""
    ksize, stride, pad = self.ksize, self.stride, self.pad

    host_input = self.x
    col_cpu = conv_nd.im2col_nd_cpu(host_input, ksize, stride, pad)

    device_input = cuda.to_gpu(self.x)
    col_gpu = conv_nd.im2col_nd_gpu(device_input, ksize, stride, pad)

    # Exact equality is required (no absolute or relative tolerance).
    testing.assert_allclose(col_cpu, col_gpu.get(), atol=0, rtol=0)
def backward_gpu(self, inputs, grad_outputs):
    """Run the backward pass on the GPU.

    Reads ``self.U``, ``self.C`` and ``self.col``, so they must have been
    stored on the instance before this is called.

    Args:
        inputs: Sequence of four or five arrays
            ``(X, W, B, initial_ct[, mask_x])``. ``X.shape`` is unpacked as
            ``(batchsize, feature_dimension, seq_length)``.
        grad_outputs: Upstream gradients. Only entries ``0`` and ``2`` are
            read; a ``None`` entry is replaced by zeros. Entry ``1`` is not
            used by this method.

    Returns:
        tuple: ``(grad_x, grad_w, grad_b, grad_initial_ct)``, with a
        trailing ``None`` (no gradient for the mask) when five inputs were
        given.
    """
    X, W, B, initial_ct = _as_contiguous(inputs[:4])
    dtype = X.dtype
    xp = cuda.get_array_module(W)
    batchsize, feature_dimension, seq_length = X.shape

    mask_x = inputs[4] if len(inputs) == 5 else None
    if mask_x is not None:
        # Multiply out of place so the caller's input array is never
        # overwritten; the cast and ``_as_contiguous`` keep the dtype and
        # layout required by the raw-pointer kernel below.
        X = _as_contiguous(
            (X * mask_x[..., None]).astype(dtype, copy=False))

    total_columns = feature_dimension * batchsize
    thread_per_block = min(512, total_columns)
    # Integer ceiling division: covers every column without launching a
    # surplus block when ``total_columns`` divides evenly (which is always
    # the case when ``total_columns <= 512``).
    num_block = (total_columns + thread_per_block - 1) // thread_per_block

    # Zero-initialised buffers written by the kernel. ``grad_x`` and
    # ``grad_w`` are computed from these afterwards, so no buffers are
    # pre-allocated for them.
    grad_highway_x = xp.zeros_like(X)
    grad_b = xp.zeros(
        (batchsize, feature_dimension * 2, seq_length), dtype=dtype)
    grad_u = xp.zeros(
        (batchsize, feature_dimension * 3, seq_length), dtype=dtype)
    grad_initial_ct = xp.zeros_like(initial_ct)

    # Missing upstream gradients are treated as zeros.
    if grad_outputs[2] is None:
        incoming_grad_ct = xp.zeros_like(initial_ct)
    else:
        incoming_grad_ct = _as_contiguous(grad_outputs[2])
    if grad_outputs[0] is None:
        incoming_grad_h = xp.zeros_like(X)
    else:
        incoming_grad_h = _as_contiguous(grad_outputs[0])

    self._cuda_elementwise(
        "backward",
        args=[
            X.data.ptr, self.U.data.ptr, B.data.ptr, self.C.data.ptr,
            initial_ct.data.ptr, incoming_grad_h.data.ptr,
            incoming_grad_ct.data.ptr, W.data.ptr,
            grad_highway_x.data.ptr, grad_u.data.ptr, grad_b.data.ptr,
            grad_initial_ct.data.ptr,
            batchsize, feature_dimension, seq_length, self.use_tanh
        ],
        block=(thread_per_block, 1, 1),
        grid=(num_block, 1, 1))

    # Gradient w.r.t. X: grad_u contracted with W.T, plus the term the
    # kernel wrote to ``grad_highway_x``.
    col = conv_nd.im2col_nd_gpu(
        grad_u, (1,), (1,), (0,), cover_all=False)
    grad_x = xp.tensordot(col, W.T[..., None], ((1, 2), (1, 2)))
    grad_x = (grad_x.astype(dtype, copy=False).transpose((0, 2, 1))
              + grad_highway_x)
    if mask_x is not None:
        # ``grad_x`` is a fresh array here, so in-place masking is safe.
        grad_x *= mask_x[..., None]

    grad_b = xp.sum(grad_b, axis=(0, 2))

    grad_w = xp.tensordot(grad_u, self.col, ((0, 2), (0, 3)))
    grad_w = grad_w.astype(dtype, copy=False).reshape(
        (feature_dimension * 3, feature_dimension))

    if len(inputs) == 5:
        return grad_x, grad_w, grad_b, grad_initial_ct, None
    return grad_x, grad_w, grad_b, grad_initial_ct