def backward_gpu(self, x, gy):
        """Run the backward pass on the GPU.

        Updates the parameter gradients ``self.gW`` (and ``self.gb`` when a
        bias is present) and computes the gradient with respect to ``x[0]``.
        The cuDNN routines are used when ``cudnn.enabled`` and
        ``self.use_cudnn`` are both true; otherwise the work is done with
        per-sample matrix products followed by ``conv.col2im_gpu``.

        Args:
            x: Sequence whose first item is the 4-d input array.
            gy: Sequence whose first item is the 4-d output gradient array.

        Returns:
            A one-element tuple holding the gradient with respect to
            ``x[0]``.
        """
        grad_out = gy[0]
        out_c, out_h, out_w = grad_out.shape[1:]
        inputs = x[0]
        batch, in_c, in_h, in_w = inputs.shape

        if cudnn.enabled and self.use_cudnn:
            cudnn_handle = cudnn.get_default_handle()
            in_desc = cudnn.get_tensor_desc(inputs, in_h, in_w)
            grad_out_desc = cudnn.get_tensor_desc(grad_out, out_h, out_w)

            # NOTE(review): in the bias/filter calls the scalar 1 placed
            # just before the output descriptor is presumably cuDNN's
            # "beta", i.e. the result is added to the existing gradient;
            # the data call passes 0 there -- confirm against the binding.
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    cudnn_handle, 1, grad_out_desc.value,
                    cudnn.get_ptr(grad_out),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))

            libcudnn.cudnnConvolutionBackwardFilter(
                cudnn_handle, 1, in_desc.value, cudnn.get_ptr(inputs),
                grad_out_desc.value, cudnn.get_ptr(grad_out),
                self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))

            gx = cuda.empty_like(inputs)
            libcudnn.cudnnConvolutionBackwardData(
                cudnn_handle, 1, self.filter_desc.value,
                cudnn.get_ptr(self.W),
                grad_out_desc.value, cudnn.get_ptr(grad_out),
                self.conv_desc.value,
                0, in_desc.value, cudnn.get_ptr(gx))
        else:
            cublas_handle = cuda.get_cublas_handle()
            # Sizes shared by every reshape below.
            patch_size = in_c * self.kh * self.kw
            out_size = out_h * out_w

            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(cublas_handle):
                    # Reduce over the spatial positions first, then over
                    # the batch, leaving one value per output channel.
                    per_map = cuda.cumisc.sum(
                        grad_out.reshape(batch * out_c, out_size), axis=1)
                    per_channel = cuda.cumisc.sum(
                        per_map.reshape(batch, out_c), axis=0)
                    self.gb += per_channel

            # TODO(beam2d): Use streams
            grad_out_mats = grad_out.reshape(batch, out_c, out_size)
            # NOTE(review): self.col is presumably the column buffer kept
            # from the forward pass -- confirm where it is assigned.
            column_mats = self.col.reshape(batch, patch_size, out_size)
            weight_grad_mat = self.gW.reshape(out_c, patch_size)
            for idx in moves.range(batch):
                # Filter gradient, one sample at a time; add_dot presumably
                # accumulates the product into weight_grad_mat.
                cuda.culinalg.add_dot(
                    grad_out_mats[idx], column_mats[idx], weight_grad_mat,
                    transb='T', handle=cublas_handle)

            weight_mat = self.W.reshape(out_c, patch_size)
            grad_columns = cuda.empty_like(self.col)
            grad_column_mats = grad_columns.reshape(
                batch, patch_size, out_size)
            for idx in moves.range(batch):
                # Column-space gradient: transposed weights times the
                # output gradient of this sample.
                cuda.culinalg.dot(
                    weight_mat, grad_out_mats[idx], transa='T',
                    handle=cublas_handle, out=grad_column_mats[idx])

            # Fold the column-space gradient back to the input layout.
            gx = conv.col2im_gpu(
                grad_columns, self.sy, self.sx, self.ph, self.pw, in_h, in_w)

        return gx,
    def backward_gpu(self, x, gy):
        """Run the backward pass on the GPU.

        The gradient with respect to ``x[0]`` is obtained by running a
        forward convolution of ``gy[0]`` with ``self.W``.  The parameter
        gradients ``self.gW`` (and ``self.gb`` when a bias is present) are
        updated as well.  The cuDNN routines are used when ``cudnn.enabled``
        and ``self.use_cudnn`` are both true; otherwise ``conv.im2col_gpu``
        and per-sample matrix products are used.

        Args:
            x: Sequence whose first item is the 4-d input array.
            gy: Sequence whose first item is the 4-d output gradient array.

        Returns:
            A one-element tuple holding the float32 gradient with respect
            to ``x[0]``; it has the same shape as ``x[0]``.
        """
        # Naming follows the convolution being reused here: the input of
        # this function plays the role of a convolution *output*.
        n, out_c, out_h, out_w = x[0].shape
        c, h, w = gy[0].shape[1:]
        gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
            gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

            algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, algo).value
            # workspace_size is a byte count while the buffer is allocated
            # in float32 items.  Round up so the buffer is never smaller
            # than the size handed to cudnnConvolutionForward below (floor
            # division could leave it up to 3 bytes short).
            itemsize = numpy.dtype(numpy.float32).itemsize
            workspace_items = (workspace_size + itemsize - 1) // itemsize
            workspace = cuda.empty(
                (max(workspace_items, 1),), dtype=numpy.float32)

            libcudnn.cudnnConvolutionForward(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                self.filter_desc.value, cudnn.get_ptr(self.W),
                self.conv_desc.value, algo, cudnn.get_ptr(
                    workspace), workspace_size,
                0, gx_desc.value, cudnn.get_ptr(gx))
            # bias backward
            # NOTE(review): the scalar 1 placed just before the output
            # descriptor is presumably cuDNN's "beta", i.e. the result is
            # added to the existing gradient -- confirm against the binding.
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))
            # filter backward
            # gx_desc is reused to describe x[0]: gx was allocated with
            # exactly the shape of x[0].
            libcudnn.cudnnConvolutionBackwardFilter(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            handle = cuda.get_cublas_handle()
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            gx_mats = gx.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                # Input gradient of sample i, written into gx through the
                # reshaped gx_mats.
                cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                                  out=gx_mats[i])
            # bias backward
            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(handle):
                    # Reduce over the spatial positions first, then over
                    # the batch, leaving one value per channel of gy[0].
                    tmp = cuda.cumisc.sum(
                        gy[0].reshape(n * c, h * w), axis=1)
                    tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                    self.gb += tmp
            # filter backward
            # TODO(beam2d): Use streams
            gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
            x_mats = x[0].reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                # add_dot presumably accumulates the product into gW_mat.
                cuda.culinalg.add_dot(
                    x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)
        return gx,