Example 1
    def forward_gpu(self, x):
        i_len, j_len = array.as_mat(x[0]).shape
        k_len = array.as_mat(x[1]).shape[1]
        l_len = self.W.shape[2]

        # When indices are enclosed in [], they are flattened
        # (i.e. linearized into a 1-D array)
        # ij->[ij]
        e1 = array.as_vec(x[0])
        # ik->[ik]
        e2 = array.as_vec(x[1])
        e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
        # '[ij],[ik]->[ijk]'
        cuda.elementwise(
            'float* y, float* e1, float* e2, int e1c, int e2c',
            '''
            int I = i / e1c / e2c;
            int J = (i - I * e1c * e2c) / e2c;
            int K = i % e2c;
            y[i] = e1[I * e1c + J] * e2[I * e2c + K];
            ''',
            'row_wise_outer_product')(
                e1e2, e1, e2, j_len, k_len)

        # [ijk]->i[jk]
        e1e2 = e1e2.reshape(i_len, j_len * k_len)

        # jkl->[jk]l
        W_mat = self.W.reshape(
            self.W.shape[0] * self.W.shape[1], self.W.shape[2])

        y = cuda.empty((i_len, l_len), dtype=numpy.float32)
        with cuda.using_cumisc():
            # 'i[jk],[jk]l->il'
            cuda.culinalg.dot(e1e2, W_mat, out=y)

        if not self.nobias:
            e1 = array.as_mat(x[0])
            e2 = array.as_mat(x[1])
            with cuda.using_cumisc():
                # ij,jl->il
                cuda.culinalg.add_dot(e1, self.V1, y)
                # ik,kl->il
                cuda.culinalg.add_dot(e2, self.V2, y)
            cuda.elementwise(
                'float* y, float* b, int n_channel',
                'y[i] += b[i % n_channel]',
                'linear_bias')(y, self.b, self.b.size)
        return y,
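For reference, a minimal NumPy sketch of what this forward pass computes, assuming the shapes implied by the index comments (e1 is (i, j), e2 is (i, k), W is (j, k, l)) and treating V1, V2 and b as the optional bias parameters:

import numpy as np

def bilinear_forward_ref(e1, e2, W, V1=None, V2=None, b=None):
    # '[ij],[ik]->[ijk]' followed by 'i[jk],[jk]l->il' collapses into one einsum
    y = np.einsum('ij,ik,jkl->il', e1, e2, W)
    if V1 is not None:
        y += e1.dot(V1)  # ij,jl->il
        y += e2.dot(V2)  # ik,kl->il
        y += b           # broadcast over rows, like the 'linear_bias' kernel
    return y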
Example 2
 def backward_gpu(self, x, gy):
     glhs = None
     if self.lhs_bwd:
         with cuda.using_cumisc():
             s = x[0].shape
             glhs = cuda.cumisc.sum(gy[0], 1).reshape(s) / s[1]
     return glhs, -gy[0]
Example 3
    def forward_gpu(self, x):
        i_len, j_len = array.as_mat(x[0]).shape
        k_len = array.as_mat(x[1]).shape[1]
        l_len = self.W.shape[2]

        # When indices are enclosed in [], they are flattened
        # (i.e. linearized into a 1-D array)
        # ij->[ij]
        e1 = array.as_vec(x[0])
        # ik->[ik]
        e2 = array.as_vec(x[1])
        e1e2 = cuda.empty(i_len * j_len * k_len, dtype=numpy.float32)
        # '[ij],[ik]->[ijk]'
        cuda.elementwise(
            'float* y, float* e1, float* e2, int e1c, int e2c', '''
            int I = i / e1c / e2c;
            int J = (i - I * e1c * e2c) / e2c;
            int K = i % e2c;
            y[i] = e1[I * e1c + J] * e2[I * e2c + K];
            ''', 'row_wise_outer_product')(e1e2, e1, e2, j_len, k_len)

        # [ijk]->i[jk]
        e1e2 = e1e2.reshape(i_len, j_len * k_len)

        # jkl->[jk]l
        W_mat = self.W.reshape(self.W.shape[0] * self.W.shape[1],
                               self.W.shape[2])

        y = cuda.empty((i_len, l_len), dtype=numpy.float32)
        with cuda.using_cumisc():
            # 'i[jk],[jk]l->il'
            cuda.culinalg.dot(e1e2, W_mat, out=y)

        if not self.nobias:
            e1 = array.as_mat(x[0])
            e2 = array.as_mat(x[1])
            with cuda.using_cumisc():
                # ij,jl->il
                cuda.culinalg.add_dot(e1, self.V1, y)
                # ik,kl->il
                cuda.culinalg.add_dot(e2, self.V2, y)
            cuda.elementwise('float* y, float* b, int n_channel',
                             'y[i] += b[i % n_channel]',
                             'linear_bias')(y, self.b, self.b.size)
        return y,
Example 4
 def backward_gpu(self, x, gy):
     _x = _as_mat(x[0])
     gx = cuda.empty_like(_x)
     with cuda.using_cumisc():
         cuda.culinalg.add_dot(gy[0], _x, self.gW, transa='T')
         if self.gb is not None:
             self.gb += cuda.cumisc.sum(gy[0], 0)
         cuda.culinalg.dot(gy[0], self.W, out=gx)
     return gx.reshape(x[0].shape),
Example 5
 def forward_gpu(self, x):
     x = _as_mat(x[0])
     y = cuda.empty((x.shape[0], self.W.shape[0]), dtype=x.dtype)
     with cuda.using_cumisc():
         cuda.culinalg.dot(x, self.W, transb='T', out=y)
     if self.b is not None:
         cuda.elementwise('float* y, float* b, int n_channel',
                          'y[i] += b[i % n_channel]',
                          'linear_bias')(y, self.b, self.b.size)
     return y,
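A minimal NumPy sketch of this linear forward pass, assuming W has shape (out, in) as implied by transb='T' and b has shape (out,):

import numpy as np

def linear_forward_ref(x, W, b=None):
    y = x.dot(W.T)  # dot with transb='T'
    if b is not None:
        y += b      # the 'linear_bias' kernel adds b per output channel
    return y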
Example 6
def _cumean_axis02(x):
    with cuda.using_cumisc():
        if x.shape[2] > 1:
            # cumisc.mean does not support more than two dimensions
            shape = x.shape
            x = x.reshape(shape[0] * shape[1], shape[2])
            x = cuda.cumisc.mean(x, axis=1)
            x = x.reshape(shape[0], shape[1])
        else:
            x = x.reshape(x.shape[:2])
        return cuda.cumisc.mean(x, axis=0)
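In NumPy terms this helper is a mean over the first and last axes of a 3-D array; a minimal sketch, assuming x is a 3-D ndarray:

import numpy as np

def _cumean_axis02_ref(x):
    # reduce axes 0 and 2, leaving a vector of length x.shape[1]
    return x.mean(axis=(0, 2))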
Example 7
def _matmul_gpu(a, b, transa=False, transb=False, transout=False, out=None):
    if transout:
        # (A B)^T = B^T A^T
        a, b, transa, transb = b, a, not transb, not transa
    a = _as_mat(a)
    b = _as_mat(b)
    with cuda.using_cumisc():
        return cuda.culinalg.dot(a, b,
                                 transa=_as_trans_op(transa),
                                 transb=_as_trans_op(transb),
                                 out=out)
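The transout branch relies on the identity (A B)^T = B^T A^T: instead of transposing the product, the operands are swapped and their transpose flags flipped. A small NumPy check of that rewrite (the array names are illustrative only):

import numpy as np

a = np.random.randn(3, 4)
b = np.random.randn(4, 5)
# transposing the product directly ...
lhs = a.dot(b).T
# ... equals swapping the operands and flipping their transpose flags
rhs = b.T.dot(a.T)
assert np.allclose(lhs, rhs)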
Example 8
 def forward_gpu(self, x):
     x = _as_mat(x[0])
     y = cuda.empty((x.shape[0], self.W.shape[0]), dtype=x.dtype)
     with cuda.using_cumisc():
         cuda.culinalg.dot(x, self.W, transb='T', out=y)
     if self.b is not None:
         cuda.elementwise(
             'float* y, float* b, int n_channel',
             'y[i] += b[i % n_channel]',
             'linear_bias')(y, self.b, self.b.size)
     return y,
Example 9
def _matmul_gpu(a, b, transa=False, transb=False, transout=False, out=None):
    if transout:
        # (A B)^T = B^T A^T
        a, b, transa, transb = b, a, not transb, not transa
    a = _as_mat(a)
    b = _as_mat(b)
    with cuda.using_cumisc():
        return cuda.culinalg.dot(a, b,
                                 transa=_as_trans_op(transa),
                                 transb=_as_trans_op(transb),
                                 out=out)
Example 10
    def backward_gpu(self, x, gy):
        out_c, out_h, out_w = gy[0].shape[1:]
        n, c, h, w = x[0].shape

        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            x_desc = cudnn.get_tensor_desc(x[0], h, w)
            gy_desc = cudnn.get_tensor_desc(gy[0], out_h, out_w)
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))

            libcudnn.cudnnConvolutionBackwardFilter(
                handle, 1, x_desc.value, cudnn.get_ptr(x[0]),
                gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))

            gx = cuda.empty_like(x[0])
            libcudnn.cudnnConvolutionBackwardData(
                handle, 1, self.filter_desc.value, cudnn.get_ptr(self.W),
                gy_desc.value, cudnn.get_ptr(gy[0]), self.conv_desc.value,
                0, x_desc.value, cudnn.get_ptr(gx))
        else:
            handle = cuda.get_cublas_handle()
            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(handle):
                    tmp = cuda.cumisc.sum(
                        gy[0].reshape(n * out_c, out_h * out_w), axis=1)
                    tmp = cuda.cumisc.sum(tmp.reshape(n, out_c), axis=0)
                    self.gb += tmp

            # TODO(beam2d): Use streams
            gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
            col_mats = self.col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            gy_mats = gy[0].reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.add_dot(
                    gy_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)

            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            gcol = cuda.empty_like(self.col)
            gcol_mats = gcol.reshape(n, c * self.kh * self.kw, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, gy_mats[i], transa='T', handle=handle,
                                  out=gcol_mats[i])

            gx = conv.col2im_gpu(
                gcol, self.sy, self.sx, self.ph, self.pw, h, w)

        return gx,
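In the non-cuDNN branch the two per-sample loops are batched matrix products; a hedged NumPy/einsum sketch of those accumulations, with the shapes used in the code (gy_mats is (n, out_c, out_h*out_w), col_mats is (n, c*kh*kw, out_h*out_w), W_mat is (out_c, c*kh*kw)):

import numpy as np

def conv_backward_ref(gy_mats, col_mats, W_mat):
    # filter gradient: sum over samples of gy_mats[i] @ col_mats[i].T
    gW_mat = np.einsum('nop,ncp->oc', gy_mats, col_mats)
    # column gradient: W_mat.T @ gy_mats[i] for each sample
    gcol_mats = np.einsum('oc,nop->ncp', W_mat, gy_mats)
    return gW_mat, gcol_mats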
Example 11
    def backward_gpu(self, inputs, grad_outputs):
        ngauss = self.y.shape[1]
        gradients = tuple(cuda.empty_like(i) for i in inputs[:-2])  # without (x1, x2)
        args = gradients + inputs
        ysum = None
        with cuda.using_cumisc():
            ysum = cuda.cumisc.sum(self.y, 1).reshape((self.y.shape[0], 1))
        cuda.elementwise(
            '''
            const float* r,
            const float* rsum,
            const int ngauss,
            float* gw,
            float* gm1, float* gm2,
            float* gs1, float* gs2,
            float* gc,
            const float* w,
            const float* m1, const float* m2,
            const float* s1, const float* s2,
            const float* c,
            const float* x1, const float* x2
            ''',
            '''
            const int j = i / ngauss;
            const float z1 = (x1[j] - m1[i]) / s1[i];
            const float z2 = (x2[j] - m2[i]) / s2[i];
            const float z3 = 1.0f / (1.0f - pow(c[i], 2.0f));
            const float z4 = pow(z1 - c[i] * z2, 2.0f);
            const float z5 = - r[i] / rsum[j];

            gw[i] = w[i] + z5;
            gm1[i] = z3 / s1[i] * (z1 - c[i] * z2);
            gm2[i] = z3 / s2[i] * (z2 - c[i] * z1);
            gs1[i] = (x1[j] - m1[i]) * gm1[i] - 1.0f;
            gs2[i] = (x2[j] - m2[i]) * gm2[i] - 1.0f;
            gc[i]  = z1 * z2 + c[i] * (1.0f - z3 * z4);

            gm1[i] *= z5;
            gm2[i] *= z5;
            gs1[i] *= z5;
            gs2[i] *= z5;
            gc[i]  *= z5;
            ''',
            'gaussian_mixture_2d_bwd'
        )(self.y, ysum, ngauss, *args)
        # tuple(- self.y * g for g in gradients[1:])
        return gradients + (None, None)  # for target signals
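A hedged NumPy transcription of the kernel body above, keeping the same temporaries (r is self.y and rsum its row sum; the parameter arrays are (batch, ngauss), and x1, x2 are (batch, 1) so broadcasting plays the role of the j = i / ngauss index):

import numpy as np

def gaussian_mixture_2d_bwd_ref(r, rsum, w, m1, m2, s1, s2, c, x1, x2):
    z1 = (x1 - m1) / s1
    z2 = (x2 - m2) / s2
    z3 = 1.0 / (1.0 - c ** 2)
    z4 = (z1 - c * z2) ** 2
    z5 = -r / rsum

    gw = w + z5
    gm1 = z3 / s1 * (z1 - c * z2)
    gm2 = z3 / s2 * (z2 - c * z1)
    gs1 = (x1 - m1) * gm1 - 1.0
    gs2 = (x2 - m2) * gm2 - 1.0
    gc = z1 * z2 + c * (1.0 - z3 * z4)
    # the kernel scales every gradient except gw by z5 at the end
    return gw, z5 * gm1, z5 * gm2, z5 * gs1, z5 * gs2, z5 * gc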
Example 12
    def backward_gpu(self, x, gy):
        ldim = x[0].shape[0]
        cdim = self.W.size
        rdim = x[0].size // (ldim * cdim)

        masked = cuda.empty_like(x[0])
        cuda.elementwise(
            'float* masked, const float* x, const float* gy',
            'masked[i] = x[i] >= 0 ? 0 : x[i] * gy[i]',
            'prelu_masked')(masked, x[0], gy[0])

        with cuda.using_cumisc():
            rsum = cuda.cumisc.sum(masked.reshape(ldim * cdim, rdim), axis=1)
            gW = cuda.cumisc.sum(rsum.reshape(ldim, cdim), axis=0)
            self.gW += gW.reshape(self.gW.shape)
            del rsum, gW

        gx = masked  # reuse buffer
        _fwd_kern()(gx, gy[0], x[0], self.W, cdim, rdim)
        return (gx,)
Example 13
    def backward_gpu(self, x, gy):
        ldim = x[0].shape[0]
        cdim = self.W.size
        rdim = x[0].size // (ldim * cdim)

        masked = cuda.empty_like(x[0])
        cuda.elementwise('float* masked, const float* x, const float* gy',
                         'masked[i] = x[i] >= 0 ? 0 : x[i] * gy[i]',
                         'prelu_masked')(masked, x[0], gy[0])

        with cuda.using_cumisc():
            rsum = cuda.cumisc.sum(masked.reshape(ldim * cdim, rdim), axis=1)
            gW = cuda.cumisc.sum(rsum.reshape(ldim, cdim), axis=0)
            self.gW += gW.reshape(self.gW.shape)
            del rsum, gW

        gx = masked  # reuse buffer
        _fwd_kern()(gx, gy[0], x[0], self.W, cdim, rdim)
        return gx,
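A hedged NumPy sketch of this PReLU backward pass, using the ldim/cdim/rdim split from the code and assuming _fwd_kern applies the forward rule elementwise (pass the value through where x >= 0, otherwise scale by the per-channel slope W), which the final call suggests:

import numpy as np

def prelu_backward_ref(x, gy, W):
    # masked = x * gy where the input was negative, 0 elsewhere
    masked = np.where(x >= 0, 0.0, x * gy)
    # slope gradient: reduce over the leading (batch) and trailing axes
    gW = masked.reshape(x.shape[0], W.size, -1).sum(axis=(0, 2)).reshape(W.shape)
    # input gradient: pass through where x >= 0, scale by W elsewhere
    W_b = W.reshape((1,) + W.shape + (1,) * (x.ndim - W.ndim - 1))
    gx = np.where(x >= 0, gy, gy * W_b)
    return gx, gW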
Example 14
def _cusum_axis02(x, y=None, expr1='x[I]', expr2='x[I] * x[I]', mean=False):
    with cuda.using_cumisc():
        shape = x.shape
        ret1 = cuda.empty_like(x[0])
        ret2 = cuda.empty_like(x[0])
        if y is None:
            y = x
        alpha = 1.0
        if mean:
            alpha = 1.0 / (shape[0] * shape[2])

        # In most cases shape[0] is constant.
        # Therefore, the kernel is compiled only once.
        # If shape[0] is small, the compiler will perform loop unrolling.
        _create_reduction_kernel(shape[0], expr1, expr2)(
                ret1, ret2, x, y, alpha, shape[1] * shape[2])

        if shape[2] != 1:
            ret1 = _partial_reduce(ret1)
            ret2 = _partial_reduce(ret2)
        ret_shape = (1, shape[1], 1)
        return (ret1.reshape(ret_shape), ret2.reshape(ret_shape))
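With the default expressions (expr1='x[I]', expr2='x[I] * x[I]') this helper reduces a 3-D array over axes 0 and 2, optionally turning the sums into means; a minimal NumPy sketch:

import numpy as np

def _cusum_axis02_ref(x, mean=False):
    alpha = 1.0 / (x.shape[0] * x.shape[2]) if mean else 1.0
    ret1 = alpha * x.sum(axis=(0, 2))
    ret2 = alpha * (x * x).sum(axis=(0, 2))
    shape = (1, x.shape[1], 1)
    return ret1.reshape(shape), ret2.reshape(shape)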
Example 15
def _cusum_axis02(x, y=None, expr1='x[I]', expr2='x[I] * x[I]', mean=False):
    with cuda.using_cumisc():
        shape = x.shape
        ret1 = cuda.empty_like(x[0])
        ret2 = cuda.empty_like(x[0])
        if y is None:
            y = x
        alpha = 1.0
        if mean:
            alpha = 1.0 / (shape[0] * shape[2])

        # In most cases shape[0] is constant.
        # Therefore, the kernel is compiled only once.
        # If shape[0] is small, the compiler will perform loop unrolling.
        _create_reduction_kernel(shape[0], expr1, expr2)(
            ret1, ret2, x, y, alpha, shape[1] * shape[2])

        if shape[2] != 1:
            ret1 = _partial_reduce(ret1)
            ret2 = _partial_reduce(ret2)
        ret_shape = (1, shape[1], 1)
        return (ret1.reshape(ret_shape), ret2.reshape(ret_shape))
Example 16
    def backward_gpu(self, x, gy):
        n, out_c, out_h, out_w = x[0].shape
        c, h, w = gy[0].shape[1:]
        gx = cuda.empty((n, out_c, out_h, out_w), dtype=numpy.float32)
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gy_desc = cudnn.get_tensor_desc(gy[0], h, w)
            gx_desc = cudnn.get_tensor_desc(gx, out_h, out_w)

            algo = libcudnn.cudnnGetConvolutionForwardAlgorithm(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, _fwd_pref,
                self.max_workspace_size)
            workspace_size = libcudnn.cudnnGetConvolutionForwardWorkspaceSize(
                handle, gy_desc.value, self.filter_desc.value,
                self.conv_desc.value, gx_desc.value, algo).value
            workspace = cuda.empty(
                (max(workspace_size // 4, 1),), dtype=numpy.float32)

            libcudnn.cudnnConvolutionForward(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                self.filter_desc.value, cudnn.get_ptr(self.W),
                self.conv_desc.value, algo, cudnn.get_ptr(
                    workspace), workspace_size,
                0, gx_desc.value, cudnn.get_ptr(gx))
            # bias backward
            if self.b is not None:
                libcudnn.cudnnConvolutionBackwardBias(
                    handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                    1, self.bias_desc.value, cudnn.get_ptr(self.gb))
            # filter backward
            libcudnn.cudnnConvolutionBackwardFilter(
                handle, 1, gy_desc.value, cudnn.get_ptr(gy[0]),
                gx_desc.value, cudnn.get_ptr(x[0]), self.conv_desc.value,
                1, self.filter_desc.value, cudnn.get_ptr(self.gW))
        else:
            # Implementation using im2col
            col = conv.im2col_gpu(
                gy[0], self.kh, self.kw, self.sy, self.sx, self.ph, self.pw)

            # TODO(beam2d): Use streams
            handle = cuda.get_cublas_handle()
            W_mat = self.W.reshape(out_c, c * self.kh * self.kw)
            col_mats = col.reshape(
                n, c * self.kh * self.kw, out_h * out_w)
            gx_mats = gx.reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.dot(W_mat, col_mats[i], handle=handle,
                                  out=gx_mats[i])
            # bias backward
            if self.gb is not None:
                # TODO(beam2d): Unify kernels
                with cuda.using_cumisc(handle):
                    tmp = cuda.cumisc.sum(
                        gy[0].reshape(n * c, h * w), axis=1)
                    tmp = cuda.cumisc.sum(tmp.reshape(n, c), axis=0)
                    self.gb += tmp
            # filter backward
            # TODO(beam2d): Use streams
            gW_mat = self.gW.reshape(out_c, c * self.kh * self.kw)
            x_mats = x[0].reshape(n, out_c, out_h * out_w)
            for i in moves.range(n):
                cuda.culinalg.add_dot(
                    x_mats[i], col_mats[i], gW_mat, transb='T', handle=handle)
        return gx,
Example 17
 def forward_gpu(self, inputs):
     with cuda.using_cumisc():
         return cuda.cumisc.sum(inputs[0], 1).reshape((inputs[0].shape[0], 1)),
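In NumPy terms this is a row-wise sum with the reduced axis kept; a minimal sketch:

import numpy as np

def forward_ref(x):
    return x.sum(axis=1, keepdims=True)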
Example 18
    def backward_gpu(self, x, gy):
        i_len, j_len = array.as_mat(x[0]).shape
        k_len = array.as_mat(x[1]).shape[1]
        l_len = gy[0].shape[1]

        # ij->[ij]
        e1 = array.as_vec(x[0])
        # ik->[ik]
        e2 = array.as_vec(x[1])
        gy, = gy
        # il->[il]
        gy_vec = array.as_vec(gy)
        # jkl->[jkl]
        W_vec = array.as_vec(self.W)

        dgW = cuda.empty((j_len * k_len * l_len, ), dtype=numpy.float32)
        # '[ij],[ik],[il]->[jkl]'
        cuda.elementwise(
            '''
            float* y, float* e1, float* e2, float* gy,
            int r, int e1c, int e2c, int gyc
            ''', '''
            int J = i / e2c / gyc;
            int K = (i - J * e2c * gyc) / gyc;
            int L = i % gyc;
            float yval = 0;
            for (int I = 0; I < r; ++I) {
                int e1idx = I * e1c + J;
                int e2idx = I * e2c + K;
                int gyidx = I * gyc + L;
                yval += e1[e1idx] * e2[e2idx] * gy[gyidx];
            }
            y[i] = yval;
            ''', 'sum_of_three_ary_tensor_product')(dgW, e1, e2, gy_vec, i_len,
                                                    j_len, k_len, l_len)
        # [jkl]->jkl
        self.gW += dgW.reshape((j_len, k_len, l_len))

        if not self.nobias:
            e1 = array.as_mat(x[0])
            e2 = array.as_mat(x[1])
            with cuda.using_cumisc():
                # ij,il->jl
                cuda.culinalg.add_dot(e1, gy, self.gV1, transa='T')
                # ik,il->kl
                cuda.culinalg.add_dot(e2, gy, self.gV2, transa='T')
                self.gb += cuda.cumisc.sum(gy, 0)

        ge1 = cuda.empty((i_len * j_len, ), dtype=numpy.float32)
        # '[ik],[jkl],[il]->[ij]'
        cuda.elementwise(
            '''
            float* y, float* e, float* W, float* gy,
            int ec, int gyc, int gec
            ''', '''
            int I = i / gec;
            int J = i % gec;
            float yval = 0;
            for (int K = 0; K < ec; ++K) {
                for (int L = 0; L < gyc; ++L) {
                    int eidx = I * ec + K;
                    int Widx = J * ec * gyc + K * gyc + L;
                    int gyidx = I * gyc + L;
                    yval += e[eidx] * W[Widx] * gy[gyidx];
                }
            }
            y[i] = yval;
            ''', 'ge_kernel')(ge1, e2, W_vec, gy_vec, k_len, l_len, j_len)
        # [ij]->ij
        ge1 = ge1.reshape(i_len, j_len)

        ge2 = cuda.empty((i_len * k_len, ), dtype=numpy.float32)
        # '[ij],[jkl],[il]->[ik]'
        cuda.elementwise(
            '''
            float* y, float* e, float* W, float* gy,
            int ec, int gyc, int gec
            ''', '''
            int I = i / gec;
            int K = i % gec;
            float yval = 0;
            for (int J = 0; J < ec; ++J) {
                for (int L = 0; L < gyc; ++L) {
                    int eidx = I * ec + J;
                    int Widx = J * gec * gyc + K * gyc + L;
                    int gyidx = I * gyc + L;
                    yval += e[eidx] * W[Widx] * gy[gyidx];
                }
            }
            y[i] = yval;
            ''', 'ge_kernel2')(ge2, e1, W_vec, gy_vec, j_len, l_len, k_len)
        # [ik]->ik
        ge2 = ge2.reshape(i_len, k_len)

        if not self.nobias:
            with cuda.using_cumisc():
                # il,jl->ij
                cuda.culinalg.add_dot(gy, self.V1, ge1, transb='T')
                # il,kl->ik
                cuda.culinalg.add_dot(gy, self.V2, ge2, transb='T')
        return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))
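The three hand-written kernels above are contractions of the bilinear form; a hedged NumPy/einsum sketch of the backward pass, assuming the same shapes as in Example 1 (e1 is (i, j), e2 is (i, k), W is (j, k, l), gy is (i, l)):

import numpy as np

def bilinear_backward_ref(e1, e2, W, gy, V1=None, V2=None):
    gW = np.einsum('ij,ik,il->jkl', e1, e2, gy)  # '[ij],[ik],[il]->[jkl]'
    ge1 = np.einsum('ik,jkl,il->ij', e2, W, gy)  # '[ik],[jkl],[il]->[ij]'
    ge2 = np.einsum('ij,jkl,il->ik', e1, W, gy)  # '[ij],[jkl],[il]->[ik]'
    if V1 is not None:
        # bias path: gV1 = e1.T @ gy, gV2 = e2.T @ gy, gb = gy.sum(axis=0)
        ge1 += gy.dot(V1.T)  # il,jl->ij
        ge2 += gy.dot(V2.T)  # il,kl->ik
    return gW, ge1, ge2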
Example 19
    def backward_gpu(self, x, gy):
        i_len, j_len = array.as_mat(x[0]).shape
        k_len = array.as_mat(x[1]).shape[1]
        l_len = gy[0].shape[1]

        # ij->[ij]
        e1 = array.as_vec(x[0])
        # ik->[ik]
        e2 = array.as_vec(x[1])
        gy, = gy
        # il->[il]
        gy_vec = array.as_vec(gy)
        # jkl->[jkl]
        W_vec = array.as_vec(self.W)

        dgW = cuda.empty((j_len * k_len * l_len,), dtype=numpy.float32)
        # '[ij],[ik],[il]->[jkl]'
        cuda.elementwise(
            '''
            float* y, float* e1, float* e2, float* gy,
            int r, int e1c, int e2c, int gyc
            ''',
            '''
            int J = i / e2c / gyc;
            int K = (i - J * e2c * gyc) / gyc;
            int L = i % gyc;
            float yval = 0;
            for (int I = 0; I < r; ++I) {
                int e1idx = I * e1c + J;
                int e2idx = I * e2c + K;
                int gyidx = I * gyc + L;
                yval += e1[e1idx] * e2[e2idx] * gy[gyidx];
            }
            y[i] = yval;
            ''',
            'sum_of_three_ary_tensor_product')(
                dgW, e1, e2, gy_vec, i_len, j_len, k_len, l_len)
        # [jkl]->jkl
        self.gW += dgW.reshape((j_len, k_len, l_len))

        if not self.nobias:
            e1 = array.as_mat(x[0])
            e2 = array.as_mat(x[1])
            with cuda.using_cumisc():
                # ij,il->jl
                cuda.culinalg.add_dot(e1, gy, self.gV1, transa='T')
                # ik,il->kl
                cuda.culinalg.add_dot(e2, gy, self.gV2, transa='T')
                self.gb += cuda.cumisc.sum(gy, 0)

        ge1 = cuda.empty((i_len * j_len,), dtype=numpy.float32)
        # '[ik],[jkl],[il]->[ij]'
        cuda.elementwise(
            '''
            float* y, float* e, float* W, float* gy,
            int ec, int gyc, int gec
            ''',
            '''
            int I = i / gec;
            int J = i % gec;
            float yval = 0;
            for (int K = 0; K < ec; ++K) {
                for (int L = 0; L < gyc; ++L) {
                    int eidx = I * ec + K;
                    int Widx = J * ec * gyc + K * gyc + L;
                    int gyidx = I * gyc + L;
                    yval += e[eidx] * W[Widx] * gy[gyidx];
                }
            }
            y[i] = yval;
            ''',
            'ge_kernel')(ge1, e2, W_vec, gy_vec, k_len, l_len, j_len)
        # [ij]->ij
        ge1 = ge1.reshape(i_len, j_len)

        ge2 = cuda.empty((i_len * k_len,), dtype=numpy.float32)
        # '[ij],[jkl],[il]->[ik]'
        cuda.elementwise(
            '''
            float* y, float* e, float* W, float* gy,
            int ec, int gyc, int gec
            ''',
            '''
            int I = i / gec;
            int K = i % gec;
            float yval = 0;
            for (int J = 0; J < ec; ++J) {
                for (int L = 0; L < gyc; ++L) {
                    int eidx = I * ec + J;
                    int Widx = J * gec * gyc + K * gyc + L;
                    int gyidx = I * gyc + L;
                    yval += e[eidx] * W[Widx] * gy[gyidx];
                }
            }
            y[i] = yval;
            ''',
            'ge_kernel2')(ge2, e1, W_vec, gy_vec, j_len, l_len, k_len)
        # [ik]->ik
        ge2 = ge2.reshape(i_len, k_len)

        if not self.nobias:
            with cuda.using_cumisc():
                # il,jl->ij
                cuda.culinalg.add_dot(gy, self.V1, ge1, transb='T')
                # il,kl->ik
                cuda.culinalg.add_dot(gy, self.V2, ge2, transb='T')
        return (ge1.reshape(x[0].shape), ge2.reshape(x[1].shape))