Example #1
File: sgd.py  Project: delta2323/chainer
 def update_core_gpu(self, param):
     grad = param.grad
     if grad is None:
         return
     cuda.elementwise('T grad, T lr', 'T param',
                      'param -= lr * grad',
                      'sgd')(grad, self.hyperparam.lr, param.data)
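For reference, here is a minimal standalone sketch of the same update written directly against cupy.ElementwiseKernel, which chainer.cuda.elementwise essentially wraps with memoization. The array shape and learning rate below are illustrative assumptions, not values from the example above.

import cupy

# Same per-element body as the 'sgd' kernel above; the type placeholder T is
# resolved from the dtypes of the array arguments at call time.
sgd_kernel = cupy.ElementwiseKernel(
    'T grad, T lr',        # read-only inputs
    'T param',             # output; updated in place when an array is passed
    'param -= lr * grad',  # per-element operation
    'sgd')

param = cupy.random.randn(1000).astype(cupy.float32)
grad = cupy.random.randn(1000).astype(cupy.float32)
sgd_kernel(grad, cupy.float32(0.01), param)  # modifies param in place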
Example #2
    def forward_gpu(self, x):
        if (cuda.cudnn_enabled and self.use_cudnn and
                pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            if _cudnn_version >= 3000 and self.ndim >= 2:
                return super(AveragePoolingND, self).forward_gpu(x)
            # With cuDNN v2, use cuDNN implementation only for inputs with
            # spatial dimensions of two.
            elif self.ndim == 2:
                return super(AveragePoolingND, self).forward_gpu(x)

        n, c = x[0].shape[:2]
        dims = x[0].shape[2:]
        ys = tuple(conv_nd.get_conv_outsize(d, k, s, p,
                                            cover_all=self.cover_all)
                   for (d, k, s, p) in six.moves.zip(
                       dims, self.ksize, self.stride, self.pad))
        # (n, c, y_1, y_2, ..., y_N)
        y_shape = (n, c) + ys
        y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
        coeff = 1. / functools.reduce(operator.mul, self.ksize)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelForward.generate(
                self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            x[0].reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad + (coeff, y)))

        return y,
Example #3
    def backward_gpu(self, x, gy):
        if (cuda.cudnn_enabled and self.use_cudnn and
                pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            if _cudnn_version >= 3000 and self.ndim >= 2:
                return super(AveragePoolingND, self).backward_gpu(x, gy)
            # With cuDNN v2, use cuDNN implementation only for inputs with
            # spatial dimensions of two.
            elif self.ndim == 2:
                return super(AveragePoolingND, self).backward_gpu(x, gy)

        n, c = x[0].shape[:2]
        dims = x[0].shape[2:]
        ys = gy[0].shape[2:]
        gx = cuda.cupy.empty_like(x[0])
        coeff = 1. / functools.reduce(operator.mul, self.ksize)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelBackward.generate(
                self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            gy[0].reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad + (coeff, gx)))

        return gx,
Example #4
 def update_one_gpu(self, param, grad, v):
     cuda.elementwise(
         '''float* param, const float* grad, float* v,
            float lr, float momentum''',
         '''v[i] = momentum * v[i] - lr * grad[i];
            param[i] += v[i];''',
         'momentum_sgd')(param, grad, v, self.lr, self.momentum)
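For comparison, a sketch (an illustrative rewrite, not code from the project above) of the same momentum update in the newer typed-parameter style used by Example #1, where the per-element indexing with [i] is handled implicitly:

import cupy

momentum_sgd = cupy.ElementwiseKernel(
    'T grad, T lr, T momentum',  # inputs
    'T param, T v',              # outputs; both are updated in place
    '''
    v = momentum * v - lr * grad;
    param += v;
    ''',
    'momentum_sgd')

# usage: momentum_sgd(grad, lr, momentum, param, v) updates param and v in place.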
Example #5
    def backward_gpu(self, x, gy):
        if self._used_cudnn:
            return super(AveragePooling2D, self).backward_gpu(x, gy)

        n, c, h, w = self._in_shape
        y_h, y_w = gy[0].shape[2:]
        gx = cuda.cupy.empty(self._in_shape, self._in_dtype)
        coeff = 1. / (self.kh * self.kw)
        cuda.elementwise(
            'raw T gy, int32 h, int32 w,'
            'int32 out_h, int32 out_w, int32 kh, int32 kw,'
            'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
            'T gx',
            '''
               int c0 = i / (h * w);
               int y  = i / w % h + ph;
               int x  = i % w + pw;
               int out_y_0 = max(0,     (y - kh + sy) / sy);
               int out_y_1 = min(out_h, (y      + sy) / sy);
               int out_x_0 = max(0,     (x - kw + sx) / sx);
               int out_x_1 = min(out_w, (x      + sx) / sx);
               int hc0  = out_h * c0;

               T val = 0;
               for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
                 for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
                   val = val + gy[out_x + out_w * (out_y + hc0)];
                 }
               }
               gx = val * coeff;
            ''', 'avg_pool_bwd')(gy[0].reduced_view(),
                                 h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff,
                                 gx)
        return gx,
Example #6
def _cu_conv_sum(y, x, n):
    # Convolutional sum
    # TODO(beam2d): Use scan computation
    rdim = x.size // (x.shape[0] * x.shape[1])
    cuda.elementwise(
        'float* y, const float* x, int rdim, int N, int n_',
        '''
          int half_n = n_ / 2;
          int offset = i / rdim * N * rdim + i % rdim;
          const float* xi = x + offset;
          float* yi = y + offset;

          float sum_part = 0;
          for (int j = 0; j < N + half_n; ++j) {
            if (j < N) {
              sum_part += xi[j * rdim];
            }
            if (j >= n_) {
              sum_part -= xi[(j - n_) * rdim];
            }
            if (j >= half_n) {
              yi[(j - half_n) * rdim] = sum_part;
            }
          }
        ''', 'lrn_conv_sum')(y, x, rdim, x.shape[1], n,
                             range=slice(0, x.shape[0] * rdim, 1))
Example #7
 def forward_gpu(self, x):
     y = x[1].copy()
     cuda.elementwise(
         'float* y, float* b, int nc',
         'y[i] = b[i / nc] - y[i];',
         'sub_bias')(y, x[0], x[1].shape[1])
     return y,
Example #8
 def backward_gpu(self, x, gy):
     cuda.elementwise(
         'T gy, int32 x, int32 n_out', 'raw T gW',
         'atomicAdd(&gW[x * n_out + i % n_out], gy)',
         'embed_id_bwd')(
             gy[0], x[0][:, numpy.newaxis], self.gW.shape[1], self.gW)
     return None,
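The kernel above scatters gradients into a raw output with atomicAdd, because several elements of gy can map to the same row of gW. Below is a self-contained sketch of that pattern (an illustrative assumption, not taken from Chainer) that accumulates a histogram the same way:

import cupy

hist_add = cupy.ElementwiseKernel(
    'S idx, int32 n_bins', 'raw T hist',
    '''
    if (0 <= idx && idx < n_bins) {
      atomicAdd(&hist[idx], (T)1);  // concurrent threads may hit the same bin
    }
    ''',
    'hist_add')

idx = cupy.array([0, 2, 2, 1, 2], dtype=cupy.int32)
hist = cupy.zeros(3, dtype=cupy.float32)
hist_add(idx, 3, hist)  # hist becomes [1., 1., 3.]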
Example #9
File: conv.py  Project: wakamori/chainer
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)

    col = cuda.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        '''
           float* col, const float* img, int h, int w,
           int out_h, int out_w, int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col[i] = img[in_x + w * (in_y + h * c0)];
           } else {
             col[i] = 0;
           }
        ''',
        'im2col')(col, img, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return col
Example #10
File: conv.py  Project: ronekko/chainer
def col2im_gpu(col, sy, sx, ph, pw, h, w, dy=1, dx=1):
    n, c, kh, kw, out_h, out_w = col.shape
    img = cuda.cupy.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        "raw T col, int32 h, int32 w, int32 out_h, int32 out_w,"
        "int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,"
        "int32 dx, int32 dy",
        "T img",
        """
           int c0 = i / (h * w);
           int y  = i / w % h;
           int x  = i % w;
           T val = 0;
           for (int ky = 0; ky < kh; ++ky) {
             int out_y = (y + ph - ky * dy);
             if (0 > out_y || out_y >= out_h * sy) continue;
             if (out_y % sy != 0) continue;
             out_y /= sy;
             for (int kx = 0; kx < kw; ++kx) {
               int out_x = (x + pw - kx * dx);
               if (0 > out_x || out_x >= out_w * sx) continue;
               if (out_x % sx != 0) continue;
               out_x /= sx;
               int k = out_y + out_h * (kx + kw * (ky + kh * c0));
               val = val + col[out_x + out_w * k];
             }
           }
           img = val;
        """,
        "col2im",
    )(col.reduced_view(), h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dx, dy, img)
    return img
Example #11
File: conv.py  Project: dsanno/chainer
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False, dy=1, dx=1):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all, dy)
    assert out_h > 0, 'Height in the output should be positive.'
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all, dx)
    assert out_w > 0, 'Width in the output should be positive.'

    col = cuda.cupy.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw,'
        'int32 dy, int32 dx',
        'T col',
        '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;
           int in_y = ky * dy + out_y * sy - ph;
           int in_x = kx * dx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col = img[in_x + w * (in_y + h * c0)];
           } else {
             col = 0;
           }
        ''',
        'im2col')(img.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, dy, dx, col)
    return col
Example #12
File: conv.py  Project: 2php/chainer
def im2col_gpu(img, kh, kw, sy, sx, ph, pw, cover_all=False):
    n, c, h, w = img.shape
    out_h = get_conv_outsize(h, kh, sy, ph, cover_all)
    out_w = get_conv_outsize(w, kw, sx, pw, cover_all)

    col = cuda.cupy.empty((n, c, kh, kw, out_h, out_w), dtype=img.dtype)
    cuda.elementwise(
        'raw T img, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T col',
        '''
           int c0 = i / (kh * kw * out_h * out_w);
           int ky = i / (kw * out_h * out_w) % kh;
           int kx = i / (out_h * out_w) % kw;
           int out_y = i / out_w % out_h;
           int out_x = i % out_w;

           int in_y = ky + out_y * sy - ph;
           int in_x = kx + out_x * sx - pw;
           if (in_y >= 0 && in_y < h && in_x >= 0 && in_x < w) {
             col = img[in_x + w * (in_y + h * c0)];
           } else {
             col = 0;
           }
        ''',
        'im2col')(img.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, col)
    return col
Example #13
    def forward(self, inputs):
        c_prev, x = inputs
        a, i, f, o = _extract_gates(x)
        batch = len(x)

        if isinstance(x, numpy.ndarray):
            self.a = numpy.tanh(a)
            self.i = _sigmoid(i)
            self.f = _sigmoid(f)
            self.o = _sigmoid(o)

            c_next = numpy.empty_like(c_prev)
            c_next[:batch] = self.a * self.i + self.f * c_prev[:batch]
            h = self.o * numpy.tanh(c_next[:batch])
        else:
            c_next = cuda.cupy.empty_like(c_prev)
            h = cuda.cupy.empty_like(c_next[:batch])
            cuda.elementwise(
                'T c_prev, T a, T i_, T f, T o', 'T c, T h',
                '''
                    COMMON_ROUTINE;
                    c = aa * ai + af * c_prev;
                    h = ao * tanh(c);
                ''',
                'lstm_fwd', preamble=_preamble)(
                    c_prev[:batch], a, i, f, o, c_next[:batch], h)

        c_next[batch:] = c_prev[batch:]
        self.c = c_next[:batch]
        return c_next, h
Example #14
File: conv.py  Project: wakamori/chainer
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape

    img = cuda.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        '''
           float* img, const float* col, int h, int w,
           int out_h, int out_w, int kh, int kw, int sy, int sx, int ph, int pw
        ''', '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           float val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               val += col[out_x + out_w * (out_y + out_h * (kx + kw * (ky + kh * c0)))];
             }
           }
           img[i] = val;
        ''',
        'col2im')(img, col, h, w, out_h, out_w, kh, kw, sy, sx, ph, pw)
    return img
Example #15
File: conv.py  Project: 2php/chainer
def col2im_gpu(col, sy, sx, ph, pw, h, w):
    n, c, kh, kw, out_h, out_w = col.shape

    img = cuda.cupy.empty((n, c, h, w), dtype=col.dtype)
    cuda.elementwise(
        'raw T col, int32 h, int32 w, int32 out_h, int32 out_w,'
        'int32 kh, int32 kw, int32 sy, int32 sx, int32 ph, int32 pw',
        'T img',
        '''
           int c0 = i / (h * w);
           int y  = i / w % h + ph;
           int x  = i % w + pw;

           int out_y_0 = max(0,     (y - kh + sy) / sy);
           int out_y_1 = min(out_h, (y      + sy) / sy);
           int out_x_0 = max(0,     (x - kw + sx) / sx);
           int out_x_1 = min(out_w, (x      + sx) / sx);

           T val = 0;
           for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
             int ky = y - out_y * sy;
             for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
               int kx = x - out_x * sx;
               int k = out_y + out_h * (kx + kw * (ky + kh * c0));
               val = val + col[out_x + out_w * k];
             }
           }
           img = val;
        ''',
        'col2im')(col.reduced_view(),
                  h, w, out_h, out_w, kh, kw, sy, sx, ph, pw, img)
    return img
Example #16
 def backward_gpu(self, inputs, grad_outputs):
     gx = cuda.empty_like(inputs[0])
     cuda.elementwise(
         'float* y, const float* b, const int n_channel',
         'y[i] = b[i / n_channel]',
         'sum_axis_bwd')(gx, grad_outputs[0], gx.shape[1])
     return gx,
Example #17
    def forward_gpu(self, x):
        if cudnn.enabled and self.use_cudnn:
            return super(AveragePooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        y = cuda.empty((n, c, y_h, y_w), dtype=numpy.float32)
        coeff = 1. / (self.kh * self.kw)

        cuda.elementwise(
            '''
               float* out, const float* in, int h, int w, int out_h, int out_w,
               int kh, int kw, int sy, int sx, int ph, int pw, float coeff
            ''', '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float val = 0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   val += in[x + offset_y];
                 }
               }
               out[i] = val * coeff;
            ''', 'avg_pool_fwd')(y, x[0], h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff)
        return y,
Example #18
    def backward_gpu(self, x, gy):
        if cudnn.enabled and self.use_cudnn:
            return super(AveragePooling2D, self).backward_gpu(x, gy)

        n, c, h, w = x[0].shape
        y_h, y_w   = gy[0].shape[2:]
        gx = cuda.empty_like(x[0])
        coeff = 1. / (self.kh * self.kw)

        cuda.elementwise(
            '''
               float* gx, const float* gy, int h, int w, int out_h, int out_w,
               int kh, int kw, int sy, int sx, int ph, int pw, float coeff
            ''', '''
               int c0 = i / (h * w);
               int y  = i / w % h + ph;
               int x  = i % w + pw;
               int out_y_0 = max(0,     (y - kh + sy) / sy);
               int out_y_1 = min(out_h, (y      + sy) / sy);
               int out_x_0 = max(0,     (x - kw + sx) / sx);
               int out_x_1 = min(out_w, (x      + sx) / sx);
               int hc0  = out_h * c0;

               float val = 0;
               for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
                 for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
                   val += gy[out_x + out_w * (out_y + hc0)];
                 }
               }
               gx[i] = val * coeff;
            ''', 'avg_pool_bwd')(gx, gy[0], h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff)
        return gx,
Example #19
    def backward_gpu(self, x, gy):
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            gx = cuda.empty_like(x[0])
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxBackward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(
                    self.y),
                desc.value, cudnn.get_ptr(gy[0]), 0, desc.value,
                cudnn.get_ptr(gx))
        else:
            gx = self.y * gy[0]
            c = gx.shape[1]
            sum_ydy = cuda.empty((gx.shape[0],), dtype=numpy.float32)
            cuda.elementwise(
                'float* sum_ydy, const float* ydy, int c',
                '''
                   const float* row = ydy + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   sum_ydy[i] = sum;
                ''', 'softmax_bwd_sum_ydy')(sum_ydy, gx, c)
            cuda.elementwise(
                'float* gx, const float* y, const float* sum_ydy, int c',
                'gx[i] -= y[i] * sum_ydy[i / c]',
                'softmax_bwd_diff')(gx, self.y, sum_ydy, c)

        return gx,
Example #20
 def backward_gpu(self, x, gy):
     gx0 = cuda.empty_like(x[0])
     cuda.elementwise(
         'float* gx0, const float* x0, const float* gy',
         'gx0[i] = ((x0[i] > 0) - (x0[i] < 0)) * gy[i]',
         'abs_bwd')(gx0, x[0], gy[0])
     return gx0,
Example #21
File: maxout.py  Project: hillbig/chainer
    def backward(self, inputs, grad_outputs):
        gy = grad_outputs[0]
        x = _as_mat(inputs[0])
        W = inputs[1]

        xp = cuda.get_array_module(*inputs)
        # gradient of z = xW + b
        gz = xp.zeros((gy.shape[0], W.shape[1], gy.shape[1]), x.dtype)
        if xp == numpy:
            idx0 = xp.arange(len(gy))[:, None]
            idx1 = xp.arange(gy.shape[1])
            gz[idx0, self.argmax, idx1] = gy
        else:
            gz_r = xp.rollaxis(gz, 1)
            cuda.elementwise(
                'T gy, S argmax, int32 n', 'raw T gz',
                'gz[argmax * n + i] = gy', 'maxout_bwd'
            )(gy, self.argmax, gz_r.size // len(gz_r), gz_r)
        gx = xp.tensordot(gz, W, ((1, 2), (1, 2))).reshape(inputs[0].shape)
        gW = xp.tensordot(x, gz, (0, 0))

        if len(inputs) == 3:
            gb = gz.sum(axis=0)
            return gx, gW, gb
        else:
            return gx, gW
Example #22
    def forward_gpu(self, x):
        if (chainer.should_use_cudnn('>=auto') and
                pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            if _cudnn_version >= 3000 and self.ndim >= 2:
                return super(MaxPoolingND, self).forward_gpu(x)
            # With cuDNN v2, use cuDNN implementation only for inputs with
            # spatial dimensions of two.
            elif self.ndim == 2:
                return super(MaxPoolingND, self).forward_gpu(x)

        self.retain_inputs(())
        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        n, c = x[0].shape[:2]
        dims = x[0].shape[2:]
        ys = tuple(conv_nd.get_conv_outsize(d, k, s, p, self.cover_all)
                   for (d, k, s, p) in six.moves.zip(
                       dims, self.ksize, self.stride, self.pad))
        # (n, c, y_1, y_2, ..., y_N)
        y_shape = (n, c) + ys
        y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
        self.indexes = cuda.cupy.empty(y_shape, dtype=numpy.int32)

        in_params, out_params, operation, name = \
            max_pooling_nd_kernel.MaxPoolingNDKernelForward.generate(self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            x[0].reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad +
              (y, self.indexes)))

        return y,
Example #23
    def forward_gpu(self, inputs):
        if self._used_cudnn:
            x, = self.mpool2d.get_retained_inputs()
            return self._forward_gpu_compute_indexes_again((x.data, inputs[0]))
        else:
            x, = inputs
            n, c, h, w = x.shape
            y_h = conv.get_conv_outsize(
                h, self.kh, self.sy, self.ph, self.cover_all)
            assert y_h > 0, 'Height in the output should be positive.'
            y_w = conv.get_conv_outsize(
                w, self.kw, self.sx, self.pw, self.cover_all)
            assert y_w > 0, 'Width in the output should be positive.'
            y = cuda.cupy.empty((n, c, y_h, y_w), dtype=x.dtype)

            cuda.elementwise(
                'raw T in, raw S indexes, int32 h, int32 w, int32 out_h,'
                'int32 out_w, int32 kh, int32 kw, int32 sy, int32 sx,'
                'int32 ph, int32 pw', 'T out',
                '''
                int c0    = i / (out_h * out_w);
                int out_y = i / out_w % out_h;
                int out_x = i % out_w;
                int index = indexes[i];
                int max_y = max(0, out_y * sy - ph + index / kw);
                int max_x = max(0, out_x * sx - pw + index % kw);
                out = in[max_x + w * (max_y + h * c0)];
                ''', 'max_pool_grad_fwd')(
                    x.reduced_view(), self.indexes.reduced_view(), h, w,
                    y_h, y_w, self.kh, self.kw, self.sy, self.sx, self.ph,
                    self.pw, y)
            return y,
Example #24
    def backward(self, inputs, grad_outputs):
        xp = cuda.get_array_module(*inputs)
        x, W = inputs
        gy = grad_outputs[0]
        gW = xp.zeros_like(W)

        if xp is numpy:
            # It is equivalent to `numpy.add.at(gW, x, gy)` but ufunc.at is
            # too slow.
            for ix, igy in six.moves.zip(x.ravel(),
                                         gy.reshape(x.size, -1)):
                if ix == self.ignore_label:
                    continue
                gW[ix] += igy
        else:
            if self.ignore_label is None:
                cuda.elementwise(
                    'T gy, int32 x, int32 n_out', 'raw T gW',
                    'int w_ind[] = {x, i % n_out}; atomicAdd(&gW[w_ind], gy)',
                    'embed_id_bwd')(
                        gy, xp.expand_dims(x, -1), gW.shape[1], gW)
            else:
                cuda.elementwise(
                    'T gy, int32 x, int32 n_out, int32 ignore', 'raw T gW',
                    '''
                    if (x != ignore) {
                      int w_ind[] = {x, i % n_out};
                      atomicAdd(&gW[w_ind], gy);
                    }
                    ''',
                    'embed_id_bwd_ignore_label')(
                        gy, xp.expand_dims(x, -1), gW.shape[1],
                        self.ignore_label, gW)
        return None, gW
Example #25
    def forward_gpu(self, x):
        if chainer.should_use_cudnn('>=auto') and 2 <= self.ndim <= 3:
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            return super(AveragePoolingND, self).forward_gpu(x)

        self.retain_inputs(())
        self._in_shape = x[0].shape
        self._in_dtype = x[0].dtype

        n, c = x[0].shape[:2]
        dims = x[0].shape[2:]
        ys = tuple(conv_nd.get_conv_outsize(d, k, s, p,
                                            cover_all=self.cover_all)
                   for (d, k, s, p) in six.moves.zip(
                       dims, self.ksize, self.stride, self.pad))
        # (n, c, y_1, y_2, ..., y_N)
        y_shape = (n, c) + ys
        y = cuda.cupy.empty(y_shape, dtype=x[0].dtype)
        coeff = 1. / functools.reduce(operator.mul, self.ksize)

        in_params, out_params, operation, name = \
            average_pooling_nd_kernel.AveragePoolingNDKernelForward.generate(
                self.ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            x[0].reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad + (coeff, y)))

        return y,
Example #26
 def backward_gpu(self, x, gy):
     gx = cuda.empty_like(x[0])
     cuda.elementwise(
         'float* gx, const float* x, const float* gy, float value',
         'gx[i] = value * __powf(x[i], value - 1) * gy[i]',
         'pow_var_const_bwd')(gx, x[0], gy[0], self.value)
     return gx,
Example #27
 def forward_gpu(self, x):
     y = cuda.empty((x[0].size, self.W.shape[1]), dtype=numpy.float32)
     cuda.elementwise(
         'float* y, const float* W, const int* x, int n_out',
         'y[i] = W[x[i / n_out] * n_out + i % n_out]',
         'embed_id_fwd')(y, self.W, x[0], self.W.shape[1])
     return y,
Example #28
 def backward_gpu(self, inputs, grad_outputs):
     cupy = cuda.cupy
     x, t = inputs
     if hasattr(self, 'y'):
         y = self.y
     else:
         y = log_softmax._log_softmax(x, self.use_cudnn)
         cupy.exp(y, out=y)
     gloss = grad_outputs[0]
     n_unit = t.size // len(t)
     coeff = gloss * self._coeff
     if self.class_weight is None:
         gx = cuda.elementwise(
             'T y, S t, raw T coeff, S n_channel, S n_unit',
             'T gx',
             '''
                 const int c = (i / n_unit % n_channel);
                 gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
             ''',
             'softmax_crossent_bwd')(
                 y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
     else:
         gx = cuda.elementwise(
             'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit',
             'T gx',
             '''
                 const int c = (i / n_unit % n_channel);
                 gx = t == -1 ? 0 : coeff[0] * (y - (c == t)) * w[t];
             ''',
             'softmax_crossent_bwd')(
                 y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                 x.shape[1], n_unit)
     return gx, None
Example #29
 def update_one_gpu(self, param, grad, ms):
     cuda.elementwise(
         '''float* param, const float* grad, float* ms,
            float lr, float alpha, float eps''',
         '''ms[i] = alpha * ms[i] + (1 - alpha) * grad[i] * grad[i];
            param[i] -= lr * grad[i] / (sqrtf(ms[i]) + eps);''',
         'rmsprop')(param, grad, ms, self.lr, self.alpha, self.eps)
Example #30
 def backward_gpu(self, x, gy):
     gx = cuda.empty_like(x[0])
     cuda.elementwise(
         'float* gx, const float* x, const float* gy, float value',
         'gx[i] = -value * gy[i] / (x[i] * x[i])',
         'div_from_const_bwd')(gx, x[0], gy[0], self.value)
     return gx,
Example #31
    def backward_gpu(self, x, gy):
        if cuda.cudnn_enabled and self.use_cudnn:
            return super(MaxPooling2D, self).backward_gpu(x, gy)

        n, c, h, w = x[0].shape
        y_h, y_w = gy[0].shape[2:]
        gx = cuda.empty_like(x[0])

        cuda.elementwise(
            'raw T gy, raw S indexes, int32 h, int32 w,'
            'int32 out_h, int32 out_w, int32 kh, int32 kw,'
            'int32 sy, int32 sx, int32 ph, int32 pw', 'T gx', '''
               int c0 = i / (h * w);
               int y  = i / w % h + ph;
               int x  = i % w + pw;
               int out_y_0 = max(0,     (y - kh + sy) / sy);
               int out_y_1 = min(out_h, (y      + sy) / sy);
               int out_x_0 = max(0,     (x - kw + sx) / sx);
               int out_x_1 = min(out_w, (x      + sx) / sx);

               float val = 0;
               for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
                 int ky = y - out_y * sy;
                 for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
                   int kx = x - out_x * sx;
                   int offset = out_x + out_w * (out_y + out_h * c0);
                   if (indexes[offset] == kx + kw * ky) {
                     val += gy[offset];
                   }
                 }
               }
               gx = val;
            ''', 'max_pool_bwd')(gy[0].reduced_view(),
                                 self.indexes.reduced_view(), h, w, y_h, y_w,
                                 self.kh, self.kw, self.sy, self.sx, self.ph,
                                 self.pw, gx)
        return gx,
Example #32
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t, W = inputs
        gloss, = grad_outputs

        n_in = x.shape[1]
        gx = cupy.zeros_like(x)
        gW = cupy.zeros_like(W)
        cuda.elementwise(
            '''T wxy, raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, raw T gloss,
            int32 c, int32 max_length''', 'raw T gx, raw T gw', '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];
              T code = codes[p];

              T g = -gloss[0] * code / (1.0 + exp(wxy));
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                atomicAdd(&gx[x_ind], g * w[w_ind]);
                atomicAdd(&gw[w_ind], g * x[x_ind]);
              }
            }
            ''',
            'binary_hierarchical_softmax_bwd')(self.wxy, x, W, t, self.paths,
                                               self.codes, self.begins, gloss,
                                               n_in, self.max_length, gx, gW)
        return gx, None, gW
Example #33
    def forward_gpu(self, inputs):
        x, t = inputs
        n_in = x.shape[1]
        self._make_samples(t)

        self.wx = cuda.elementwise(
            'raw T W, raw T x, S k, int32 c, int32 m', 'T wx',
            '''
            T f = 0;
            for (int j = 0; j < c; ++j) {
              f += x[(i / m) * c + j] * W[k * c + j];
            }
            wx = f;
            ''',
            'negative_sampling_wx'
        )(self.W, x, self.samples, n_in, self.sample_size + 1)

        y = cuda.elementwise(
            'T wx, int32 c, int32 m', 'T y',
            '''
            T f = wx;
            if (i % m == 0) {
              f = -f;
            }
            T loss;
            if (f < 0) {
              loss = __logf(1 + __expf(f));
            } else {
              loss = f + __logf(1 + __expf(-f));
            }
            y = loss;
            ''',
            'negative_sampling_forward'
        )(self.wx, n_in, self.sample_size + 1)
        # TODO(okuta): merge elementwise
        loss = cuda.cupy.sum(y)
        return loss,
Example #34
    def backward_gpu(self, inputs, loss):
        x, t = inputs
        gloss, = loss

        n_in = x.shape[1]
        gx = cuda.zeros_like(x)
        cuda.elementwise(
            '''const float* wxy, float* gx, float* gw, const float* x,
            const float* w, const int* ts, const int* paths,
            const float* codes, const int* begins,
            const float* gloss, int c, int max_length''',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];
              float code = codes[p];
              gx = &gx[ind * c];
              x = &x[ind * c];

              float g = -*gloss * code / (1.0 + exp(wxy[i]));
              for (int j = 0; j < c; ++j) {
                atomicAdd(gx + j, g * w[node * c + j]);
                atomicAdd(gw + node * c + j, g * x[j]);
              }
            }
            ''',
            'binary_hierarchical_softmax_bwd'
        )(self.wxy, gx, self.gW, x, self.W, t, self.paths, self.codes,
          self.begins, gloss, n_in, self.max_length)
        return gx, None
Example #35
    def forward(self, inputs):
        self.retain_inputs((0, 1, 2))
        x, gamma, gy = inputs
        expander = self.expander
        inv_m = gamma.dtype.type(1. / (x.size // gamma.size))
        xp = cuda.get_array_module(x)

        if self.use_cudnn:
            cudnn_mode = self.mode.get_cudnn_mode()
            x = cuda.cupy.ascontiguousarray(x)
            gamma = cuda.cupy.ascontiguousarray(gamma)
            gy = cuda.cupy.ascontiguousarray(gy)
            dtype = x.dtype
            handle = cudnn.get_handle()
            x_desc = cudnn.create_tensor_descriptor(_as4darray(x))
            derivedBnDesc = cudnn.create_uninitialized_tensor_descriptor()
            libcudnn.deriveBNTensorDescriptor(derivedBnDesc.value,
                                              x_desc.value, cudnn_mode)
            one = numpy.array(1, dtype=dtype).ctypes
            zero = numpy.array(0, dtype=dtype).ctypes
            gx = cuda.cupy.empty_like(x)
            ggamma = cuda.cupy.empty_like(gamma)
            gbeta = cuda.cupy.empty_like(gamma)
            libcudnn.batchNormalizationBackward(
                handle, cudnn_mode, one.data, zero.data, one.data, zero.data,
                x_desc.value, x.data.ptr, x_desc.value, gy.data.ptr,
                x_desc.value, gx.data.ptr, derivedBnDesc.value, gamma.data.ptr,
                ggamma.data.ptr, gbeta.data.ptr, self.eps, self.mean.data.ptr,
                self.inv_std.data.ptr)
        else:
            gbeta = gy.sum(axis=self.axis)
            x_hat = _x_hat(x, self.mean[expander], self.inv_std[expander])
            ggamma = (gy * x_hat).sum(axis=self.axis)
            if xp is numpy:
                gx = (gamma * self.inv_std)[expander] * (
                    gy - (x_hat * ggamma[expander] + gbeta[expander]) * inv_m)
            else:
                gx = cuda.elementwise(
                    '''
                    T gy, T x_hat, T gamma, T inv_std, T ggamma, T gbeta,
                    T inv_m
                    ''', 'T gx', '''
                    gx = (gamma * inv_std) * (
                        gy - (x_hat * ggamma + gbeta) * inv_m)
                    ''', 'bn_bwd')(gy, x_hat, gamma[expander],
                                   self.inv_std[expander], ggamma[expander],
                                   gbeta[expander], inv_m)
        self.retain_outputs((0, 1))
        return gx, ggamma, gbeta
Example #36
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label', 'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #37
    def forward_gpu(self, inputs):
        x, W, gy = inputs
        masked = cuda.elementwise('T x, T cond, T gy', 'T masked',
                                  'masked = cond >= 0 ? (T)0 : (T)(x * gy)',
                                  'prelu_masked')(x, self.cond, gy)

        if self.reduce_axes is None:
            gW = masked.copy()
        else:
            gW = masked.sum(axis=self.reduce_axes)

        gx = masked  # reuse buffer
        _fwd_kern()(gy, self.cond, W.reshape(self.extended_shape), gx)
        self.retain_inputs((0, 1, 2))
        return gx, gW
Example #38
 def backward(self, inputs, grads):
     c_prev, x, u = inputs
     gc, gh = grads
     if gc is None:
         gc = 0
     if gh is None:
         gh = 0
     n_unit = x.shape[1]
     x_bar = u[:, 0:n_unit]
     f_in = u[:, n_unit:n_unit * 2]
     r_in = u[:, n_unit * 2:]
     gc_prev = cuda.cupy.empty_like(c_prev)
     gx = cuda.cupy.empty_like(x)
     gu = cuda.cupy.empty_like(u)
     gx_bar = gu[:, 0:n_unit]
     gf_in = gu[:, n_unit:n_unit * 2]
     gr_in = gu[:, n_unit * 2:]
     cuda.elementwise(
         '''T c_prev, T x, T x_bar, T f_in, T r_in, int32 n_unit,
         T gc, T gh, T c''',
         'T gc_prev, T gx, T gx_bar, T gf_in, T gr_in',
         '''
         float f = sigmoid(f_in);
         float r = sigmoid(r_in);
         gx = gh * (1 - r);
         float tanh_c = tanh(c);
         float g = gh * r * grad_tanh(tanh_c) + gc;
         gc_prev = g * f;
         gx_bar = g * (1 - f);
         gf_in = g * grad_sigmoid(f) * (c_prev - x_bar);
         gr_in = gh * grad_sigmoid(r) * (tanh_c - x);
         ''',
         'sru_backward',
         preamble=_preamble)(c_prev, x, x_bar, f_in, r_in, n_unit, gc, gh,
                             self.c, gc_prev, gx, gx_bar, gf_in, gr_in)
     return gc_prev, gx, gu
Example #39
    def backward_gpu(self, x, gy):
        if (cuda.cudnn_enabled and self.use_cudnn
                and pooling_nd._check_cudnn_acceptable_type(x[0].dtype)):
            # With cuDNN v3 or greater, use cuDNN implementation for inputs
            # with spatial dimensions of two or more.
            if _cudnn_version >= 3000 and self.ndim >= 2:
                return super(MaxPoolingND, self).backward_gpu(x, gy)
            # With cuDNN v2, use cuDNN implementation only for inputs with
            # spatial dimensions of two.
            elif self.ndim == 2:
                return super(MaxPoolingND, self).backward_gpu(x, gy)

        n, c = x[0].shape[:2]
        dims = x[0].shape[2:]
        ys = gy[0].shape[2:]
        gx = cuda.cupy.empty_like(x[0])

        ndim = self.ndim
        in_params, out_params, operation, name = \
            max_pooling_nd_kernel.MaxPoolingNDKernelBackward.generate(ndim)
        cuda.elementwise(in_params, out_params, operation, name)(
            gy[0].reduced_view(), self.indexes.reduced_view(),
            *(dims + ys + self.ksize + self.stride + self.pad + (gx, )))
        return gx,
Example #40
 def backward_gpu(self, inputs, grad_outputs):
     cupy = cuda.cupy
     x, t = inputs
     gloss = grad_outputs[0]
     n_unit = t.size // len(t)
     coeff = gloss * self._coeff
     gx = cuda.elementwise(
         'T y, S t, raw T coeff, S n_channel, S n_unit, raw T weights',
         'T gx', '''
            const int c = (i / n_unit % n_channel);
            gx = ((t == -1) || (c != t)) ? 0 : ((weights[t]*coeff[0]) / max(y, 1e-5));
         ''', 'crossent_bwd')(self.y, cupy.expand_dims(t, 1),
                              -coeff, x.shape[1], n_unit,
                              self.weights.reduced_view())
     return gx, None
Example #41
def _cu_conv_sum(y, x, n):
    # Convolutional sum
    # TODO(beam2d): Use scan computation
    rdim = x.size // (x.shape[0] * x.shape[1])
    cuda.elementwise(
        'raw T x, int32 rdim, int32 N, int32 n_', 'raw T y',
        '''
          int half_n = n_ / 2;
          int offset = i / rdim * N * rdim + i % rdim;

          float sum_part = 0;
          for (int j = 0; j < N + half_n; ++j) {
            if (j < N) {
              sum_part += x[offset + j * rdim];
            }
            if (j >= n_) {
              sum_part -= x[offset + (j - n_) * rdim];
            }
            if (j >= half_n) {
              y[offset + (j - half_n) * rdim] = sum_part;
            }
          }
        ''', 'lrn_conv_sum')(x, rdim, x.shape[1], n, y,
                             size=x.shape[0] * rdim)
Example #42
 def backward(self, inputs, grads):
     xp = cuda.get_array_module(*inputs)
     _, indices, _ = inputs
     g = grads[0]
     if xp is numpy:
         gv = g[range(len(indices)), indices]
         g[range(len(indices)), indices] = 0
     else:
         dim = g.shape[2]
         shape = (indices.shape[0], dim)
         gv = cuda.cupy.empty(shape, g.dtype)
         cuda.elementwise(
             'S t, int32 d',
             'raw T s, T y',
             '''
             int b = i / d;
             int k = i - b * d;
             int ind[] = {b, t, k};
             y = s[ind];
             s[ind] = 0;
             ''',
             'thin_stack_set_bwd'
         )(indices[:, None], dim, g, gv)
     return g, None, gv
Example #43
 def sample_gpu(self, shape):
     ps = cuda.cupy.random.uniform(size=shape, dtype=numpy.float32)
     vs = cuda.elementwise(
         'T ps, raw T threshold , raw S values, int32 b', 'int32 vs', '''
         T pb = ps * b;
         int index = __float2int_rd(pb);
         // fill_uniform sometimes returns 1.0, so we need to check index
         if (index >= b) {
           index = 0;
         }
         int lr = threshold[index] < pb - index;
         vs = values[index * 2 + lr];
         ''', 'walker_alias_sample')(ps, self.threshold, self.values,
                                     len(self.threshold))
     return vs
Example #44
    def forward_gpu(self, x):
        if cuda.cudnn_enabled and self.use_cudnn:
            return super(AveragePooling2D, self).forward_gpu(x)

        n, c, h, w = x[0].shape
        y_h = conv.get_conv_outsize(h, self.kh, self.sy, self.ph)
        y_w = conv.get_conv_outsize(w, self.kw, self.sx, self.pw)
        y = cuda.cupy.empty((n, c, y_h, y_w), dtype=numpy.float32)
        coeff = 1. / (self.kh * self.kw)
        cuda.elementwise(
            'raw T in, int32 h, int32 w,'
            'int32 out_h, int32 out_w, int32 kh, int32 kw,'
            'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
            'T out',
            '''
               int c0    = i / (out_h * out_w);
               int out_y = i / out_w % out_h;
               int out_x = i % out_w;
               int in_y_0 = max(0, out_y * sy - ph);
               int in_y_1 = min(h, out_y * sy + kh - ph);
               int in_x_0 = max(0, out_x * sx - pw);
               int in_x_1 = min(w, out_x * sx + kw - pw);

               float val = 0;
               for (int y = in_y_0; y < in_y_1; ++y) {
                 int offset_y = w * (y + h * c0);
                 for (int x = in_x_0; x < in_x_1; ++x) {
                   val += in[x + offset_y];
                 }
               }
               out = val * coeff;
            ''', 'avg_pool_fwd')(x[0].reduced_view(),
                                 h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff,
                                 y)
        return y,
Example #45
    def __call__(self, opt):
        if cuda.available:
            kernel = cuda.elementwise('T s, T decay', 'T g', 'g += decay * s',
                                      'lasso')

        rate = self.rate
        for param in opt.target.params():
            p, g = param.data, param.grad
            xp = cuda.get_array_module(p)
            sign = xp.sign(p)
            with cuda.get_device(p) as dev:
                if int(dev) == -1:
                    g += rate * sign
                else:
                    kernel(sign, rate, g)
Example #46
    def backward_gpu(self, inputs, grad_outputs):
        cupy = cuda.cupy
        x, t = inputs
        if hasattr(self, 'y'):
            y = self.y
        else:
            y = log_softmax._log_softmax(x)
            cupy.exp(y, out=y)
        gloss = grad_outputs[0]
        n_unit = t.size // len(t)
        if self.reduce == 'mean':
            coeff = gloss * self._coeff
        else:
            coeff = gloss[:, None, ...]

        if self.class_weight is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                ''', 'softmax_crossent_bwd')(y, cupy.expand_dims(t, 1), coeff,
                                             x.shape[1], n_unit,
                                             self.ignore_label)
        else:
            gx = cuda.elementwise(
                'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
                'S ignore_label', 'T gx', '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
                ''', 'softmax_crossent_weight_bwd')(y, self.class_weight,
                                                    cupy.expand_dims(t, 1),
                                                    coeff, x.shape[1], n_unit,
                                                    self.ignore_label)

        return gx, None
Example #47
    def backward_gpu(self, x, gy):
        if (cuda.cudnn_enabled and self.use_cudnn and
                pooling_2d._check_cudnn_acceptable_type(x[0].dtype)):
            return super(AveragePooling2D, self).backward_gpu(x, gy)

        n, c, h, w = x[0].shape
        y_h, y_w = gy[0].shape[2:]
        gx = cuda.cupy.empty_like(x[0])
        coeff = 1. / (self.kh * self.kw)
        cuda.elementwise(
            'raw T gy, int32 h, int32 w,'
            'int32 out_h, int32 out_w, int32 kh, int32 kw,'
            'int32 sy, int32 sx, int32 ph, int32 pw, T coeff',
            'T gx',
            '''
               int c0 = i / (h * w);
               int y  = i / w % h + ph;
               int x  = i % w + pw;
               int out_y_0 = max(0,     (y - kh + sy) / sy);
               int out_y_1 = min(out_h, (y      + sy) / sy);
               int out_x_0 = max(0,     (x - kw + sx) / sx);
               int out_x_1 = min(out_w, (x      + sx) / sx);
               int hc0  = out_h * c0;

               T val = 0;
               for (int out_y = out_y_0; out_y < out_y_1; ++out_y) {
                 for (int out_x = out_x_0; out_x < out_x_1; ++out_x) {
                   val = val + gy[out_x + out_w * (out_y + hc0)];
                 }
               }
               gx = val * coeff;
            ''', 'avg_pool_bwd')(gy[0].reduced_view(),
                                 h, w, y_h, y_w, self.kh, self.kw,
                                 self.sy, self.sx, self.ph, self.pw, coeff,
                                 gx)
        return gx,
Example #48
 def backward_gpu(self, x, gys):
     gx = cuda.zeros_like(x[0])
     coffset = 0
     kernel = cuda.elementwise(_args,
                               'COPY(x[idx] = y[i])',
                               'split_bwd',
                               preamble=_preamble)
     for gy in gys:
         if gy is None:
             continue
         cdimy = gy.shape[self.axis]
         if cdimy != 0:
             kernel(gy, gx, cdimy, self.cdimx, self.rdim, coffset)
         coffset += cdimy
     return gx,
Example #49
    def backward_gpu(self, inputs, grad_outputs):
        x, y, z = inputs
        gw, = grad_outputs

        gx, gy = cuda.elementwise(
            'T x, T y, T gw',
            'T gx, T gy',
            '''
               gx = y * gw;
               gy = x * gw;
            ''',
            'muladd_bwd')(x, y, gw)

        gz = gw
        return gx, gy, gz
Example #50
    def __call__(self, opt):
        if cuda.available:
            kernel = cuda.elementwise(
                'T low, T high', 
                'T p', 
                'p = (p < low) ? low : (p > high) ? high : p',
                'weight_clip')

        for param in opt.target.params():
            p = param.data
            with cuda.get_device(p) as dev:
                if int(dev) == -1:
                    numpy.clip(p, self.low, self.high, out=p)  # clip in place
                else:
                    kernel(self.low, self.high, p)
Example #51
    def forward_gpu(self, inputs):
        x, gamma, beta = inputs

        mean = x.mean(axis=(0, 1), keepdims=True)
        var = x.var(axis=(0, 1), keepdims=True) + self.eps

        normalize = cuda.elementwise(
            'T x, T var, T mean, T gamma, T beta', 'T std, T x_hat, T y',
            'std = sqrtf(var);'
            'x_hat = (x - mean) / std;'
            'y = gamma * x_hat + beta;', 'normalize')

        self.std, self.x_hat, y = normalize(x, var, mean, gamma, beta)

        return y,
Example #52
 def backward_gpu(self, x, gy):
     if cuda.cudnn_enabled and self.use_cudnn:
         gx = cuda.empty_like(x[0])
         handle = cudnn.get_handle()
         desc = cudnn.create_tensor_descriptor(_as4darray(self.y))
         libcudnn.activationBackward(handle, _mode, ctypes.c_float(1),
                                     desc.value, self.y.data.ptr,
                                     desc.value, gy[0].data.ptr,
                                     desc.value, x[0].data.ptr,
                                     ctypes.c_float(0), desc.value,
                                     gx.data.ptr)
     else:
         gx = cuda.elementwise('T x, T gy', 'T gx', 'gx = x > 0 ? gy : 0',
                               'relu_bwd')(x[0], gy[0])
     return gx,
Example #53
    def __call__(self, opt):
        if cuda.available:
            kernel = cuda.elementwise('T p, T decay', 'T g', 'g += decay * p',
                                      'weight_decay')

        rate = self.rate
        for name, param in opt.target.namedparams():
            if name == 'b' or name.endswith('/b'):
                continue
            p, g = param.data, param.grad
            with cuda.get_device(p) as dev:
                if int(dev) == -1:
                    g += rate * p
                else:
                    kernel(p, rate, g)
Example #54
 def backward_gpu(self, x, gy):
     df0 = x[0]
     df1 = x[1]
     v = x[2]
     gx0, gx1 = cuda.elementwise(
         'T df_plus, T df_minus, T v, T g', 'T gx0, T gx1', '''
             if(v>0){
                 gx0 = g;
                 gx1 = 0;
             }else{
                 gx0 = 0;
                 gx1 = g;
             }
         ''', 'upwind_b')(df0, df1, v, gy[0])
     return gx0, gx1, None
Example #55
 def backward_gpu(self, inputs, gy):
     W, log_sigma2 = inputs
     gy = gy[0]
     gW, gs = cuda.elementwise(
         'T W, T ls, T gy, T eps, T lo_th, T up_th',
         'T gW, T gs',
         '''
         T square_W = W * W + eps;
         T y = ls - log(square_W);
         gs = ((y > lo_th) & (y < up_th))? gy : (T)0;
         gW = - gs / square_W * 2 * W;
         ''',
         'log_alpha_bwd')(
             W, log_sigma2, gy,
             self.eps, self.lower_threshold, self.upper_threshold)
     return gW, gs
Example #56
 def _create_reduction_kernel(shape0, expr1, expr2):
     return cuda.elementwise(
         '''
             float* ret1, float* ret2,
             const float* x, const float* y,
             float alpha, int shape12
         ''', '''
             float sum1 = 0, sum2 = 0;
             for (int j = 0; j < {0}; j++) {{
                 int I = j * shape12 + i;
                 sum1 += {1};
                 sum2 += {2};
             }}
             ret1[i] = sum1 * alpha;
             ret2[i] = sum2 * alpha;
         '''.format(shape0, expr1, expr2), 'bn_asix02')
Example #57
    def forward_gpu(self, x):
        y = cuda.empty_like(x[0])
        if cudnn.enabled and self.use_cudnn:
            handle = cudnn.get_default_handle()
            desc = cudnn.get_tensor_desc(x[0], 1, 1)
            libcudnn.cudnnSoftmaxForward(
                handle, _algorithm, _mode, 1, desc.value, cudnn.get_ptr(x[0]),
                0, desc.value, cudnn.get_ptr(y))
            self.y = y
        else:
            maxes = cuda.empty((x[0].shape[0],), dtype=numpy.float32)
            c = x[0].shape[1]
            cuda.elementwise(
                'float* maxes, const float* x, int c',
                '''
                   const float* row = x + i * c;
                   float maxval = row[0];
                   for (int j = 1; j < c; ++j) {
                     if (maxval < row[j]) {
                       maxval = row[j];
                     }
                   }
                   maxes[i] = maxval;
                ''', 'softmax_rowmax')(maxes, x[0], c)
            cuda.elementwise(
                'float* y, const float* x, const float* maxes, int c',
                'y[i] = __expf(x[i] - maxes[i / c])',
                'softmax_exp')(y, x[0], maxes, c)
            coeff = maxes  # reuse memory
            cuda.elementwise(
                'float* coeff, const float* y, int c',
                '''
                   const float* row = y + i * c;
                   float sum = 0;
                   for (int j = 0; j < c; ++j) {
                     sum += row[j];
                   }
                   coeff[i] = 1 / sum;
                ''', 'softmax_invrowsum')(coeff, y, c)
            cuda.elementwise(
                'float* y, const float* coeff, int c', 'y[i] *= coeff[i / c]',
                'softmax_rowmul')(y, coeff, c)
            self.y = y

        return y,
Example #58
    def backward_gpu(self, inputs, grad_outputs):
        xp = cuda.get_array_module(*inputs)
        t, gloss = inputs[1], grad_outputs[0]
        self.bottom_diff = cuda.elementwise('S t, int32 dim',
                                            'raw T bottom_diff',
                                            'bottom_diff[i * dim + t] *= -1',
                                            'hinge_bwd')(t, inputs[0].shape[1],
                                                         self.bottom_diff)
        if self.norm == 'L1':
            gx = (gloss / t.shape[0]) * xp.sign(self.bottom_diff)
        elif self.norm == 'L2':
            gx = (2 * gloss / t.shape[0]) * self.bottom_diff
        else:
            raise NotImplementedError()

        return gx, None
Example #59
 def forward_gpu(self, inputs):
     log_alpha = inputs[0]
     reg = cuda.elementwise(
         'T la, T clip',
         'T reg',
         '''
         const T half = 0.5;
         const T c063576 = 0.63576;
         reg = (c063576 * 
                (tanh(((T)1.87320 + (T)1.48695 * la) * half) * half + half) 
                - half * log1p(exp(-la)) - c063576) * ((T)1.0 - clip);
         ''',
         'kl_fwd')(
             log_alpha, self.clip_mask)
     reg = utils.force_array(- reg.sum() / log_alpha.size, log_alpha.dtype)
     return reg,
Example #60
 def backward_gpu(self, inputs, grad_outputs):
     cupy = cuda.cupy
     x, t = inputs
     gloss = grad_outputs[0]
     n_unit = x.size // (x.shape[0] * x.shape[1])
     if getattr(self, 'normalize', True):
         count = x.shape[0] * n_unit
     else:
         count = x.shape[0]
     coeff = cuda.cupy.divide(gloss, count, dtype=gloss.dtype)
     gx = cuda.elementwise(
         'T y, S t, raw T coeff, S n_channel, S n_unit', 'T gx',
         'gx = coeff[0] * (y - (t == (i / n_unit % n_channel)))',
         'softmax_crossent_bwd')(self.y, cupy.expand_dims(t, 1), coeff,
                                 x.shape[1], n_unit)
     return gx, None