Example #1
    def forward_gpu(self, inputs):
        self.retain_inputs((0,))
        x = inputs[0].reshape(len(inputs[0]), -1)
        l2normsquared_kernel = cuda.reduce(
            'T x', 'T y', 'x * x', 'a + b', 'y = a', '0', 'l2normsquared'
        )
        return l2normsquared_kernel(x, axis=1),
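The seven positional arguments to cuda.reduce above follow the cupy.ReductionKernel order: input parameters, output parameters, per-element map expression ('x * x'), binary reduction ('a + b'), post-reduction assignment ('y = a'), identity ('0') and kernel name. As a CPU reference only (l2normsquared_cpu is an illustrative helper, not part of the example), the same per-row squared L2 norm in NumPy:

import numpy as np

def l2normsquared_cpu(x):
    # Flatten each sample and sum its squared elements, matching
    # l2normsquared_kernel(x, axis=1) above.
    x = x.reshape(len(x), -1)
    return (x * x).sum(axis=1)

x = np.arange(6, dtype=np.float32).reshape(2, 3)
print(l2normsquared_cpu(x))  # [ 5. 50.]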
Example #2
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs[:2]
        log_y = super(AdaptiveSoftmaxCrossEntropy, self).forward(inputs)[0]
        self.y = cupy.exp(log_y)

        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label', 'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
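The 'crossent_fwd' reduction above picks -log p(t) for every non-ignored target, sums over the batch and scales by coeff. A minimal NumPy sketch of the same computation for a 2-D log_y of shape (N, n_channel) (crossent_fwd_cpu is an illustrative helper, not part of the library):

import numpy as np

def crossent_fwd_cpu(t, log_y, coeff, ignore_label=-1):
    # Pick log p(t_n) for each sample; ignored labels contribute zero.
    mask = t != ignore_label
    picked = log_y[np.arange(len(t)), np.where(mask, t, 0)]
    return -coeff * (picked * mask).sum()

log_y = np.log(np.array([[0.7, 0.3], [0.2, 0.8]]))
t = np.array([0, 1])
print(crossent_fwd_cpu(t, log_y, coeff=1.0 / len(t)))  # ~0.2899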
Example #3
    def forward_gpu(self, inputs):
        self.retain_inputs((0,))
        x = inputs[0].reshape(len(inputs[0]), -1)
        l2normsquared_kernel = cuda.reduce(
            'T x', 'T y', 'x * x', 'a + b', 'y = a', '0', 'l2normsquared'
        )
        return l2normsquared_kernel(x, axis=1),
Example #4
    def forward_gpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)

        if self.reduce == 'mean':
            # Reduction is performed in a promoted dtype
            reduc_dtype = _reduction_dtype(x.dtype)
            if self.normalize:
                count = (t != self.ignore_label).sum(dtype=reduc_dtype)
                count = cupy.maximum(1, count)
                coeff = 1. / count
            else:
                coeff = cupy.array(1. / max(1, len(t)), dtype=reduc_dtype)
            self._coeff = coeff

            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw U coeff, '
                'S ignore_label', 'U out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = static_cast<U>(a * -coeff[0])', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
            ret = ret.astype(log_y.dtype, copy=False)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #5
def _logsumexp(a, xp, axis=None):
    vmax = xp.amax(a, axis=axis, keepdims=True)
    if xp is numpy:
        vmax += xp.log(
            xp.sum(xp.exp(a - vmax), axis=axis, keepdims=True, dtype=a.dtype))
    else:
        _logsumexp_impl = cuda.reduce('T x, T vmax', 'T y', 'exp(x - vmax)',
                                      'a + b', 'y += log(a)', '0',
                                      'logsumexp_impl')
        _logsumexp_impl(a, vmax, vmax, axis=axis, keepdims=True)
    return xp.squeeze(vmax, axis=axis)
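Both branches implement the usual log-sum-exp trick: subtracting the per-axis maximum before exponentiating avoids overflow, and on the GPU the post-map expression 'y += log(a)' adds the log of the reduced sum back onto vmax in place. A CPU-only sketch of the same computation (illustrative, not part of the example):

import numpy as np

def logsumexp_cpu(a, axis=None):
    vmax = np.amax(a, axis=axis, keepdims=True)
    out = vmax + np.log(np.sum(np.exp(a - vmax), axis=axis, keepdims=True))
    return np.squeeze(out, axis=axis)

a = np.array([[1000.0, 1000.0], [0.0, 1.0]])
print(logsumexp_cpu(a, axis=1))  # [1000.6931...  1.3132...]; no overflow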
Example #6
def _logsumexp(a, xp, axis=None):
    vmax = xp.amax(a, axis=axis, keepdims=True)
    if xp is numpy:
        vmax += xp.log(xp.sum(xp.exp(a - vmax),
                              axis=axis, keepdims=True, dtype=a.dtype))
    else:
        _logsumexp_impl = cuda.reduce(
            'T x, T vmax', 'T y',
            'exp(x - vmax)', 'a + b', 'y += log(a)', '0',
            'logsumexp_impl')
        _logsumexp_impl(a, vmax, vmax, axis=axis, keepdims=True)
    return xp.squeeze(vmax, axis=axis)
Example #7
    def forward_gpu(self, inputs):
        class_weight = backend.from_chainerx(self.class_weight)

        self.retain_inputs((0, 1))
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label',
                'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #8
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        if len(inputs) == 2:
            (x, t), tt = inputs, None
        else:
            x, t, tt = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(self.class_weight.reshape(shape),
                                       x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label', 'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #9
    def normalize(arr):
        """normalize input array and return its norm
        from https://github.com/pfnet-research/sngan_projection/blob/master/source/functions/max_sv.py#L5

        :param arr: numpy ndarray or cupy ndarray
        :return: norm of input array
        """

        norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)',
                           0, 'norm_sn')(arr)
        cuda.elementwise('T norm', 'T x', 'x /= (norm + 1e-20)',
                         'div_sn')(norm, arr)
        return norm
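The two kernels above compute the L2 norm of all elements and then divide the array by it in place, with 1e-20 guarding against division by zero. A NumPy version of the same routine for reference (normalize_cpu is an illustrative helper, not part of the original):

import numpy as np

def normalize_cpu(arr):
    norm = np.sqrt((arr * arr).sum())  # same as the 'norm_sn' reduction
    arr /= norm + 1e-20                # same as the 'div_sn' elementwise kernel
    return norm

v = np.array([3.0, 4.0])
print(normalize_cpu(v), v)  # 5.0 [0.6 0.8]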
Example #10
    def forward_gpu(self, inputs):
        x, t, W = inputs
        max_length = cuda.reduce(
            'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
            'max(a, b)', 'out = a', '0',
            'binary_hierarchical_softmax_max_length')(t, self.begins)
        max_length = cuda.to_cpu(max_length)[()]

        length = max_length * x.shape[0]
        ls = cuda.cupy.empty((length,), dtype=x.dtype)
        n_in = x.shape[1]
        wxy = cuda.cupy.empty_like(ls)
        cuda.elementwise(
            '''raw T x, raw T w, raw int32 ts, raw int32 paths,
            raw T codes, raw int32 begins, int32 c, int32 max_length''',
            'T ls, T wxy',
            '''
            int ind = i / max_length;
            int offset = i - ind * max_length;
            int t = ts[ind];

            int begin = begins[t];
            int length = begins[t + 1] - begins[t];

            if (offset < length) {
              int p = begin + offset;
              int node = paths[p];

              T wx = 0;
              for (int j = 0; j < c; ++j) {
                int w_ind[] = {node, j};
                int x_ind[] = {ind, j};
                wx += w[w_ind] * x[x_ind];
              }
              wxy = wx * codes[p];
              ls = log(1 + exp(-wxy));
            } else {
              ls = 0;
            }
            ''',
            'binary_hierarchical_softmax_forward'
        )(x, W, t, self.paths, self.codes, self.begins, n_in, max_length, ls,
          wxy)
        self.max_length = max_length
        self.wxy = wxy
        return ls.sum(),
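The 'binary_hierarchical_softmax_max_length' reduction only finds the longest code path among the sampled targets; with hypothetical begins and t arrays the NumPy equivalent is a one-liner (illustration only):

import numpy as np

begins = np.array([0, 2, 5, 6])  # hypothetical per-label path offsets
t = np.array([1, 0, 2])          # hypothetical targets in the batch
max_length = int((begins[t + 1] - begins[t]).max())
print(max_length)                # 3: label 1 owns the longest path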
Example #11
    def forward(self, inputs):
        self.retain_inputs(tuple(range(len(inputs))))

        e1 = _as_mat(inputs[0])
        e2 = _as_mat(inputs[1])
        W, gy = inputs[2], inputs[-1]

        xp = cuda.get_array_module(*inputs)
        if xp is numpy:
            # optimize: gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
            gW = numpy.einsum('ij,ik->jki', e1, e2).dot(gy)

            gy_W = numpy.tensordot(gy, W, axes=(1, 2))  # 'il,jkl->ijk'
            # optimize: ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
            ge1 = numpy.einsum('ik,ijk->ij', e2, gy_W)
            # optimize: ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
            ge2 = numpy.einsum('ij,ijk->ik', e1, gy_W)
        else:
            kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                               'in0 * in1 * in2', 'a + b', 'out = a', 0,
                               'bilinear_product')

            e1_b = e1[:, :, None, None]  # ij
            e2_b = e2[:, None, :, None]  # ik
            gy_b = gy[:, None, None, :]  # il
            W_b = W[None, :, :, :]  # jkl

            gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
            ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
            ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'

        ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW

        if len(inputs) == 6:
            V1, V2 = inputs[3], inputs[4]
            gV1 = e1.T.dot(gy)
            gV2 = e2.T.dot(gy)
            gb = gy.sum(0)
            ge1 += gy.dot(V1.T)
            ge2 += gy.dot(V2.T)
            ret += gV1, gV2, gb

        return ret
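The GPU branch replaces the einsum contractions of the CPU branch by broadcasting e1, e2, gy and W to a common (i, j, k, l) shape and reducing over different axis sets. The NumPy check below (an illustrative sketch with random data, not part of the example) confirms that the three broadcast-and-sum patterns match the commented einsum expressions:

import numpy as np

rng = np.random.default_rng(0)
e1 = rng.standard_normal((5, 2))    # (i, j)
e2 = rng.standard_normal((5, 3))    # (i, k)
gy = rng.standard_normal((5, 4))    # (i, l)
W = rng.standard_normal((2, 3, 4))  # (j, k, l)

prod = e1[:, :, None, None] * e2[:, None, :, None] * gy[:, None, None, :]
gW = prod.sum(axis=0)                                            # 'ij,ik,il->jkl'
ge1 = (e2[:, None, :, None] * W[None] * gy[:, None, None, :]).sum(axis=(2, 3))
ge2 = (e1[:, :, None, None] * W[None] * gy[:, None, None, :]).sum(axis=(1, 3))

assert np.allclose(gW, np.einsum('ij,ik,il->jkl', e1, e2, gy))
assert np.allclose(ge1, np.einsum('ik,jkl,il->ij', e2, W, gy))
assert np.allclose(ge2, np.einsum('ij,jkl,il->ik', e1, W, gy))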
Example #12
    def forward(self, inputs):
        self.retain_inputs(tuple(range(len(inputs))))

        e1 = _as_mat(inputs[0])
        e2 = _as_mat(inputs[1])
        W, gy = inputs[2], inputs[-1]

        xp = cuda.get_array_module(*inputs)
        if xp is numpy:
            # optimize: gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
            gW = numpy.einsum('ij,ik->jki', e1, e2).dot(gy)

            gy_W = numpy.tensordot(gy, W, axes=(1, 2))  # 'il,jkl->ijk'
            # optimize: ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
            ge1 = numpy.einsum('ik,ijk->ij', e2, gy_W)
            # optimize: ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
            ge2 = numpy.einsum('ij,ijk->ik', e1, gy_W)
        else:
            kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                               'in0 * in1 * in2', 'a + b', 'out = a', 0,
                               'bilinear_product')

            e1_b = e1[:, :, None, None]  # ij
            e2_b = e2[:, None, :, None]  # ik
            gy_b = gy[:, None, None, :]  # il
            W_b = W[None, :, :, :]  # jkl

            gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'
            ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
            ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'

        ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW

        if len(inputs) == 6:
            V1, V2 = inputs[3], inputs[4]
            gV1 = e1.T.dot(gy)
            gV2 = e2.T.dot(gy)
            gb = gy.sum(0)
            ge1 += gy.dot(V1.T)
            ge2 += gy.dot(V2.T)
            ret += gV1, gV2, gb

        return ret
Example #13
    def forward(self, inputs):
        self.retain_inputs((0, 1, 2))
        x_hat, gamma, gy = inputs

        batch_size, channels = gy.shape[:2]
        gy = gy.reshape((batch_size, channels, -1))

        reduced_shape = x_hat.shape
        x_hat = x_hat.reshape((batch_size, channels, -1))

        gx_hat = gy * gamma[:, None]
        gbeta = gy.sum(axis=(0, 2))
        if backend.get_array_module(x_hat) is cuda.cupy:
            ggamma = cuda.reduce('T gy, T x_hat', 'T ggamma', 'gy * x_hat',
                                 'a + b', 'ggamma = a', '0',
                                 'groupnorm_ggamma')(gy, x_hat, axis=(0, 2))
        else:
            ggamma = (gy * x_hat).sum(axis=(0, 2))

        gx_hat = gx_hat.reshape(reduced_shape)
        return gx_hat, ggamma, gbeta
Example #14
    def forward(self, inputs):
        self.retain_inputs((0, 1, 2))
        x_hat, gamma, gy = inputs

        batch_size, channels = gy.shape[:2]
        gy = gy.reshape((batch_size, channels, -1))

        reduced_shape = x_hat.shape
        x_hat = x_hat.reshape((batch_size, channels, -1))

        gx_hat = gy * gamma[:, None]
        gbeta = gy.sum(axis=(0, 2))
        if backend.get_array_module(x_hat) is cuda.cupy:
            ggamma = cuda.reduce(
                'T gy, T x_hat', 'T ggamma',
                'gy * x_hat', 'a + b', 'ggamma = a', '0',
                'groupnorm_ggamma')(gy, x_hat, axis=(0, 2))
        else:
            ggamma = (gy * x_hat).sum(axis=(0, 2))

        gx_hat = gx_hat.reshape(reduced_shape)
        return gx_hat, ggamma, gbeta
Example #15
    def update_core_gpu(self, param):
        grad = param.grad
        if grad is None:
            return

        if _ObserveZeroRule._kernel is None:
            _ObserveZeroRule._kernel = cuda.elementwise(
                'T grad, T lr, T momentum', 'T param, T v, T u',
                '''u = lr * grad;
                   v = momentum * v - u;
                   param += v;''', 'momentum_sgd')
        if _ObserveZeroRule._nzu_kernel is None:
            _ObserveZeroRule._nzu_kernel = cuda.reduce('T grad, T u',
                                                       'int32 n',
                                                       'grad != 0 & u == 0',
                                                       'a + b', 'n = a', '0',
                                                       'nzu')

        #pylint: disable=not-callable
        _ObserveZeroRule._kernel(grad, self.hyperparam.lr,
                                 self.hyperparam.momentum, param.data,
                                 self.state['v'], self.state['u'])
        self.state['nzu'] = _ObserveZeroRule._nzu_kernel(grad, self.state['u'])
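The 'nzu' reduction counts parameters whose gradient is non-zero while the momentum update u came out exactly zero. A NumPy equivalent with hypothetical values (illustration only):

import numpy as np

grad = np.array([0.0, 1e-30, 2.0, -3.0])
u = np.array([0.0, 0.0, 0.2, -0.3])
nzu = int(((grad != 0) & (u == 0)).sum())
print(nzu)  # 1: only the second element has grad != 0 but u == 0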
Example #16
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs[:2]
        log_y = super(AdaptiveSoftmaxCrossEntropy, self).forward(inputs)[0]
        self.y = cupy.exp(log_y)

        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label',
                'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #17
def numerical_grad(f,
                   inputs,
                   grad_outputs,
                   eps=1e-3,
                   detect_nondifferentiable=False,
                   diff_atol=0,
                   diff_rtol=1e-2,
                   center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order of
    ``eps``.

    Args:
        f (callable): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays or scalars): Tuple of arrays or scalars
            that are treated as output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``input``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float):
            Absolute tolerance of fitting error of non-differentiable point
            detection.
        diff_rtol (float):
            Tolerance of fitting error of non-differentiable point detection
            relative to the output values of ``f``.
        center_outputs (tuple of arrays or None):
            Only used if ``detect_nondifferentiable`` is ``True``.
            If specified, these arrays are used as the outputs of ``f`` at
            ``inputs``.
            Otherwise, it is calculated.
            It can be used to reduce the computation if these arrays are
            already calculated before calling ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    # TODO(niboshi): Deprecate `center_outputs` argument.
    # If dtype of this argument is not float64, often the resolution is
    # insufficient for numerical gradient calculation. We might use it only
    # when its dtype is float64, but it would be better to simply remove it.
    center_outputs = None

    assert eps > 0
    assert isinstance(inputs, (tuple, list))
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    # Cast grad_outputs to float64
    grad_outputs = tuple([
        None if g is None else
        numpy.float64(g) if numpy.isscalar(g) else g.astype(numpy.float64)
        for g in grad_outputs
    ])

    if not chainer.is_arrays_compatible(
        [a for a in inputs + grad_outputs if not numpy.isscalar(a)]):
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    device = backend.get_device_from_array(*(inputs + grad_outputs))
    xp = device.xp

    if xp is cuda.cupy:
        numerical_grad_kernel_1 = cuda.reduce('T y1, T y2, U gy, T eps',
                                              'V gxi', '(y1 - y2) * gy',
                                              'a + b', 'gxi += a / (eps * 2)',
                                              '0', 'numerical_grad_kernel_1')
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy', 'a + b',
            'gxi += a / (eps * 6)', '0', 'numerical_grad_kernel_3')

    if xp is chainerx:
        grads = [
            xp.zeros(x.shape, numpy.float64, device=x.device) for x in inputs
        ]
    else:
        grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [y0.shape for y0 in ys0]
        sizes = numpy.array([y0.size for y0 in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, x_ind, delta, orig):
        x[x_ind] = orig + delta
        ys = _copy_arrays(f())
        assert len(ys) == len(grad_outputs)
        assert all(
            [gy is None for y, gy in zip(ys, grad_outputs) if y is None])
        assert all([
            gy is None or numpy.isscalar(gy) or y.shape == gy.shape
            for y, gy in zip(ys, grad_outputs)
        ])
        x[x_ind] = orig
        return ys

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, x_ind):
        orig = orig_x[x_ind]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, x_ind, -eps * 1., orig),
                eval_func(x, x_ind, -eps * .5, orig),
                ys0,
                eval_func(x, x_ind, +eps * .5, orig),
                eval_func(x, x_ind, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, x_ind, -eps * 1, orig),
                eval_func(x, x_ind, +eps * 1, orig),
            ]

        assert all([
            y is None
            or (y.shape == yss[0][i].shape and y.dtype == yss[0][i].dtype)
            for ys in yss for i, y in enumerate(ys)
        ])

        # If all the outputs are 0-size, skip non-differentiable check.
        if all([y is None or y.size == 0 for y in yss[0]]):
            detect_nondifferentiable_ = False
        else:
            detect_nondifferentiable_ = detect_nondifferentiable

        if detect_nondifferentiable_:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write('Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(x_ind))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all((_).all() for _ in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if xp is not numpy:
                    ystack = _cpu._to_cpu(ystack)
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(range(len(yss)),
                                                  ystack,
                                                  deg=2,
                                                  full=True)
                if xp is not numpy:
                    residuals = device.send(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    # TODO(niboshi): The following two lines could be
                    # rewritten using xp.stack, which is supported in
                    # NumPy>=1.10
                    ymax = xp.concatenate([ys[i_out][None]
                                           for ys in yss]).max(axis=0)
                    ymin = xp.concatenate([ys[i_out][None]
                                           for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = utils.force_array(diff_atol + diff_rtol *
                                            (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write('Tried to compute the numeric gradient on a '
                                'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(x_ind))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'.format(
                                diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            if not numpy.isscalar(gy):
                gy = gy.astype(numpy.float64, copy=False)
            gpu_ = (xp is cuda.cupy
                    and all(isinstance(ys[i_out], cuda.ndarray) for ys in yss))
            # If any output sample is None, all others must be.
            assert all([(yss[0][i_out] is None) == (yss[j][i_out] is None)
                        for j in range(len(yss))])
            # If outputs samples are None, the part of numeric gradient for
            # this output is considered as zero: skip the accumulation.
            if yss[0][i_out] is None:
                continue

            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(y1, y0, xp.asarray(gy), eps,
                                            gx[x_ind])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[x_ind] = gx[x_ind] + dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(y3, y2, y1, y0, gy, eps, gx[x_ind])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[x_ind] = gx[x_ind] + dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for x_ind in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, x_ind)

    return [
        g.astype(x.dtype, copy=False) for g, x in six.moves.zip(grads, inputs)
    ]
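On the GPU path the two reduction kernels accumulate the same central differences as the CPU branch: with two samples the gradient uses (f(x+eps) - f(x-eps)) / (2*eps), and with five samples (taken at ±eps and ±eps/2) it uses the stencil (-f(x+eps) + 8*f(x+eps/2) - 8*f(x-eps/2) + f(x-eps)) / (6*eps). A quick scalar check in NumPy (illustrative only, not part of the function):

import numpy as np

f = np.sin
x, eps = 0.7, 1e-3
d2 = (f(x + eps) - f(x - eps)) / (2 * eps)
d5 = (-f(x + eps) + 8 * f(x + eps / 2)
      - 8 * f(x - eps / 2) + f(x - eps)) / (6 * eps)
print(abs(d2 - np.cos(x)), abs(d5 - np.cos(x)))  # the 5-point value is closer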
Example #18
    def normalize(arr, axis):
        norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)',
                           0, 'norm_conv')(arr, axis=axis, keepdims=True)
        cuda.elementwise('T norm', 'T x', 'x /= (norm + 1e-20)',
                         'div_conv_norm')(norm, arr)
        return norm
Example #19
def numerical_grad(
        f, inputs, grad_outputs, eps=1e-3,
        detect_nondifferentiable=False, diff_atol=0, diff_rtol=1e-2,
        center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order of
    ``eps``.

    Args:
        f (callable): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays or scalars): Tuple of arrays or scalars
            that are treated as output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``input``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float):
            Absolute tolerance of fitting error of non-differentiable point
            detection.
        diff_rtol (float):
            Tolerance of fitting error of non-differentiable point detection
            relative to the output values of ``f``.
        center_outputs (tuple of arrays or None):
            Only used if ``detect_nondifferentiable`` is ``True``.
            If specified, these arrays are used as the outputs of ``f`` at
            ``inputs``.
            Otherwise, it is calculated.
            It can be used to reduce the computation if these arrays are
            already calculated before calling ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    # TODO(niboshi): Deprecate `center_outputs` argument.
    # If dtype of this argument is not float64, often the resolution is
    # insufficient for numerical gradient calculation. We might use it only
    # when its dtype is float64, but it would be better to simply remove it.
    center_outputs = None

    assert eps > 0
    assert isinstance(inputs, (tuple, list))
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    # Cast grad_outputs to float64
    grad_outputs = tuple([
        None if g is None
        else numpy.float64(g) if numpy.isscalar(g)
        else g.astype(numpy.float64)
        for g in grad_outputs])

    if not chainer.is_arrays_compatible(
            [a for a in inputs + grad_outputs if not numpy.isscalar(a)]):
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    device = backend.get_device_from_array(*(inputs + grad_outputs))
    xp = device.xp

    if xp is cuda.cupy:
        numerical_grad_kernel_1 = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel_1'
        )
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy',
            'a + b', 'gxi += a / (eps * 6)', '0',
            'numerical_grad_kernel_3'
        )

    if xp is chainerx:
        grads = [
            xp.zeros(x.shape, numpy.float64, device=x.device) for x in inputs]
    else:
        grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [_.shape for _ in ys0]
        sizes = numpy.array([_.size for _ in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, i, delta, orig):
        x[i] = orig + delta
        y = _copy_arrays(f())
        assert len(y) == len(grad_outputs)
        assert all([
            gy is None
            for y_, gy in zip(y, grad_outputs)
            if y_ is None])
        assert all([
            gy is None or numpy.isscalar(gy) or y_.shape == gy.shape
            for y_, gy in zip(y, grad_outputs)])
        x[i] = orig
        return y

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, i):
        orig = orig_x[i]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, i, -eps * 1., orig),
                eval_func(x, i, -eps * .5, orig),
                ys0,
                eval_func(x, i, +eps * .5, orig),
                eval_func(x, i, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, i, -eps * 1, orig),
                eval_func(x, i, +eps * 1, orig),
            ]

        if detect_nondifferentiable:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write(
                        'Tried to compute the numeric gradient on a '
                        'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(i))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all((_).all() for _ in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if xp is not numpy:
                    ystack = _cpu._to_cpu(ystack)
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(
                    range(len(yss)), ystack, deg=2, full=True)
                if xp is not numpy:
                    residuals = device.send(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    # TODO(niboshi): The following two lines could be
                    # rewritten using xp.stack, which is supported in
                    # NumPy>=1.10
                    ymax = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).max(axis=0)
                    ymin = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = utils.force_array(
                        diff_atol + diff_rtol * (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write(
                            'Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(i))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'.format(
                                diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            if not numpy.isscalar(gy):
                gy = gy.astype(numpy.float64, copy=False)
            gpu_ = (xp is cuda.cupy and
                    all(isinstance(ys[i_out], cuda.ndarray)
                        for ys in yss))
            # If any output sample is None, all others must be.
            assert all([
                (yss[0][i_out] is None) == (yss[j][i_out] is None)
                for j in range(len(yss))])
            # If outputs samples are None, the part of numeric gradient for
            # this output is considered as zero: skip the accumulation.
            if yss[0][i_out] is None:
                continue

            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(
                        y1, y0, xp.asarray(gy), eps, gx[i])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[i] = gx[i] + dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(
                        y3, y2, y1, y0, gy, eps, gx[i])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[i] = gx[i] + dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for i in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, i)

    return [g.astype(x.dtype, copy=False)
            for g, x in six.moves.zip(grads, inputs)]
Example #20
def numerical_grad(f,
                   inputs,
                   grad_outputs,
                   eps=1e-3,
                   detect_nondifferentiable=False,
                   diff_atol=0,
                   diff_rtol=1e-2,
                   center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order of
    ``eps``.

    Args:
        f (function): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays): Tuple of arrays that are treated as
            output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``input``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float):
            Absolute tolerance of fitting error of non-differentiable point
            detection.
        diff_rtol (float):
            Tolerance of fitting error of non-differentiable point detection
            relative to the output values of ``f``.
        center_outputs (tuple of arrays or None):
            Only used if ``detect_nondifferentiable`` is ``True``.
            If specified, these arrays are used as the outputs of ``f`` at
            ``inputs``.
            Otherwise, it is calculated.
            It can be used to reduce the computation if these arrays are
            already calculated before calling ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    assert eps > 0
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    grad_outputs = tuple(grad_outputs)
    gpu = any(isinstance(x, cuda.ndarray) for x in inputs + grad_outputs)
    cpu = any(isinstance(x, numpy.ndarray) for x in inputs + grad_outputs)

    if gpu and cpu:
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    if gpu:
        xp = cuda.cupy
        numerical_grad_kernel_1 = cuda.reduce('T y1, T y2, U gy, T eps',
                                              'V gxi', '(y1 - y2) * gy',
                                              'a + b', 'gxi += a / (eps * 2)',
                                              '0', 'numerical_grad_kernel_1')
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy', 'a + b',
            'gxi += a / (eps * 6)', '0', 'numerical_grad_kernel_3')
    else:
        xp = numpy
    grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [_.shape for _ in ys0]
        sizes = numpy.array([_.size for _ in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, i, delta, orig):
        x[i] = orig + delta
        y = _copy_arrays(f())
        x[i] = orig
        return y

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, i):
        orig = orig_x[i]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, i, -eps * 1., orig),
                eval_func(x, i, -eps * .5, orig),
                ys0,
                eval_func(x, i, +eps * .5, orig),
                eval_func(x, i, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, i, -eps * 1, orig),
                eval_func(x, i, +eps * 1, orig),
            ]

        if detect_nondifferentiable:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write('Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(i))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all(_.all() for _ in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if gpu:
                    ystack = ystack.get()
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(range(len(yss)),
                                                  ystack,
                                                  deg=2,
                                                  full=True)
                if gpu:
                    residuals = xp.array(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    ymax = xp.stack([ys[i_out] for ys in yss]).max(axis=0)
                    ymin = xp.stack([ys[i_out] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = xp.asarray(diff_atol + diff_rtol *
                                     (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write('Tried to compute the numeric gradient on a '
                                'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(i))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'.format(
                                diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            gpu_ = (gpu
                    and all(isinstance(ys[i_out], cuda.ndarray) for ys in yss))
            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(y1, y0, xp.asarray(gy), eps, gx[i])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[i] += dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(y3, y2, y1, y0, gy, eps, gx[i])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[i] += dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for i in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, i)

    return [
        g.astype(x.dtype, copy=False) for g, x in six.moves.zip(grads, inputs)
    ]