def forward_gpu(self, inputs):
    self.retain_inputs((0,))
    x = inputs[0].reshape(len(inputs[0]), -1)
    l2normsquared_kernel = cuda.reduce(
        'T x', 'T y', 'x * x', 'a + b', 'y = a', '0', 'l2normsquared'
    )
    return l2normsquared_kernel(x, axis=1),
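# Hedged usage sketch (not part of the snippet above; assumes CuPy is
# installed): chainer's cuda.reduce builds a cupy.ReductionKernel, so the
# 'l2normsquared' kernel computes a per-row squared L2 norm. Cross-checked
# against the equivalent NumPy expression:
import numpy
import cupy
from chainer.backends import cuda

x = cupy.random.randn(4, 3).astype(numpy.float32)
l2normsquared = cuda.reduce(
    'T x', 'T y', 'x * x', 'a + b', 'y = a', '0', 'l2normsquared')
y = l2normsquared(x, axis=1)
assert numpy.allclose(cupy.asnumpy(y), (cupy.asnumpy(x) ** 2).sum(axis=1),
                      atol=1e-5)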
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs[:2]
    log_y = super(AdaptiveSoftmaxCrossEntropy, self).forward(inputs)[0]
    self.y = cupy.exp(log_y)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label',
            'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1],
          self._coeff, self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def forward_gpu(self, inputs):
    class_weight = backend.from_chx(self.class_weight)

    self.retain_inputs((0, 1))
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)

    if self.reduce == 'mean':
        # Reduction is performed in a promoted dtype
        reduc_dtype = _reduction_dtype(x.dtype)
        if self.normalize:
            count = (t != self.ignore_label).sum(dtype=reduc_dtype)
            count = cupy.maximum(1, count)
            coeff = 1. / count
        else:
            coeff = cupy.array(1. / max(1, len(t)), dtype=reduc_dtype)
        self._coeff = coeff

        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw U coeff, '
            'S ignore_label',
            'U out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = static_cast<U>(a * -coeff[0])', '0',
            'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1],
          self._coeff, self.ignore_label)
        ret = ret.astype(log_y.dtype, copy=False)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def _logsumexp(a, xp, axis=None):
    vmax = xp.amax(a, axis=axis, keepdims=True)
    if xp is numpy:
        vmax += xp.log(xp.sum(xp.exp(a - vmax),
                              axis=axis, keepdims=True, dtype=a.dtype))
    else:
        _logsumexp_impl = cuda.reduce(
            'T x, T vmax', 'T y',
            'exp(x - vmax)', 'a + b', 'y += log(a)', '0',
            'logsumexp_impl')
        _logsumexp_impl(a, vmax, vmax, axis=axis, keepdims=True)
    return xp.squeeze(vmax, axis=axis)
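# Hedged sketch (illustration only, plain NumPy; `_logsumexp_numpy` is a
# hypothetical name): the CPU branch above is the usual max-shifted
# log-sum-exp, which can be cross-checked directly against log(sum(exp(a))).
import numpy

def _logsumexp_numpy(a, axis=None):
    vmax = numpy.amax(a, axis=axis, keepdims=True)
    vmax += numpy.log(numpy.sum(numpy.exp(a - vmax),
                                axis=axis, keepdims=True, dtype=a.dtype))
    return numpy.squeeze(vmax, axis=axis)

a = numpy.random.randn(5, 7).astype(numpy.float32)
ref = numpy.log(numpy.exp(a).sum(axis=1))
assert numpy.allclose(_logsumexp_numpy(a, axis=1), ref, atol=1e-5)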
def forward_gpu(self, inputs):
    class_weight = backend.from_chainerx(self.class_weight)

    self.retain_inputs((0, 1))
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label',
            'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1],
          self._coeff, self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    if len(inputs) == 2:
        (x, t), tt = inputs, None
    else:
        x, t, tt = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label',
            'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1],
          self._coeff, self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
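# Hedged NumPy sketch (illustration only; `crossent_fwd_numpy` is a
# hypothetical helper, not library code): the 'crossent_fwd' reduction used in
# the mean branches above sums -log_y[j, t[j]] over samples whose label is not
# ignore_label and scales the total by coeff (typically 1/N).
import numpy

def crossent_fwd_numpy(log_y, t, coeff, ignore_label=-1):
    # log_y: (N, C) log-probabilities, t: (N,) integer labels
    picked = log_y[numpy.arange(len(t)), numpy.where(t == ignore_label, 0, t)]
    picked = numpy.where(t == ignore_label, 0.0, picked)
    return -coeff * picked.sum()

log_y = numpy.log(numpy.full((3, 4), 0.25, dtype=numpy.float32))
t = numpy.array([0, 2, -1], dtype=numpy.int32)
loss = crossent_fwd_numpy(log_y, t, coeff=1.0 / 3, ignore_label=-1)
assert numpy.isclose(loss, 2 * numpy.log(4) / 3)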
def normalize(arr):
    """Normalize the input array in place and return its L2 norm.

    From https://github.com/pfnet-research/sngan_projection/blob/master/source/functions/max_sv.py#L5

    :param arr: numpy ndarray or cupy ndarray
    :return: norm of input array
    """
    norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)',
                       0, 'norm_sn')(arr)
    cuda.elementwise('T norm', 'T x', 'x /= (norm + 1e-20)',
                     'div_sn')(norm, arr)
    return norm
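# Hedged usage sketch (assumes CuPy and the `normalize` defined just above,
# with chainer's cuda module imported as in that snippet): the reduce kernel
# returns the full L2 norm and the elementwise kernel rescales `arr` in place
# to approximately unit norm.
import numpy
import cupy

arr = cupy.random.randn(16).astype(numpy.float32)
expected_norm = float(cupy.linalg.norm(arr))
norm = normalize(arr)
assert numpy.isclose(float(norm), expected_norm, rtol=1e-5)
assert numpy.isclose(float(cupy.linalg.norm(arr)), 1.0, rtol=1e-4)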
def forward_gpu(self, inputs):
    x, t, W = inputs
    max_length = cuda.reduce(
        'T t, raw T begins', 'T out', 'begins[t + 1] - begins[t]',
        'max(a, b)', 'out = a', '0',
        'binary_hierarchical_softmax_max_length')(t, self.begins)
    max_length = cuda.to_cpu(max_length)[()]

    length = max_length * x.shape[0]
    ls = cuda.cupy.empty((length,), dtype=x.dtype)
    n_in = x.shape[1]
    wxy = cuda.cupy.empty_like(ls)
    cuda.elementwise(
        '''raw T x, raw T w, raw int32 ts, raw int32 paths,
        raw T codes, raw int32 begins, int32 c, int32 max_length''',
        'T ls, T wxy',
        '''
        int ind = i / max_length;
        int offset = i - ind * max_length;
        int t = ts[ind];

        int begin = begins[t];
        int length = begins[t + 1] - begins[t];

        if (offset < length) {
          int p = begin + offset;
          int node = paths[p];

          T wx = 0;
          for (int j = 0; j < c; ++j) {
            int w_ind[] = {node, j};
            int x_ind[] = {ind, j};
            wx += w[w_ind] * x[x_ind];
          }
          wxy = wx * codes[p];
          ls = log(1 + exp(-wxy));
        } else {
          ls = 0;
        }
        ''',
        'binary_hierarchical_softmax_forward'
    )(x, W, t, self.paths, self.codes, self.begins, n_in, max_length, ls, wxy)
    self.max_length = max_length
    self.wxy = wxy
    return ls.sum(),
def forward(self, inputs):
    self.retain_inputs(tuple(range(len(inputs))))

    e1 = _as_mat(inputs[0])
    e2 = _as_mat(inputs[1])
    W, gy = inputs[2], inputs[-1]

    xp = cuda.get_array_module(*inputs)
    if xp is numpy:
        # optimize: gW = numpy.einsum('ij,ik,il->jkl', e1, e2, gy)
        gW = numpy.einsum('ij,ik->jki', e1, e2).dot(gy)

        gy_W = numpy.tensordot(gy, W, axes=(1, 2))  # 'il,jkl->ijk'
        # optimize: ge1 = numpy.einsum('ik,jkl,il->ij', e2, W, gy)
        ge1 = numpy.einsum('ik,ijk->ij', e2, gy_W)
        # optimize: ge2 = numpy.einsum('ij,jkl,il->ik', e1, W, gy)
        ge2 = numpy.einsum('ij,ijk->ik', e1, gy_W)
    else:
        kern = cuda.reduce('T in0, T in1, T in2', 'T out',
                           'in0 * in1 * in2', 'a + b', 'out = a', 0,
                           'bilinear_product')

        e1_b = e1[:, :, None, None]  # ij
        e2_b = e2[:, None, :, None]  # ik
        gy_b = gy[:, None, None, :]  # il
        W_b = W[None, :, :, :]  # jkl

        gW = kern(e1_b, e2_b, gy_b, axis=0)  # 'ij,ik,il->jkl'

        ge1 = kern(e2_b, W_b, gy_b, axis=(2, 3))  # 'ik,jkl,il->ij'
        ge2 = kern(e1_b, W_b, gy_b, axis=(1, 3))  # 'ij,jkl,il->ik'
    ret = ge1.reshape(inputs[0].shape), ge2.reshape(inputs[1].shape), gW

    if len(inputs) == 6:
        V1, V2 = inputs[3], inputs[4]
        gV1 = e1.T.dot(gy)
        gV2 = e2.T.dot(gy)
        gb = gy.sum(0)
        ge1 += gy.dot(V1.T)
        ge2 += gy.dot(V2.T)
        ret += gV1, gV2, gb

    return ret
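# Hedged NumPy sketch (illustration only): the GPU reductions above are
# broadcasted elementwise products summed over the axes noted in the inline
# comments, which coincide with the einsum forms used in the CPU branch.
import numpy

e1 = numpy.random.randn(2, 3)    # (i, j)
e2 = numpy.random.randn(2, 4)    # (i, k)
gy = numpy.random.randn(2, 5)    # (i, l)
W = numpy.random.randn(3, 4, 5)  # (j, k, l)

gW = (e1[:, :, None, None] * e2[:, None, :, None]
      * gy[:, None, None, :]).sum(axis=0)
assert numpy.allclose(gW, numpy.einsum('ij,ik,il->jkl', e1, e2, gy))

ge1 = (e2[:, None, :, None] * W[None] * gy[:, None, None, :]).sum(axis=(2, 3))
assert numpy.allclose(ge1, numpy.einsum('ik,jkl,il->ij', e2, W, gy))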
def forward(self, inputs):
    self.retain_inputs((0, 1, 2))
    x_hat, gamma, gy = inputs

    batch_size, channels = gy.shape[:2]
    gy = gy.reshape((batch_size, channels, -1))
    reduced_shape = x_hat.shape
    x_hat = x_hat.reshape((batch_size, channels, -1))

    gx_hat = gy * gamma[:, None]
    gbeta = gy.sum(axis=(0, 2))
    if backend.get_array_module(x_hat) is cuda.cupy:
        ggamma = cuda.reduce(
            'T gy, T x_hat', 'T ggamma',
            'gy * x_hat', 'a + b', 'ggamma = a', '0',
            'groupnorm_ggamma')(gy, x_hat, axis=(0, 2))
    else:
        ggamma = (gy * x_hat).sum(axis=(0, 2))

    gx_hat = gx_hat.reshape(reduced_shape)
    return gx_hat, ggamma, gbeta
def update_core_gpu(self, param):
    grad = param.grad
    if grad is None:
        return
    if _ObserveZeroRule._kernel is None:
        _ObserveZeroRule._kernel = cuda.elementwise(
            'T grad, T lr, T momentum',
            'T param, T v, T u',
            '''u = lr * grad;
               v = momentum * v - u;
               param += v;''',
            'momentum_sgd')
    if _ObserveZeroRule._nzu_kernel is None:
        _ObserveZeroRule._nzu_kernel = cuda.reduce(
            'T grad, T u', 'int32 n',
            'grad != 0 & u == 0', 'a + b', 'n = a', '0',
            'nzu')
    # pylint: disable=not-callable
    _ObserveZeroRule._kernel(
        grad, self.hyperparam.lr, self.hyperparam.momentum,
        param.data, self.state['v'], self.state['u'])
    self.state['nzu'] = _ObserveZeroRule._nzu_kernel(grad, self.state['u'])
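# Hedged NumPy sketch (illustration only): because `!=` and `==` bind tighter
# than `&` in CUDA C, the 'nzu' map expression above groups as
# (grad != 0) & (u == 0), i.e. it counts parameters whose gradient is nonzero
# while the momentum-SGD update `u` is exactly zero.
import numpy

grad = numpy.array([0.0, 1.5, -2.0, 3.0], dtype=numpy.float32)
u = numpy.array([0.0, 0.0, 0.1, 0.0], dtype=numpy.float32)
nzu = int(((grad != 0) & (u == 0)).sum())
assert nzu == 2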
def numerical_grad(f, inputs, grad_outputs, eps=1e-3,
                   detect_nondifferentiable=False, diff_atol=0,
                   diff_rtol=1e-2, center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order of
    ``eps``.

    Args:
        f (callable): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays or scalars): Tuple of arrays or scalars
            that are treated as output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``input``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float): Absolute tolerance of fitting error of
            non-differentiable point detection.
        diff_rtol (float): Tolerance of fitting error of non-differentiable
            point detection relative to the output values of ``f``.
        center_outputs (tuple of arrays or None): Only used if
            ``detect_nondifferentiable`` is ``True``. If specified, these
            arrays are used as the outputs of ``f`` at ``inputs``. Otherwise,
            it is calculated. It can be used to reduce the computation if
            these arrays are already calculated before calling
            ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    # TODO(niboshi): Deprecate `center_outputs` argument.
    # If dtype of this argument is not float64, often the resolution is
    # insufficient for numerical gradient calculation. We might use it only
    # when its dtype is float64, but it would be better to simply remove it.
    center_outputs = None

    assert eps > 0
    assert isinstance(inputs, (tuple, list))
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    # Cast grad_outputs to float64
    grad_outputs = tuple([
        None if g is None
        else numpy.float64(g) if numpy.isscalar(g)
        else g.astype(numpy.float64)
        for g in grad_outputs])

    if not chainer.is_arrays_compatible(
            [a for a in inputs + grad_outputs if not numpy.isscalar(a)]):
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    device = backend.get_device_from_array(*(inputs + grad_outputs))
    xp = device.xp

    if xp is cuda.cupy:
        numerical_grad_kernel_1 = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel_1')
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy',
            'a + b', 'gxi += a / (eps * 6)', '0',
            'numerical_grad_kernel_3')

    if xp is chainerx:
        grads = [
            xp.zeros(x.shape, numpy.float64, device=x.device)
            for x in inputs]
    else:
        grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [y0.shape for y0 in ys0]
        sizes = numpy.array([y0.size for y0 in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, x_ind, delta, orig):
        x[x_ind] = orig + delta
        ys = _copy_arrays(f())
        assert len(ys) == len(grad_outputs)
        assert all([
            gy is None
            for y, gy in zip(ys, grad_outputs)
            if y is None])
        assert all([
            gy is None or numpy.isscalar(gy) or y.shape == gy.shape
            for y, gy in zip(ys, grad_outputs)])
        x[x_ind] = orig
        return ys

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, x_ind):
        orig = orig_x[x_ind]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, x_ind, -eps * 1., orig),
                eval_func(x, x_ind, -eps * .5, orig),
                ys0,
                eval_func(x, x_ind, +eps * .5, orig),
                eval_func(x, x_ind, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, x_ind, -eps * 1, orig),
                eval_func(x, x_ind, +eps * 1, orig),
            ]

        assert all([
            y is None
            or (y.shape == yss[0][i].shape and y.dtype == yss[0][i].dtype)
            for ys in yss
            for i, y in enumerate(ys)])

        # If all the outputs are 0-size, skip non-differentiable check.
        if all([y is None or y.size == 0 for y in yss[0]]):
            detect_nondifferentiable_ = False
        else:
            detect_nondifferentiable_ = detect_nondifferentiable

        if detect_nondifferentiable_:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write(
                        'Tried to compute the numeric gradient on a '
                        'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(x_ind))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all((_).all() for _ in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if xp is not numpy:
                    ystack = _cpu._to_cpu(ystack)
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(
                    range(len(yss)), ystack, deg=2, full=True)
                if xp is not numpy:
                    residuals = device.send(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    # TODO(niboshi): The following two lines could be
                    # rewritten using xp.stack, which is supported in
                    # NumPy>=1.10
                    ymax = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).max(axis=0)
                    ymin = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = utils.force_array(
                        diff_atol + diff_rtol * (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write(
                            'Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(x_ind))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'
                            ''.format(diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            if not numpy.isscalar(gy):
                gy = gy.astype(numpy.float64, copy=False)
            gpu_ = (xp is cuda.cupy and
                    all(isinstance(ys[i_out], cuda.ndarray)
                        for ys in yss))

            # If any output sample is None, all others must be.
            assert all([
                (yss[0][i_out] is None) == (yss[j][i_out] is None)
                for j in range(len(yss))])
            # If outputs samples are None, the part of numeric gradient for
            # this output is considered as zero: skip the accumulation.
            if yss[0][i_out] is None:
                continue

            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(
                        y1, y0, xp.asarray(gy), eps, gx[x_ind])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[x_ind] = gx[x_ind] + dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(
                        y3, y2, y1, y0, gy, eps, gx[x_ind])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[x_ind] = gx[x_ind] + dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for x_ind in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, x_ind)

    return [g.astype(x.dtype, copy=False)
            for g, x in six.moves.zip(grads, inputs)]
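# Hedged usage sketch (plain NumPy, illustration only): numerical_grad
# evaluates f at x +/- eps and accumulates (f(x+eps) - f(x-eps)) * gy / (2*eps)
# per element. For y = (x ** 2).sum() the numeric gradient should be close to
# 2 * x.
import numpy
from chainer import gradient_check

x = numpy.random.randn(3).astype(numpy.float64)
gy = numpy.ones((), dtype=numpy.float64)

def f():
    return numpy.asarray((x ** 2).sum()),

gx, = gradient_check.numerical_grad(f, (x,), (gy,))
assert numpy.allclose(gx, 2 * x, atol=1e-4)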
def normalize(arr, axis):
    norm = cuda.reduce('T x', 'T out', 'x * x', 'a + b', 'out = sqrt(a)',
                       0, 'norm_conv')(arr, axis=axis, keepdims=True)
    cuda.elementwise('T norm', 'T x', 'x /= (norm + 1e-20)',
                     'div_conv_norm')(norm, arr)
    return norm
def numerical_grad(f, inputs, grad_outputs, eps=1e-3,
                   detect_nondifferentiable=False, diff_atol=0,
                   diff_rtol=1e-2, center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order of
    ``eps``.

    Args:
        f (function): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays): Tuple of arrays that are treated as
            output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``input``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float): Absolute tolerance of fitting error of
            non-differentiable point detection.
        diff_rtol (float): Tolerance of fitting error of non-differentiable
            point detection relative to the output values of ``f``.
        center_outputs (tuple of arrays or None): Only used if
            ``detect_nondifferentiable`` is ``True``. If specified, these
            arrays are used as the outputs of ``f`` at ``inputs``. Otherwise,
            it is calculated. It can be used to reduce the computation if
            these arrays are already calculated before calling
            ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    assert eps > 0
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    grad_outputs = tuple(grad_outputs)
    gpu = any(isinstance(x, cuda.ndarray) for x in inputs + grad_outputs)
    cpu = any(isinstance(x, numpy.ndarray) for x in inputs + grad_outputs)

    if gpu and cpu:
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    if gpu:
        xp = cuda.cupy
        numerical_grad_kernel_1 = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel_1')
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy',
            'a + b', 'gxi += a / (eps * 6)', '0',
            'numerical_grad_kernel_3')
    else:
        xp = numpy
    grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [_.shape for _ in ys0]
        sizes = numpy.array([_.size for _ in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, i, delta, orig):
        x[i] = orig + delta
        y = _copy_arrays(f())
        x[i] = orig
        return y

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, i):
        orig = orig_x[i]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, i, -eps * 1., orig),
                eval_func(x, i, -eps * .5, orig),
                ys0,
                eval_func(x, i, +eps * .5, orig),
                eval_func(x, i, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, i, -eps * 1, orig),
                eval_func(x, i, +eps * 1, orig),
            ]

        if detect_nondifferentiable:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write('Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(i))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all(_.all() for _ in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if gpu:
                    ystack = ystack.get()
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(
                    range(len(yss)), ystack, deg=2, full=True)
                if gpu:
                    residuals = xp.array(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    ymax = xp.stack([ys[i_out] for ys in yss]).max(axis=0)
                    ymin = xp.stack([ys[i_out] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = xp.asarray(
                        diff_atol + diff_rtol * (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write('Tried to compute the numeric gradient on a '
                                'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(i))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'
                            ''.format(diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            gpu_ = (gpu and
                    all(isinstance(ys[i_out], cuda.ndarray)
                        for ys in yss))
            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(
                        y1, y0, xp.asarray(gy), eps, gx[i])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[i] += dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(
                        y3, y2, y1, y0, gy, eps, gx[i])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[i] += dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for i in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, i)

    return [g.astype(x.dtype, copy=False)
            for g, x in six.moves.zip(grads, inputs)]