def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    t_valid = t != self.ignore_label
    t = t * t_valid
    log_p = log_yd[t.ravel(), numpy.arange(t.size)]
    log_p *= t_valid.ravel()
    if self.reduce == 'mean':
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = t_valid.sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        y = log_p.sum(keepdims=True) * (-self._coeff)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = log_softmax._log_softmax(x, self.use_cudnn)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.class_weight is not None:
        if self.class_weight.shape != x.shape:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            self.class_weight = numpy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
        log_y *= self.class_weight
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]
    # deal with the case where the SoftmaxCrossEntropy is
    # unpickled from the old version
    if self.normalize:
        count = (t != self.ignore_label).sum()
    else:
        count = len(x)
    self._coeff = 1.0 / max(count, 1)
    y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
        * (-self._coeff)
    return y.reshape(()),
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]
    log_p *= (t.ravel() != self.ignore_label)
    if self.reduce == 'mean':
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        y = log_p.sum(keepdims=True) * (-self._coeff)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        numpy.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    gx *= gloss * self._coeff
    return gx, None
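# A minimal NumPy-only sketch (an illustration, not part of the sources above)
# of the gradient that these backward paths compute: for mean-reduced softmax
# cross entropy, d(loss)/dx is (softmax(x) - onehot(t)) / count, with the rows
# belonging to ignored labels zeroed out. All names below are illustrative.
import numpy


def softmax_xent_grad_sketch(x, t, ignore_label=-1):
    # softmax over the class axis
    e = numpy.exp(x - x.max(axis=1, keepdims=True))
    y = e / e.sum(axis=1, keepdims=True)
    valid = t != ignore_label
    gx = y
    gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1  # subtract one-hot target
    gx *= valid[:, None]                                # zero out ignored rows
    return gx / max(valid.sum(), 1)                     # same 1/count coefficient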
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, raw T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = (t == ignore_label) ? 0 : (coeff[0] * (y - (c == t)));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit,
                self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff[0] * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx, None
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    coeff = gloss * self._coeff
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, raw T coeff, S n_channel, S n_unit',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = (t == -1) ? 0 : (coeff[0] * (y - (c == t)));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, raw T coeff, S n_channel, S n_unit',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == -1 ? 0 : coeff[0] * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit)
    return gx, None
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        y = numpy.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    gx *= gloss * self._coeff
    return gx, None
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == -1 ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1], n_unit)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == -1 ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit)
    return gx, None
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = log_softmax._log_softmax(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(
            self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
    )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
    return ret,
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.class_weight is not None:
        log_y *= self.class_weight
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]
    log_p *= (t.ravel() != self.ignore_label)
    if self.reduce == 'mean':
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        y = log_p.sum(keepdims=True) * (-self._coeff)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = log_softmax._log_softmax(x, self.use_cudnn)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    ret = cuda.reduce(
        'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
        't == -1 ? T(0) : log_y[_j * n_channel + t]',
        'a + b', 'out = a * -coeff[0]', '0',
        'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                        self._coeff)
    return ret,
def forward_gpu(self, inputs):
    class_weight = backend.from_chx(self.class_weight)

    self.retain_inputs((0, 1))
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        # Reduction is performed in a promoted dtype
        reduc_dtype = _reduction_dtype(x.dtype)
        if self.normalize:
            count = (t != self.ignore_label).sum(dtype=reduc_dtype)
            count = cupy.maximum(1, count)
            coeff = 1. / count
        else:
            coeff = cupy.array(1. / max(1, len(t)), dtype=reduc_dtype)
        self._coeff = coeff
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw U coeff, '
            'S ignore_label',
            'U out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = static_cast<U>(a * -coeff[0])', '0',
            'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                            self._coeff, self.ignore_label)
        ret = ret.astype(log_y.dtype, copy=False)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd')(
                t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
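# The newer forward paths in this collection call a `_reduction_dtype` helper
# that is not reproduced here. A plausible minimal sketch of it (an assumption
# about its behavior, not the library's actual code): promote half precision to
# float32 so that summing many log-probabilities does not lose accuracy, and
# leave other dtypes unchanged.
import numpy


def _reduction_dtype(x_dtype):
    # Accumulate float16 inputs in float32; keep other dtypes as-is.
    if x_dtype == numpy.float16:
        return numpy.float32
    return x_dtype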
def backward_cpu(self, inputs, grad_outputs):
    if len(inputs) == 2:
        (x, t), tt = inputs, None
    else:
        x, t, tt = inputs
    gloss = grad_outputs[0]
    if x.size == 0:
        return numpy.zeros(x.shape, dtype=x.dtype), None
    if self.y is not None:
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        numpy.exp(y, out=y)

    if self.train_threshold is not None:
        _yt = y[numpy.arange(len(t)), t]
        # print('# _yt: {}'.format(_yt))
        _scale = (1.0 - _yt / self.train_threshold) / (1.0 - _yt + 1e-5)
        _scale[_scale < 0.0] = 0.0
        # print('# _scale: {}'.format(_scale))

    if y.ndim == 2:
        gx = y
        if tt is None:
            gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
        else:
            gx -= tt
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
            gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
        if self.train_threshold is not None:
            # print('# gx:\n{}'.format(gx))
            gx *= _scale.reshape((len(t), 1))
            # print('# gx:\n{}'.format(gx))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)

    if self.reduce == 'mean':
        gx *= gloss * self._coeff
    else:
        gx *= gloss[:, None]
    return gx, None
def forward(self, inputs):
    xp = cuda.get_array_module(*inputs)
    x, t = inputs
    log_y = log_softmax._log_softmax(x)
    self.y = xp.exp(log_y)
    count = (t != self.ignore_label).sum()
    self._coeff = 1.0 / max(count, 1)
    return xp.array([1.], dtype=xp.float32),
def forward_gpu(self, inputs_and_grad_outputs):
    class_weight = cuda.to_gpu(self.class_weight)

    cupy = cuda.cupy
    x, t, gloss = inputs_and_grad_outputs
    if x.size == 0:
        return cupy.zeros(x.shape, dtype=x.dtype),
    if self.y is not None:
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    n_unit = t.size // len(t)
    if self.coeff is not None:
        coeff = self.coeff
    else:
        gloss = gloss[:, None, ...]
        coeff = cupy.array(1, dtype=gloss.dtype)  # dtype does not matter

    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T gloss, U coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                if (t == ignore_label) {
                    gx = T(0);
                } else {
                    gx = static_cast<T>(gloss * coeff * (y - (c == t)));
                }
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T gloss, U coeff, '
            'S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                if (t == ignore_label) {
                    gx = T(0);
                } else {
                    gx = static_cast<T>(
                        gloss * coeff * (y - (c == t)) * w[t]);
                }
            ''',
            'softmax_crossent_weight_bwd')(
                y, class_weight, cupy.expand_dims(t, 1), gloss, coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    if len(inputs) == 2:
        (x, t), tt = inputs, None
    else:
        x, t, tt = inputs
    if x.size == 0:
        return cupy.zeros(x.shape, dtype=x.dtype), None
    if self.y is not None:
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)

    if self.train_threshold is not None:
        _yt = y[cupy.arange(len(t)), t]
        _scale = (1.0 - _yt / self.train_threshold) / (1.0 - _yt + 1e-5)
        _scale[_scale < 0.0] = 0.0

    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        if tt is None:
            gx = cuda.elementwise(
                'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
                'T gx',
                '''
                    const int c = (i / n_unit % n_channel);
                    gx = t == ignore_label ? 0 : coeff * (y - (c == t));
                ''',
                'softmax_crossent_bwd')(
                    y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                    n_unit, self.ignore_label)
        else:
            # print('# tt:{}'.format(tt))
            # print('# tt:{}'.format(tt.sum(axis=1)))
            gx = coeff * (y - tt)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)

    if self.train_threshold is not None:
        gx *= _scale.reshape((len(t), 1))
    return gx, None
def forward_gpu(self, inputs):
    class_weight = backend.from_chainerx(self.class_weight)

    self.retain_inputs((0, 1))
    cupy = cuda.cupy
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label', 'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff,
          self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    if len(inputs) == 2:
        (x, t), tt = inputs, None
    else:
        x, t, tt = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    if x.size == 0:
        y = cupy.zeros(t.shape, dtype=x.dtype)
        if self.cache_score:
            self.y = y
        if self.reduce == 'mean':
            return y.sum(),
        else:
            return y,
    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, (t != self.ignore_label).sum())
    else:
        coeff = max(1, len(t))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

    log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
    if self.reduce == 'mean':
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff, '
            'S ignore_label', 'T out',
            't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0',
            'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                            self._coeff, self.ignore_label)
    else:
        ret = cuda.elementwise(
            'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
            '''
            if (t == ignore) {
              out = 0;
            } else {
              out = -log_y[i * n_channel + t];
            }
            ''',
            'softmax_crossent_no_reduce_fwd')(
                t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
        ret = ret.reshape(t.shape)
    return ret,
def forward_gpu(self, inputs_and_grad_outputs):
    class_weight = cuda.to_gpu(self.class_weight)

    cupy = cuda.cupy
    x, t, gloss = inputs_and_grad_outputs
    if x.size == 0:
        return cupy.zeros(x.shape, dtype=x.dtype),
    if self.y is not None:
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    n_unit = t.size // len(t)
    if self.coeff is not None:
        coeff = self.coeff
    else:
        gloss = gloss[:, None, ...]
        coeff = cupy.array(1, dtype=gloss.dtype)  # dtype does not matter

    if self.soft_target:
        gx = gloss * coeff * (y - t)
    elif self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T gloss, U coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                if (t == ignore_label) {
                    gx = T(0);
                } else {
                    gx = static_cast<T>(gloss * coeff * (y - (c == t)));
                }
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), gloss, coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T gloss, U coeff, '
            'S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                if (t == ignore_label) {
                    gx = T(0);
                } else {
                    gx = static_cast<T>(
                        gloss * coeff * (y - (c == t)) * w[t]);
                }
            ''',
            'softmax_crossent_weight_bwd')(
                y, class_weight, cupy.expand_dims(t, 1), gloss, coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx,
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        numpy.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        # Improve me
        # It is disabled by default
        if mkld.enable_softmax_cross_entropyF(inputs):
            mkldnn_sce_bwd = \
                mkldnn.SoftmaxCrossEntropy_F32_softmax_cross_entropy_create_backward(
                    gx.shape)
            mkldnn_sce_bwd.backward(gx.ravel(), t.ravel(), gx.shape)
        else:
            gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = numpy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
            c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
            gx *= numpy.broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = numpy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= numpy.broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    gx *= gloss * self._coeff
    return gx, None
def forward_cpu(self, inputs_and_grad_outputs):
    x, t, gloss = inputs_and_grad_outputs
    if x.size == 0:
        return numpy.zeros(x.shape, dtype=x.dtype),
    if self.y is not None:
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        numpy.exp(y, out=y)
    t_valid = t != self.ignore_label
    t = t * t_valid
    if self.soft_target:
        gx = y - t
    elif y.ndim == 2:
        gx = y
        gx[numpy.arange(len(t)), t] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[numpy.arange(len(t)), t]
            gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= t_valid.reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, t.ravel(), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, t.ravel(), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= t_valid.reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    if self.coeff is not None:
        gx *= gloss * self.coeff
    else:
        gx *= gloss[:, None]
    return gx,
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, soft_label = inputs
    if x.size == 0:
        return cupy.zeros(x.shape, dtype=x.dtype), None
    if self.y is not None:
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    coeff = gloss * self._coeff
    if self.class_weight is None:
        gx = (y - soft_label) * coeff
    else:
        gx = (y - soft_label) * self.class_weight * coeff
    return gx, None
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if x.size == 0:
        return numpy.zeros(x.shape, dtype=x.dtype), None
    if self.y is not None:
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        numpy.exp(y, out=y)
    t_valid = t != self.ignore_label
    t = t * t_valid
    if y.ndim == 2:
        gx = y
        gx[numpy.arange(len(t)), t] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[numpy.arange(len(t)), t]
            gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= t_valid.reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, t.ravel(), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, t.ravel(), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= t_valid.reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    if self.reduce == 'mean':
        gx *= gloss * self._coeff
    else:
        gx *= gloss[:, None]
    return gx, None
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        numpy.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
            gx *= _broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    if self.reduce == 'mean':
        gx *= gloss * self._coeff
    else:
        gx *= gloss[:, None]
    # weight
    gx *= self.weight.reshape((len(y), 1))
    return gx, None
def forward_gpu(self, inputs_and_grad_outputs):
    class_weight = cuda.to_gpu(self.class_weight)

    cupy = cuda.cupy
    x, t, gloss = inputs_and_grad_outputs
    if x.size == 0:
        return cupy.zeros(x.shape, dtype=x.dtype),
    if self.y is not None:
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self.coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx,
def backward_cpu(self, inputs, grad_outputs):
    x, soft_label = inputs
    gloss = grad_outputs[0]
    if x.size == 0:
        return numpy.zeros(x.shape, dtype=x.dtype), None
    if self.y is not None:
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        numpy.exp(y, out=y)
    gx = y
    gx -= soft_label
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
        # c already has the same shape as gx, so it is applied directly.
        gx *= c
    gx *= gloss * self._coeff
    return gx, None
def forward_cpu(self, inputs):
    class_weight = backend.from_chx(self.class_weight)

    self.retain_inputs((0, 1))
    x, t = inputs
    if x.ndim == t.ndim and x.shape == t.shape:
        self.soft_target = True
    if chainer.is_debug() and not self.soft_target:
        _check_input_values(x, t, self.ignore_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.soft_target:
        return self._soft_target_loss(numpy, x, t, log_y)
    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= _broadcast_to(class_weight.reshape(shape), x.shape)
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    t_valid = t != self.ignore_label
    t = t * t_valid
    log_p = log_yd[t.ravel(), numpy.arange(t.size)]
    log_p *= t_valid.ravel()
    if self.reduce == 'mean':
        if self.normalize:
            count = t_valid.sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        # Perform reduction in a promoted dtype
        reduc_dtype = _reduction_dtype(x.dtype)
        y = log_p.sum(keepdims=True, dtype=reduc_dtype)
        y = y * (-self._coeff)
        y = y.astype(x.dtype, copy=False)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if x.size == 0:
        return cupy.zeros(x.shape, dtype=x.dtype), None
    if self.y is not None:
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx, None
def forward_gpu(self, inputs):
    cupy = cuda.cupy
    x, soft_label = inputs
    if chainer.is_debug():
        _check_input_values(x, soft_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = cupy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= cupy.broadcast_to(self.class_weight.reshape(shape), x.shape)
    if self.normalize:
        coeff = cupy.maximum(1, soft_label.shape[1])
    else:
        coeff = max(1, len(soft_label))
    self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)
    ret = -cupy.sum(soft_label * log_y) * self._coeff
    return ret,
def forward_cpu(self, inputs):
    x, soft_label = inputs
    if chainer.is_debug():
        _check_input_values(x, soft_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
    log_p = numpy.array([numpy.sum(log_y * soft_label)])
    if self.normalize:
        count = x.shape[1]
    else:
        count = len(x)
    self._coeff = 1.0 / max(count, 1)
    y = log_p.sum(keepdims=True) * (-self._coeff)
    return y.reshape(()),
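# A small NumPy-only check (an illustration, not part of the sources above):
# when soft_label is a one-hot encoding of integer targets and normalize is
# False, the soft-label loss -sum(soft_label * log_softmax(x)) / N computed in
# the function above coincides with the usual hard-label cross entropy. All
# names below are illustrative only.
import numpy


def _log_softmax_ref(x):
    m = x.max(axis=1, keepdims=True)
    return x - m - numpy.log(numpy.exp(x - m).sum(axis=1, keepdims=True))


x = numpy.random.randn(4, 3).astype(numpy.float32)
t = numpy.array([0, 2, 1, 2])
soft_label = numpy.eye(3, dtype=numpy.float32)[t]

hard_loss = -_log_softmax_ref(x)[numpy.arange(len(t)), t].mean()
soft_loss = -(soft_label * _log_softmax_ref(x)).sum() / len(x)
assert numpy.allclose(hard_loss, soft_loss, atol=1e-6)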
def backward_gpu(self, inputs, grad_outputs):
    cupy = cuda.cupy
    x, t = inputs
    if hasattr(self, 'y'):
        y = self.y
    else:
        y = log_softmax._log_softmax(x)
        cupy.exp(y, out=y)
    gloss = grad_outputs[0]
    n_unit = t.size // len(t)
    if self.reduce == 'mean':
        coeff = gloss * self._coeff
    else:
        coeff = gloss[:, None, ...]
    if self.class_weight is None:
        gx = cuda.elementwise(
            'T y, S t, T coeff, S n_channel, S n_unit, S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t));
            ''',
            'softmax_crossent_bwd')(
                y, cupy.expand_dims(t, 1), coeff, x.shape[1],
                n_unit, self.ignore_label)
    else:
        gx = cuda.elementwise(
            'T y, raw T w, S t, T coeff, S n_channel, S n_unit, '
            'S ignore_label',
            'T gx',
            '''
                const int c = (i / n_unit % n_channel);
                gx = t == ignore_label ? 0 : coeff * (y - (c == t)) * w[t];
            ''',
            'softmax_crossent_weight_bwd')(
                y, self.class_weight, cupy.expand_dims(t, 1), coeff,
                x.shape[1], n_unit, self.ignore_label)
    return gx, None
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x, self.use_cudnn)
        numpy.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[numpy.arange(len(t)), numpy.maximum(t, 0)] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = numpy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
            c = c[numpy.arange(len(t)), numpy.maximum(t, 0)]
            gx *= numpy.broadcast_to(numpy.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        # In the case where y.ndim is higher than 2, the current
        # implementation is inefficient because it yields two provisional
        # arrays for indexing.
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = numpy.arange(t.size) // n_unit
        trd_index = numpy.arange(t.size) % n_unit
        gx[fst_index, numpy.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = numpy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, numpy.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= numpy.broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    gx *= gloss * self._coeff
    return gx, None
def forward_cpu(self, inputs):
    class_weight = backend.from_chx(self.class_weight)

    self.retain_inputs((0, 1))
    x, t = inputs
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)

    log_y = log_softmax._log_softmax(x)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= _broadcast_to(class_weight.reshape(shape), x.shape)
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    t_valid = t != self.ignore_label
    t = t * t_valid
    log_p = log_yd[t.ravel(), numpy.arange(t.size)]
    log_p *= t_valid.ravel()
    if self.reduce == 'mean':
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if self.normalize:
            count = t_valid.sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        # Perform reduction in a promoted dtype
        reduc_dtype = _reduction_dtype(x.dtype)
        y = log_p.sum(keepdims=True, dtype=reduc_dtype)
        y = y * (-self._coeff)
        y = y.astype(x.dtype, copy=False)
        return y.reshape(()),
    else:
        return -log_p.reshape(t.shape),
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    log_y = log_softmax._log_softmax(x, self.use_cudnn)
    if self.cache_score:
        self.y = numpy.exp(log_y)
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]
    # deal with the case where the SoftmaxCrossEntropy is
    # unpickled from the old version
    if self.normalize:
        count = (t != self.ignore_label).sum()
    else:
        count = len(x)
    self._coeff = 1.0 / max(count, 1)
    y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
        * (-self._coeff)
    return y.reshape(()),
def forward_cpu(self, inputs):
    x, t = inputs
    if chainer.is_debug():
        self._check_input_values(x, t)

    # Improve me
    # It is disabled by default
    if mkld.enable_softmax_cross_entropyF(inputs):
        y_out = numpy.empty(x.shape, dtype=numpy.float32)
        mkldnn_sce_fwd = \
            mkldnn.SoftmaxCrossEntropy_F32_softmax_cross_entropy_create_forward(
                x.shape)
        mkldnn_sce_fwd.forward(x.ravel(), y_out.ravel(), x.shape)
        log_y = y_out
    else:
        log_y = log_softmax._log_softmax(x, self.use_cudnn)

    if self.cache_score:
        self.y = numpy.exp(log_y)
    if self.class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        log_y *= numpy.broadcast_to(self.class_weight.reshape(shape), x.shape)
    log_yd = numpy.rollaxis(log_y, 1)
    log_yd = log_yd.reshape(len(log_yd), -1)
    log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]
    # deal with the case where the SoftmaxCrossEntropy is
    # unpickled from the old version
    if self.normalize:
        count = (t != self.ignore_label).sum()
    else:
        count = len(x)
    self._coeff = 1.0 / max(count, 1)
    y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
        * (-self._coeff)
    return y.reshape(()),
def backward_cpu(self, inputs, grad_outputs):
    x, t = inputs
    gloss = grad_outputs[0]
    if hasattr(self, 'y'):
        y = self.y.copy()
    else:
        y = log_softmax._log_softmax(x)
        np.exp(y, out=y)
    if y.ndim == 2:
        gx = y
        gx[np.arange(len(t)), np.maximum(t, 0)] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c[np.arange(len(t)), np.maximum(t, 0)]
            gx *= _broadcast_to(np.expand_dims(c, 1), gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1))
    else:
        n_unit = t.size // len(t)
        gx = y.reshape(y.shape[0], y.shape[1], -1)
        fst_index = np.arange(t.size) // n_unit
        trd_index = np.arange(t.size) % n_unit
        gx[fst_index, np.maximum(t.ravel(), 0), trd_index] -= 1
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            c = _broadcast_to(self.class_weight.reshape(shape), x.shape)
            c = c.reshape(gx.shape)
            c = c[fst_index, np.maximum(t.ravel(), 0), trd_index]
            c = c.reshape(y.shape[0], 1, -1)
            gx *= _broadcast_to(c, gx.shape)
        gx *= (t != self.ignore_label).reshape((len(t), 1, -1))
        gx = gx.reshape(y.shape)
    if self.reduce == 'mean':
        gx *= gloss * self._coeff
    else:
        gx *= gloss[:, None]
    return gx, None
def forward(self, inputs):
    x, t = inputs[:2]
    rest = len(inputs) - 2
    head_W, Ws = inputs[2], inputs[3:2 + (rest - 1) // 2 + 1]
    Rs = inputs[2 + (rest - 1) // 2 + 1:]
    n_tails = len(Rs)
    # minus_inf = -1024.
    minus_inf = -numpy.inf
    xp = cuda.get_array_module(x)
    if chainer.is_debug():
        _check_input_values(x, t, self.ignore_label)
    self.retain_inputs(tuple(six.moves.range(len(inputs))))

    cluster_hots = []
    for i in six.moves.range(1, n_tails + 1):
        lower, upper = self.cutoff[i], self.cutoff[i + 1]
        in_cluster = xp.logical_and(lower <= t, t < upper)
        if self.output_all:
            in_cluster = xp.ones(
                in_cluster.shape, dtype=in_cluster.dtype)
        cluster_hots.append(in_cluster)
    self.cluster_hots = cluster_hots

    self.head = self.linear(x, head_W)
    self.ls_head = log_softmax._log_softmax(self.head)

    self.reduced_xs = []
    self.tails = []
    self.ls_tails = []
    for i, in_cluster in enumerate(cluster_hots, start=1):
        tail_idx = i - 1
        if xp.any(in_cluster):
            reduced_x = self.linear(x[in_cluster], Rs[tail_idx])
            self.reduced_xs.append(reduced_x)
            out = self.linear(reduced_x, Ws[tail_idx])
            self.tails.append(out)
            ls_out = log_softmax._log_softmax(out)
            self.ls_tails.append(ls_out)
        else:
            self.reduced_xs.append(None)
            self.tails.append(None)
            self.ls_tails.append(None)

    n_head_out = head_W.shape[0] - n_tails
    n_out = n_head_out + sum(W.shape[0] for W in Ws)
    shape = (x.shape[0], n_out)
    log_y = xp.full(shape, minus_inf, dtype=x.dtype)

    log_y[:, :n_head_out] = self.ls_head[:, :n_head_out]
    for i, (in_cluster, tail) in enumerate(
            zip(cluster_hots, self.ls_tails), start=1):
        if tail is None:
            continue
        lower, upper = self.cutoff[i], self.cutoff[i + 1]
        tail_main = self.ls_head[:, n_head_out + i - 1]
        tail_main_in = xp.broadcast_to(
            tail_main[in_cluster][:, None], tail.shape)
        log_y[xp.nonzero(in_cluster)[0], lower:upper] = tail_main_in + tail
        not_in_cluster = xp.logical_not(in_cluster)
        log_y[xp.nonzero(not_in_cluster)[0], lower] = \
            tail_main[not_in_cluster]
    return log_y,