Example #1
    def __call__(self, *xs):
        """Applies broadcasted elementwise product.

        Args:
            xs (list of Variables): Input variables whose length should
                be one if the link has a learnable weight parameter, otherwise
                should be two.
        """
        axis = self.axis

        # Case of only one argument where W is a learnt parameter.
        if hasattr(self, 'W'):
            if chainer.is_debug():
                assert len(xs) == 1
            x, = xs
            W = self.W
            z = scale.scale(x, W, axis)
        # Case of two arguments where W is given as an argument.
        else:
            if chainer.is_debug():
                assert len(xs) == 2
            x, y = xs
            z = scale.scale(x, y, axis)

        # Forward propagate bias term if given.
        if hasattr(self, 'bias'):
            return self.bias(z)
        else:
            return z
Example #2
    def multi_node_mean(self, array_a, array_b):
        # Named like an Allreduce, but this actually computes a mean:
        #   sum(array_a over all processes) / n -> array_b, or
        #   sum(array_b over all processes) / n -> array_b if array_a is None
        if chainer.is_debug():
            self.check_ready_to_allreduce(array_a, array_b)

        is_float16 = array_b.dtype == numpy.float16
        if array_a is None:
            buffer_a = mpi4py.MPI.IN_PLACE
        elif is_float16:
            assert array_a.dtype == array_b.dtype
            buffer_a = _memory_utility.array_to_buffer_object(
                array_a.astype(numpy.float32))
        else:
            buffer_a = _memory_utility.array_to_buffer_object(array_a)

        if is_float16:
            array_b32 = array_b.astype(numpy.float32)
        else:
            array_b32 = array_b
        buffer_b = _memory_utility.array_to_buffer_object(array_b32)

        self.mpi_comm.Allreduce(buffer_a, buffer_b)

        if is_float16:
            xp = chainer.backend.get_array_module(array_b)
            xp.copyto(array_b, array_b32.astype(numpy.float16), casting='no')

        array_b *= 1.0 / self.mpi_comm.size

        if chainer.is_debug():
            self.ensure_all_finite(array_b)
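The mean is obtained by summing across processes and then dividing by the communicator size; the float16 path round-trips through float32, presumably because the underlying MPI reduction does not handle half precision directly. Below is a minimal, MPI-free numpy sketch of that arithmetic; the per-process arrays are made up for illustration.

import numpy

# Pretend these are the local arrays held by three processes.
local_arrays = [numpy.arange(4, dtype=numpy.float16) * k for k in (1, 2, 3)]

# Allreduce(SUM) followed by division by the process count is a mean.
acc = numpy.zeros(4, dtype=numpy.float32)              # accumulate in float32
for a in local_arrays:
    acc += a.astype(numpy.float32)
mean = (acc / len(local_arrays)).astype(numpy.float16)  # cast back to float16

print(mean)  # [0. 2. 4. 6.]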
Example #3
    def forward(self, inputs):
        xp = backend.get_array_module(*inputs)
        y, t = inputs
        # numpy.bincount requires int32 on Windows
        t = t.astype('i', copy=False)

        if self.label_num is None:
            label_num = xp.amax(t) + 1
        else:
            label_num = self.label_num
            if chainer.is_debug():
                assert (t < label_num).all()

        mask = (t == self.ignore_label).ravel()
        pred = xp.where(mask, label_num, y.argmax(axis=1).ravel())
        true = xp.where(mask, label_num, t.ravel())
        support = xp.bincount(true, minlength=label_num + 1)[:label_num]
        relevant = xp.bincount(pred, minlength=label_num + 1)[:label_num]
        tp_mask = xp.where(pred == true, true, label_num)
        tp = xp.bincount(tp_mask, minlength=label_num + 1)[:label_num]

        precision = tp / relevant
        recall = tp / support
        fbeta = _fbeta_score(precision, recall, self.beta)

        return precision, recall, fbeta, support
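The three bincounts count, for each class, the number of true labels (support), predicted labels (relevant) and correct predictions (tp); ignored positions are mapped to the extra bin label_num and sliced off. A small self-contained numpy sketch of the same counting, with made-up labels:

import numpy

label_num = 3
true = numpy.array([0, 1, 2, 2, 1])
pred = numpy.array([0, 2, 2, 2, 1])

support = numpy.bincount(true, minlength=label_num + 1)[:label_num]   # [1 2 2]
relevant = numpy.bincount(pred, minlength=label_num + 1)[:label_num]  # [1 1 3]
tp_mask = numpy.where(pred == true, true, label_num)    # wrong preds go to the extra bin
tp = numpy.bincount(tp_mask, minlength=label_num + 1)[:label_num]     # [1 1 2]

precision = tp / relevant   # [1.   1.   0.667]
recall = tp / support       # [1.   0.5  1.   ]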
Example #4
    def forward(self, inputs):
        xp = cuda.get_array_module(inputs[0])
        self.input_length = inputs[0]
        label_length = inputs[1]
        t = inputs[2]
        xs = inputs[3:]

        if chainer.is_debug():
            # Batch size check.
            assert len(xs[0]) == len(t)
            assert len(xs[0]) == len(self.input_length)
            assert len(xs[0]) == len(label_length)

            # Length check.
            assert len(xs) >= xp.max(self.input_length)
            assert len(t[0]) >= xp.max(label_length)

        self.path_length = 2 * label_length + 1

        yseq_shape = (len(xs),) + xs[0].shape
        self.yseq = _softmax(xp.vstack(xs).reshape(yseq_shape), xp)
        log_yseq = self.log_matrix(self.yseq, xp)
        self.path = _label_to_path(t, self.blank_symbol, xp)
        self.prob_trans = self.calc_trans(
            log_yseq, self.input_length, t,
            label_length, self.path, self.path_length, xp)

        loss = -_logsumexp(self.prob_trans[0], xp, axis=1)
        if self.reduce == 'mean':
            loss = utils.force_array(xp.mean(loss))
        return loss,
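The path length is 2 * label_length + 1 because CTC interleaves a blank before, between and after the labels. The helper below is only an illustration of that expansion, not the library's _label_to_path:

import numpy

def label_to_path(labels, blank):
    # blank, l1, blank, l2, ..., blank  ->  length 2 * len(labels) + 1
    path = numpy.full(2 * len(labels) + 1, blank, dtype=labels.dtype)
    path[1::2] = labels
    return path

labels = numpy.array([3, 1, 4], dtype=numpy.int32)
print(label_to_path(labels, blank=0))  # [0 3 0 1 0 4 0]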
Example #5
File: ctc.py Project: ktnyt/chainer
    def forward(self, inputs):
        xp = backend.get_array_module(inputs[0])
        self.input_length, label_length, t, xs = inputs

        if self.zero_padding is None:
            if xs.dtype == numpy.float16:
                self.zero_padding = -10000.0
            else:
                self.zero_padding = -10000000000.0

        if chainer.is_debug():
            assert len(xs) >= xp.max(self.input_length)
            assert t.shape[1] >= xp.max(label_length)

        self.path_length = 2 * label_length + 1

        self.yseq = _softmax(xs, xp)
        log_yseq = self.log_matrix(self.yseq, xp)
        self.path = _label_to_path(t, self.blank_symbol, xp)
        self.prob_trans = self.calc_trans(
            log_yseq, self.input_length, t,
            label_length, self.path, self.path_length, xp)

        loss = -_logsumexp(self.prob_trans[0], xp, axis=1)
        if self.reduce == 'mean':
            loss = utils.force_array(xp.mean(loss))
        return loss,
Example #6
    def forward_gpu(self, x):
        self.retain_outputs((0,))
        invx, info = _inv_gpu(x[0])
        if chainer.is_debug():
            if cuda.cupy.any(info != 0):
                raise ValueError('Input has singular matrices.')
        return invx,
Example #7
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = log_softmax._log_softmax(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(
                self.class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? T(0) : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return ret,
Example #8
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = softmax_log(x, self.use_cudnn)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if getattr(self, "normalize", True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            "S t, raw T log_y, int32 n_channel, raw T coeff",
            "T out",
            "t == -1 ? T(0) : log_y[_j * n_channel + t]",
            "a + b",
            "out = a * -coeff[0]",
            "0",
            "crossent_fwd",
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return (ret,)
Example #9
    def forward(self, inputs):
        x, inds = inputs

        if chainer.is_debug():
            _check_indices(inds)

        return self._permutate(x, inds, self.inv),
Example #10
    def setUp(self):
        self.original_debug = chainer.is_debug()
        chainer.set_debug(True)
        self.one = numpy.array(1, numpy.float32)
        self.f = chainer.FunctionNode()
        self.return_value = tuple(None if x is None else chainer.Variable(x)
                                  for x in self.return_data)
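Tests that flip the global debug flag in setUp normally restore it in tearDown; a matching tearDown would look roughly like this (a sketch, assuming the attribute name saved above):

    def tearDown(self):
        # Restore the flag saved in setUp so other tests are unaffected.
        chainer.set_debug(self.original_debug)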
Example #11
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        if self.class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= _broadcast_to(self.class_weight.reshape(shape), x.shape)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), numpy.arange(t.size)]

        log_p *= (t.ravel() != self.ignore_label)
        if self.reduce == 'mean':
            # deal with the case where the SoftmaxCrossEntropy is
            # unpickled from the old version
            if self.normalize:
                count = (t != self.ignore_label).sum()
            else:
                count = len(x)
            self._coeff = 1.0 / max(count, 1)

            y = log_p.sum(keepdims=True) * (-self._coeff)
            return y.reshape(()),
        else:
            return -log_p.reshape(t.shape),
Example #12
    def forward(self, inputs):
        x, W = inputs
        if chainer.is_debug():
            if not ((0 <= x).all() and (x < len(W)).all()):
                msg = "Each `x` value need to satisfty `0 <= x < len(W)`"
                raise ValueError(msg)

        return (W.take(x, axis=0),)
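W.take(x, axis=0) is a plain row lookup: each id in x selects the corresponding row of the embedding matrix, and the output shape is x.shape plus the embedding dimension. A minimal numpy sketch with made-up shapes:

import numpy

W = numpy.arange(12, dtype=numpy.float32).reshape(4, 3)   # 4 ids, 3-dim embeddings
x = numpy.array([[0, 2], [3, 3]], dtype=numpy.int32)

out = W.take(x, axis=0)
assert out.shape == (2, 2, 3)          # x.shape + (embedding_dim,)
assert (out[0, 1] == W[2]).all()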
Example #13
def _double_backward_softmax_cross_entropy(x, t, normalize, class_weight,
                                           ignore_label, reduce, is_chainerx):
    if isinstance(t, variable.Variable):
        t = t.data

    F = chainer.functions

    _check_class_weight_option(class_weight)
    _check_reduce_option(reduce)
    if chainer.is_debug():
        _check_input_values(x, t, ignore_label)

    loss = -chainer.functions.log_softmax(x)

    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        class_weight = F.broadcast_to(class_weight.reshape(shape), x.shape)
        # TODO(niboshi): Remove this workaround after ChainerX supports
        # type promotion.
        if is_chainerx:
            class_weight = F.cast(class_weight, x.dtype)
        loss = loss * class_weight

    in_use = (t != ignore_label).astype(x.dtype)

    loss = F.rollaxis(loss, 1, loss.ndim)
    loss = F.reshape(loss, (-1, loss.shape[-1]))

    # Replace ignore_label value with one valid for F.select_item below.
    t = t.clip(0, loss.shape[1] - 1)

    loss = F.select_item(loss, t.ravel())
    loss = F.reshape(loss, t.shape)

    loss = loss * in_use

    if reduce == 'mean':
        reduc_dtype = _reduction_dtype(x.dtype)
        if normalize:
            # TODO(niboshi): Use in_use.sum(dtype=reduc_dtype) once chainerx
            # supports dtype argument.
            count = in_use.astype(reduc_dtype, copy=False).sum()
        else:
            count = len(x)
        count = max(count, 1.)

        if reduc_dtype == loss.dtype:
            loss = F.sum(loss / count)
        else:
            # Sum in a promoted dtype
            loss = F.cast(loss, reduc_dtype)
            loss = F.sum(loss / count)
            loss = F.cast(loss, x.dtype)

    return loss
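The list comprehension used for class_weight builds a shape with -1 only at the channel axis (axis 1), so the per-class weights broadcast along every other axis. A quick check of what it evaluates to, with an arbitrary ndim and plain range instead of six.moves.range:

ndim = 4
shape = [1 if d != 1 else -1 for d in range(ndim)]
print(shape)  # [1, -1, 1, 1]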
Example #14
    def forward_gpu(self, inputs):
        class_weight = backend.from_chainerx(self.class_weight)

        self.retain_inputs((0, 1))
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)
        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)
        if self.normalize:
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        if self.reduce == 'mean':
            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw T coeff, '
                'S ignore_label',
                'T out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1],
              self._coeff, self.ignore_label)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out',
                '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''',
                'softmax_crossent_no_reduce_fwd'
            )(t, log_y.reduced_view(), log_y.shape[-1], self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
Example #15
    def __init__(self, indices_or_sections, axis):
        if not isinstance(
                indices_or_sections,
                six.integer_types + (collections.Iterable,)):
            raise TypeError('indices_or_sections must be integer or 1-D array')
        if (chainer.is_debug() and
                isinstance(indices_or_sections, collections.Iterable)):
            for p, n in six.moves.zip(
                    indices_or_sections, indices_or_sections[1:]):
                if p > n:
                    raise ValueError('indices_or_sections must be sorted')
        self.indices_or_sections = indices_or_sections
        self.axis = axis
Example #16
File: bias.py Project: 2php/chainer
    def __call__(self, *xs):
        """Applies broadcasted elementwise summation.

        Args:
            xs (list of ~chainer.Variable): Input variables whose length should
                be one if the link has a learnable bias parameter, otherwise
                should be two.
        """
        axis = self.axis

        # Case of only one argument where b is a learnt parameter.
        if hasattr(self, 'b'):
            if chainer.is_debug():
                assert len(xs) == 1
            x, = xs
            b = self.b
            return bias.bias(x, b, axis)
        # Case of two arguments where b is given as an argument.
        else:
            if chainer.is_debug():
                assert len(xs) == 2
            x, y = xs
            return bias.bias(x, y, axis)
Example #17
def backprop_step(
        func, target_input_indexes, grad_outputs, grad_inputs):
    """Accumulates gradients of a FunctionNode

    This routine is used by :meth:`chainer.Variable.backward` and
    :func:`chainer.grad`.

    Args:
        target_input_indexes (tuple of int): Sorted indices of the input
            variables w.r.t. which the gradients are required. It is
            guaranteed that this tuple contains at least one element.
        grad_outputs (tuple of Variable): Gradients w.r.t. the output
            variables. If the gradient w.r.t. an output variable is not
            given, the corresponding element is ``None``.
        grad_inputs (dict): References of gradients w.r.t. the input variables.

    """
    if chainer.is_debug():
        assert isinstance(target_input_indexes, tuple)
        assert target_input_indexes == tuple(sorted(target_input_indexes))
        assert isinstance(grad_outputs, tuple)
    if func.backward_accumulate.__code__ \
            is not chainer.FunctionNode.backward_accumulate.__code__:
        # backward_accumulate is overridden
        grad_inputs_tuple = tuple([
            _pop_or_none(grad_inputs[func.inputs[i]])
            for i in target_input_indexes
        ])
        gxs = func.backward_accumulate(
            target_input_indexes, grad_outputs, grad_inputs_tuple)
    else:  # otherwise, backward should be overridden
        gxs = func.backward(
            target_input_indexes, grad_outputs)
        len_gxs = len(gxs)
        if len_gxs == len(func.inputs):
            gxs = tuple([gxs[i] for i in target_input_indexes])
        elif len_gxs != len(target_input_indexes):
            raise ValueError(
                'number of gradients returned by %s (%s) is incorrect.'
                % (func._impl_name, func.label))

    for i, gx in six.moves.zip(target_input_indexes, gxs):
        if gx is not None:
            grad_inputs[func.inputs[i]].append(gx)

    if not func.lazy_grad_sum:
        for gx in grad_inputs.values():
            _reduce(gx)
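grad_inputs maps each input node to a list of partial gradients; backprop_step appends the newly computed gradients and they are summed immediately unless lazy_grad_sum is enabled. A stripped-down, framework-free sketch of that accumulate-then-reduce pattern (the names here are illustrative, not Chainer's):

import numpy

grad_inputs = {'x': [], 'w': []}       # one list of partial gradients per input

def accumulate(name, gx):
    grad_inputs[name].append(gx)

def reduce_grads(grads):
    # Collapse the accumulated list into a single summed gradient.
    if len(grads) > 1:
        grads[:] = [sum(grads)]

accumulate('x', numpy.ones(3))
accumulate('x', numpy.full(3, 2.0))
reduce_grads(grad_inputs['x'])
print(grad_inputs['x'])                # [array([3., 3., 3.])]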
Example #18
    def forward(self, inputs):
        x, W = inputs

        if chainer.is_debug():
            if not ((0 <= x).all() and
                    (x < len(W)).all()):
                msg = 'Each `x` value needs to satisfy `0 <= x < len(W)`'
                raise ValueError(msg)

        if self.ignore_label is not None:
            xp = cuda.get_array_module(*inputs)
            mask = (x == self.ignore_label)
            return xp.where(
                mask[..., None], 0, W.take(xp.where(mask, 0, x), axis=0)),

        return W.take(x, axis=0),
Example #19
    def __init__(self, slices):
        if not isinstance(slices, collections.Iterable):
            slices = tuple([slices])

        if chainer.is_debug():
            n_ellipses = 0
            for s in slices:
                if numpy.isscalar(s) or s is None or isinstance(s, slice):
                    pass
                elif s is Ellipsis:
                    n_ellipses += 1
                else:
                    raise ValueError("Only basic indexing is supported")
            if n_ellipses > 1:
                raise ValueError("Only one Ellipsis is allowed")

        self.slices = slices
Example #20
    def __init__(self, slices):
        if isinstance(slices, list):
            if all([isinstance(s, int) for s in slices]):
                slices = slices,
            slices = tuple(slices)
        elif not isinstance(slices, tuple):
            slices = slices,

        if chainer.is_debug():
            n_ellipses = 0
            for s in slices:
                if s is Ellipsis:
                    n_ellipses += 1
            if n_ellipses > 1:
                raise ValueError('Only one Ellipsis is allowed')

        self.slices = slices
Example #21
    def forward(self, inputs):
        x, W = inputs

        xp = cuda.get_array_module(*inputs)
        if chainer.is_debug():
            valid_x = xp.logical_and(0 <= x, x < len(W))
            if self.ignore_label is not None:
                valid_x = xp.logical_or(valid_x, x == self.ignore_label)
            if not valid_x.all():
                raise ValueError('Each non-ignored `x` value needs to satisfy'
                                 ' `0 <= x < len(W)`')

        if self.ignore_label is not None:
            mask = (x == self.ignore_label)
            return xp.where(
                mask[..., None], 0, W.take(xp.where(mask, 0, x), axis=0)),

        return W.take(x, axis=0),
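When ignore_label is set, ignored ids are first replaced by a valid index (0) so the lookup cannot go out of range, and the corresponding output rows are then zeroed. A numpy sketch of that two-step masking, with a made-up ignore id and shapes:

import numpy

W = numpy.arange(12, dtype=numpy.float32).reshape(4, 3)
x = numpy.array([1, -1, 3], dtype=numpy.int32)    # -1 marks an ignored position
ignore_label = -1

mask = (x == ignore_label)
safe_x = numpy.where(mask, 0, x)                  # ignored ids become a valid index
out = numpy.where(mask[..., None], 0, W.take(safe_x, axis=0))

assert (out[1] == 0).all()                        # ignored row is all zeros
assert (out[2] == W[3]).all()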
Example #22
def bias(x, y, axis=1):
    """Elementwise summation with broadcasting.

    Computes an elementwise summation of two input variables, with the shape of
    the latter variable broadcasted to match the shape of the former. ``axis``
    is the first axis of the first variable along which the second variable is
    applied.

    The term "broadcasting" here comes from Caffe's bias layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 5 x 6
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x  3 x 40 x 5 x 6
        y :  (1 x) 3 x 40 x 1 x 1

    Note that the axis of ``x`` to which we apply ``y`` is specified by the
    argument ``axis``, whose meaning is different from numpy's ``axis``.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`):
            Input variable to be summed.
        y (:class:`~chainer.Variable` or :ref:`ndarray`):
            Input variable to sum, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) +
                     [1] * (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x + y2
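The reshape computed here just pads the shape of ``y`` with ones on both sides so that ordinary numpy broadcasting lines it up with ``x`` starting at ``axis``. A quick numpy check with the shapes from the docstring:

import numpy

x = numpy.zeros((100, 3, 40, 5, 6), dtype=numpy.float32)
y = numpy.ones((3, 40), dtype=numpy.float32)
axis = 1

y1_shape = (1,) * axis + y.shape + (1,) * (x.ndim - axis - y.ndim)
print(y1_shape)                                    # (1, 3, 40, 1, 1)
z = x + numpy.broadcast_to(y.reshape(y1_shape), x.shape)
assert z.shape == x.shape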
Example #23
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = numpy.log(x)
        if self.cache_score:
            self.y = x
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)
        log_p = log_yd[numpy.maximum(t.ravel(), 0), six.moves.range(t.size)]
        if getattr(self, 'normalize', True):
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)
        y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
            * (-self._coeff)
        return y.reshape(()),
Example #24
def _get_indices_or_sections(indices_or_sections):
    """Checks and convert ``indices_or_sections`` argument

    Converted value is one of: 1-D numpy.ndarray, list, int, and
    NumPy int scalar.

    Returns:
        A binary tuple in which the 1st element is indices (sequence) and
        the 2nd element is sections (scalar).
        Only one of the two is not ``None`` and the other is ``None``.

    """
    ios = indices_or_sections
    is_seq = False
    if isinstance(ios, numpy.ndarray):
        # numpy.ndarray
        if ios.dtype.kind != 'i' and ios.size > 0:
            # Note: numpy.array([]) (dtype is float64) should be accepted.
            raise TypeError('indices_or_sections must be integers')
        if ios.ndim >= 2:
            raise TypeError('indices_or_sections must be 1-D sequence')
        is_seq = ios.ndim != 0
    elif isinstance(ios, collections_abc.Sequence):
        # Any sequence except numpy.ndarray
        ios = list(ios)
        is_seq = True
    elif isinstance(indices_or_sections, six.integer_types):
        # int
        pass
    else:
        raise TypeError(
            'indices_or_sections must be integer or 1-D array.\n'
            'Actual: {}'.format(type(indices_or_sections)))

    if is_seq and chainer.is_debug():
        for p, n in six.moves.zip(ios, ios[1:]):
            if p > n:
                raise ValueError('indices_or_sections must be sorted')

    if is_seq:
        return ios, None
    else:
        return None, ios
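The zip(ios, ios[1:]) loop simply checks that consecutive split points are non-decreasing. In isolation:

ios = [2, 5, 5, 9]
assert not any(p > n for p, n in zip(ios, ios[1:]))        # sorted: passes the debug check

ios_bad = [2, 9, 5]
assert any(p > n for p, n in zip(ios_bad, ios_bad[1:]))    # unsorted: would raise in debug mode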
Example #25
    def forward(self, inputs):
        self.retain_inputs((0,))
        x, W = inputs
        self._w_shape = W.shape

        xp = backend.get_array_module(*inputs)
        if chainer.is_debug():
            valid_x = xp.logical_and(0 <= x, x < len(W))
            if self.ignore_label is not None:
                valid_x = xp.logical_or(valid_x, x == self.ignore_label)
            if not valid_x.all():
                raise ValueError('Each non-ignored `x` value needs to satisfy'
                                 ' `0 <= x < len(W)`')

        if self.ignore_label is not None:
            mask = (x == self.ignore_label)
            return xp.where(mask[..., None], 0, W[xp.where(mask, 0, x)]),

        return W[x],
Example #26
def _double_backward_softmax_cross_entropy(x, t, normalize, class_weight,
                                           ignore_label, reduce):
    if isinstance(t, variable.Variable):
        t = t.data

    _check_class_weight_option(class_weight)
    _check_reduce_option(reduce)
    if chainer.is_debug():
        _check_input_values(x, t, ignore_label)

    loss = -chainer.functions.log_softmax(x)

    if class_weight is not None:
        shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
        class_weight = chainer.functions.broadcast_to(
            class_weight.reshape(shape), x.shape)
        loss = loss * class_weight

    in_use = (t != ignore_label).astype(x.dtype)

    loss = chainer.functions.rollaxis(loss, 1, loss.ndim)
    loss = chainer.functions.reshape(loss, (-1, loss.shape[-1]))

    # Replace ignore_label value with one valid for F.select_item below.
    t = t.clip(0, loss.shape[1] - 1)

    loss = chainer.functions.select_item(loss, t.ravel())
    loss = chainer.functions.reshape(loss, t.shape)

    loss = loss * in_use

    if reduce == 'mean':
        if normalize:
            count = in_use.sum()
        else:
            count = len(x)
        count = max(count, 1.)
        loss = loss / count
        return chainer.functions.sum(loss)
    else:
        return loss
Example #27
    def forward_gpu(self, inputs):
        cupy = cuda.cupy
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = softmax_log(x, self.use_cudnn)
        self.y = cupy.exp(log_y)
        if getattr(self, 'normalize', True):
            coeff = cupy.maximum(1, (t != self.ignore_label).sum())
        else:
            coeff = max(1, len(t))
        self._coeff = cupy.divide(1.0, coeff, dtype=x.dtype)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)
        ret = cuda.reduce(
            'S t, raw T log_y, int32 n_channel, raw T coeff', 'T out',
            't == -1 ? 0 : log_y[_j * n_channel + t]',
            'a + b', 'out = a * -coeff[0]', '0', 'crossent_fwd'
        )(t, log_y.reduced_view(), log_y.shape[-1], self._coeff)
        return ret,
Example #28
    def forward(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            if not ((0 <= t).all() and
                    (t < x.shape[1]).all()):
                msg = 'Each label `t` needs to satisfy `0 <= t < x.shape[1]`'
                raise ValueError(msg)

        xp = cuda.get_array_module(x)
        if xp is numpy:
            # This code is equivalent to `t.choose(x.T)`, but `numpy.choose`
            # does not work when `x.shape[1] > 32`.
            return x[six.moves.range(t.size), t],
        else:
            y = cuda.elementwise(
                'S t, raw T x',
                'T y',
                'int ind[] = {i, t}; y = x[ind];',
                'getitem_fwd'
            )(t, x)
            return y,
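On CPU the per-row pick is done with integer fancy indexing, which behaves like t.choose(x.T) but without numpy.choose's limit of 32 classes. A tiny numpy equivalent:

import numpy

x = numpy.array([[0.1, 0.7, 0.2],
                 [0.5, 0.3, 0.2]], dtype=numpy.float32)
t = numpy.array([1, 0], dtype=numpy.int32)

y = x[numpy.arange(t.size), t]   # picks x[0, 1] and x[1, 0]
print(y)                         # [0.7 0.5]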
Example #29
def scale(x, y, axis=1):
    """Elementwise product with broadcasting.

    Computes an elementwise product of two input variables, with the shape of
    the latter variable broadcasted to match the shape of the former. ``axis``
    is the first axis of the first variable along which the second variable is
    applied.

    The term "broadcasting" here comes from Caffe's scale layer so the
    "broadcasting" with the following arguments::

           x : 100 x 3 x 40 x 60
           y : 3 x 40
        axis : 1

    is equivalent to the following numpy broadcasting::

        x : 100 x 3 x 40 x 60
        y :   1 x 3 x 40 x 1

    Note that the ``axis`` argument indicates to which axis of ``x`` we apply ``y``.

    Args:
        x (~chainer.Variable): Input variable to be scaled.
        y (~chainer.Variable): Input variable to scale, broadcasted.
        axis (int): The first axis of ``x`` along which ``y`` is applied.

    Returns:
        ~chainer.Variable: Output variable.

    """
    x_shape = x.shape
    y_shape = y.shape
    if chainer.is_debug():
        assert x_shape[axis:axis + len(y_shape)] == y_shape
    y1_shape = tuple([1] * axis + list(y_shape) +
                     [1] * (len(x_shape) - axis - len(y_shape)))
    y1 = reshape.reshape(y, y1_shape)
    y2 = broadcast.broadcast_to(y1, x_shape)
    return x * y2
Example #30
    def forward_cpu(self, inputs):
        x, t = inputs
        if chainer.is_debug():
            self._check_input_values(x, t)

        log_y = softmax_log(x, False)
        if self.cache_score:
            self.y = numpy.exp(log_y)
        log_yd = numpy.rollaxis(log_y, 1)
        log_yd = log_yd.reshape(len(log_yd), -1)

        log_p = log_yd[numpy.maximum(t.ravel(), 0), six.moves.range(t.size)]
        # deal with the case where the SoftmaxCrossEntropy is
        # unpickled from the old version
        if getattr(self, "normalize", True):
            count = (t != self.ignore_label).sum()
        else:
            count = len(x)
        self._coeff = 1.0 / max(count, 1)

        y = (log_p * (t.ravel() != self.ignore_label)).sum(keepdims=True) \
            * (-self._coeff)
        return (y.reshape(()),)
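numpy.maximum(t.ravel(), 0) clamps ignored labels (here -1) to a valid column before indexing, and the factor (t.ravel() != self.ignore_label) then zeroes their contribution. A compact numpy sketch of that gather-and-mask step, with made-up probabilities:

import numpy

log_y = numpy.log(numpy.array([[0.7, 0.2, 0.1],
                               [0.1, 0.8, 0.1]], dtype=numpy.float32))
t = numpy.array([0, -1], dtype=numpy.int32)     # second sample is ignored
ignore_label = -1

log_p = log_y[numpy.arange(t.size), numpy.maximum(t, 0)]
log_p *= (t != ignore_label)                    # ignored entries contribute 0
loss = -log_p.sum() / max((t != ignore_label).sum(), 1)
print(float(loss))                              # ~0.357 (= -log 0.7)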
Example #31
    def apply(self, inputs):
        """Computes output variables and grows the computational graph.

        Basic behavior is expressed in the documentation of
        :class:`FunctionNode`.

        .. note::

           If the :data:`~Variable.data` attribute of input variables exists on
           a GPU device, that device is made current before calling
           :meth:`forward`, so implementors do not need to take care of device
           selection in most cases.

        Args:
            inputs: Tuple of input variables. Each element can be either
                :class:`~chainer.Variable` or :ref:`ndarray`. If the element
                is an ndarray, it is automatically wrapped with
                :class:`~chainer.Variable`.

        Returns:
            A tuple of output :class:`~chainer.Variable` objects.

        """
        chainerx_in_data = None
        chainerx_device = None
        is_chainerx, in_data = _extract_apply_in_data(inputs)

        if is_chainerx:
            # Try ChainerX C++ implementation.
            # If it's supported, the output arrays are wrapped with Variables
            # and returned.
            # If not supported, FunctionNode.forward_chainerx should return
            # Fallback.
            # In that case the input arrays are converted to numpy.ndarray
            # or cupy.ndarray (depending on the ChainerX backend) and
            # forward computation falls back to the conventional
            # FunctionNode.forward() implementation.
            outputs = self.forward_chainerx(in_data)

            if outputs is not chainer.Fallback:
                # Supported. Wrap with variables and return
                assert isinstance(outputs, tuple)
                return tuple([
                    variable.Variable._init_unchecked(
                        y, requires_grad=y.is_backprop_required(),
                        is_chainerx_array=True)
                    for y in outputs])

            # Fall back to FunctionNode.forward()
            chainerx_in_data, in_data, chainerx_device = (
                self._chainerx_apply_fallback_preprocess(in_data, inputs))
            self._is_chainerx_fallback_mode = True
            self.chainerx_device = chainerx_device

        utils._check_arrays_forward_compatible(in_data, self.label)

        is_debug = chainer.is_debug()
        if is_debug:
            # Keep stack trace for debug
            self.stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks > 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        for hook in hooks:
            hook.forward_preprocess(self, in_data)

        # Forward propagation
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            if chainer.config.schedule_func is not None:
                outputs = static_forward_optimizations(self, in_data)
            elif self._is_chainerx_fallback_mode:
                # In ChainerX fallback, __class__ is temporarily replaced with
                # the fabricated one with automatic attribute fallback.
                with _chainerx_attribute_fallback(self, chainerx_device):
                    outputs = self.forward(in_data)
            else:
                # In normal case, simply run the forward method.
                outputs = self.forward(in_data)

        # Check for output array types
        if not isinstance(outputs, tuple):
            raise TypeError(
                'forward output must be a tuple ({})\n'
                'Actual: {}'.format(self.label, type(outputs)))

        if not chainer.is_arrays_compatible(outputs):
            raise TypeError(
                'incompatible array types are mixed in the forward output '
                '({}).\n'
                'Actual: {}'.format(
                    self.label,
                    ', '.join(str(type(x)) for x in outputs)))

        for hook in hooks:
            hook.forward_postprocess(self, in_data)

        # NaN check of output values
        if is_debug:
            if any(chainer.backend._contains_nan(out)
                   for out in outputs):
                msg = ('NaN is detected on forward computation of '
                       '{}'.format(self.label))
                raise RuntimeError(msg)

        self._output_count = len(outputs)

        if self._is_chainerx_fallback_mode:
            ret = self._chainerx_apply_fallback_postprocess(
                chainerx_in_data, inputs, outputs)

        else:
            input_vars = [chainer.as_variable(x) for x in inputs]
            requires_grad = any([x.requires_grad for x in input_vars])

            ret = tuple(
                [variable.Variable(y, requires_grad=requires_grad)
                 for y in outputs])

            if configuration.config.enable_backprop:
                # Topological ordering
                self.rank = max(
                    [x.rank for x in input_vars]) if input_vars else 0
                # Add backward edges
                for y in ret:
                    y.creator_node = self
                self.inputs = tuple([x.node for x in input_vars])
                # Add forward edges (must be weak references)
                self.outputs = tuple([weakref.ref(y.node) for y in ret])

                if self._input_indexes_to_retain is not None:
                    for index in self._input_indexes_to_retain:
                        input_vars[index].retain_data()

                if self._output_indexes_to_retain is not None:
                    retained_data = []
                    for index in self._output_indexes_to_retain:
                        ret[index].retain_data()
                        retained_data.append(outputs[index])
                    self._retained_output_data = tuple(retained_data)

                self.lazy_grad_sum = configuration.config.lazy_grad_sum

        return ret
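In debug mode apply() records the creation stack trace and raises as soon as a forward pass produces NaN, which makes the offending function easy to locate. A hedged usage sketch, assuming a Chainer version where debug mode is exposed through chainer.using_config('debug', True); taking F.log of a negative input is just one convenient way to produce NaN:

import numpy
import chainer
import chainer.functions as F

x = chainer.Variable(numpy.array([-1.0], dtype=numpy.float32))

with chainer.using_config('debug', True):
    try:
        y = F.log(x)              # log of a negative number yields NaN
    except RuntimeError as e:
        print(e)                  # NaN is detected on forward computation of ...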
Example #32
    def backward(self, axis, gamma, gy, x, xp, expander, mean, inv_std, eps,
                 var):
        return cudnn.batch_normalization_backward(x, gamma, gy, mean, inv_std,
                                                  eps, self.is_for_conv2d,
                                                  self.cudnn_mode,
                                                  chainer.is_debug())
Example #33
    def setUp(self):
        self.link = links.EmbedID(2, 2, ignore_label=self.ignore_label)
        self.t = numpy.array([self.t_value], dtype=numpy.int32)
        self.original_debug = chainer.is_debug()
        chainer.set_debug(True)
Example #34
    def backward(self, retain_grad=False):
        """Runs error backpropagation (a.k.a. backprop) from this variable.

        On backprop, :meth:`Function.backward` is called on each
        :class:`Function` object appearing in the backward graph starting from
        this variable. The backward graph is represented by backward references
        from variables to their creators, and from functions to their inputs.
        The backprop stops at all root variables. Some functions set ``None``
        as gradients of some inputs, where further backprop does not take place
        at such input variables.

        This method uses :data:`grad` as the initial error array. User can
        manually set a gradient array before calling this method. If
        :data:`data` contains only one element (i.e., it is scalar) and
        :data:`grad` is ``None``, then this method automatically complements
        1.0 as the initial error. This is useful on starting backprop from
        some scalar loss value.

        Args:
            retain_grad (bool): If ``True``, the gradient arrays of all
                intermediate variables are kept. Otherwise, :data:`grad` of the
                intermediate variables are set to ``None`` on appropriate
                timing, which may reduce the maximum memory consumption.

                In most cases of training some models, the purpose of backprop
                is to compute gradients of parameters, not of variables, so it
                is recommended to set this flag ``False``.

        """
        if self.creator is None:
            return
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        seen_vars = set()
        need_copy = set()

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self.grad is None:
            with cuda.get_device(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator)

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            outputs = [y() for y in func.outputs]  # access via weak ref

            in_data = tuple([x.data for x in func.inputs])
            out_grad = tuple([None if y is None else y.grad for y in outputs])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)

            cuda.get_device(*(in_data + out_grad)).use()
            for hook in six.itervalues(hooks):
                hook.backward_preprocess(func, in_data, out_grad)
            gxs = func.backward(in_data, out_grad)
            assert len(gxs) == len(in_data)
            for hook in six.itervalues(hooks):
                hook.backward_postprocess(func, in_data, out_grad)

            if is_debug:
                for gx in gxs:
                    if gx is None:
                        continue
                    cuda.get_device(gx).use()
                    if cuda.get_array_module(gx).isnan(gx).any():
                        msg = 'NaN is detected on backward computation'
                        raise RuntimeError(msg)

            if not retain_grad:
                for y in outputs:
                    if y is not None and y is not self:
                        y.grad = None
            for x, gx in zip(func.inputs, gxs):
                if gx is None:
                    continue

                _check_grad_type(func, x, gx)

                # Accumulate the gradient to x. It is a bit tricky to handle
                # branches and parameter gradient accumulation correctly.
                id_x = id(x)
                if x.creator is None:  # leaf
                    if x._grad is None:
                        x.grad = gx
                        need_copy.add(id_x)
                    else:
                        cuda.get_device(gx).use()
                        if id_x in need_copy:
                            x.grad = utils.force_array(x.grad + gx)  # copy
                            need_copy.remove(id_x)
                        else:
                            x._grad += gx
                else:  # not a leaf
                    add_cand(x.creator)
                    if id_x not in seen_vars:  # 1st visit
                        x.grad = gx
                        seen_vars.add(id_x)
                        need_copy.add(id_x)
                    else:
                        cuda.get_device(gx).use()
                        if id_x in need_copy:  # 2nd visit
                            x._grad = utils.force_array(gx + x._grad)  # copied
                            need_copy.remove(id_x)
                        else:  # 3rd or later visit
                            x._grad += gx
            del gxs  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()
Example #35
    def apply(self, inputs):
        """Computes output variables and grows the computational graph.

        Basic behavior is expressed in the documentation of
        :class:`FunctionNode`.

        .. note::

           If the :data:`~Variable.data` attribute of input variables exists on
           a GPU device, that device is made current before calling
           :meth:`forward`, so implementors do not need to take care of device
           selection in most cases.

        Args:
            inputs: Tuple of input variables. Each element can be either
                :class:`~chainer.Variable`, :class:`numpy.ndarray`,
                or :class:`cupy.ndarray`. If the element is an ndarray, it is
                automatically wrapped with :class:`~chainer.Variable`.

        Returns:
            A tuple of output :class:`~chainer.Variable` objects.

        """
        input_vars = [chainer.as_variable(x) for x in inputs]
        in_data = tuple([x.data for x in input_vars])
        requires_grad = any([x.requires_grad for x in input_vars])

        # Check for input array types
        if not chainer.is_arrays_compatible(in_data):
            raise TypeError(
                'incompatible array types are mixed in the forward input '
                '({}).\n'
                'Actual: {}'.format(self.label,
                                    ', '.join(str(type(x)) for x in in_data)))

        is_debug = chainer.is_debug()
        if is_debug:
            # Keep stack trace for debug
            self.stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks > 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        for hook in hooks:
            hook.forward_preprocess(self, in_data)

        # Forward propagation
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            if chainer.config.schedule_func is not None:
                outputs = static_forward_optimizations(self, in_data)
            else:
                outputs = self.forward(in_data)

        # Check for output array types
        if not isinstance(outputs, tuple):
            raise TypeError('forward output must be a tuple ({})\n'
                            'Actual: {}'.format(self.label, type(outputs)))

        if not chainer.is_arrays_compatible(outputs):
            raise TypeError(
                'incompatible array types are mixed in the forward output '
                '({}).\n'
                'Actual: {}'.format(self.label,
                                    ', '.join(str(type(x)) for x in outputs)))

        for hook in hooks:
            hook.forward_postprocess(self, in_data)

        # NaN check of output values
        if is_debug:
            if any(chainer.backend._contains_nan(out) for out in outputs):
                msg = ('NaN is detected on forward computation of '
                       '{}'.format(self.label))
                raise RuntimeError(msg)

        ret = tuple([
            variable.Variable(y, requires_grad=requires_grad) for y in outputs
        ])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max([x.rank for x in input_vars]) if input_vars else 0
            # Add backward edges
            for y in ret:
                y.creator_node = self
            self.inputs = tuple([x.node for x in input_vars])
            # Add forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            if self._input_indexes_to_retain is not None:
                for index in self._input_indexes_to_retain:
                    input_vars[index].retain_data()

            if self._output_indexes_to_retain is not None:
                retained_data = []
                for index in self._output_indexes_to_retain:
                    ret[index].retain_data()
                    retained_data.append(outputs[index])
                self._retained_output_data = tuple(retained_data)

            self.lazy_grad_sum = configuration.config.lazy_grad_sum

        return ret
Example #36
    def setUp(self):
        self.x = numpy.random.uniform(-1, 1, (2, 2)).astype(numpy.float32)
        # `0` is required to avoid NaN
        self.t = numpy.array([self.t_value, 0], dtype=numpy.int32)
        self.original_debug = chainer.is_debug()
        chainer.set_debug(True)
Example #37
    def setUp(self):
        self.x = numpy.random.uniform(-1, 1, (1, 2)).astype(numpy.float32)
        self.t = numpy.array([self.t_value], dtype=numpy.int32)
        self.original_debug = chainer.is_debug()
        chainer.set_debug(True)
Example #38
    def forward(self, inputs):
        x, t = inputs[:2]
        rest = len(inputs) - 2
        head_W, Ws = inputs[2], inputs[3:2 + (rest - 1) // 2 + 1]
        Rs = inputs[2 + (rest - 1) // 2 + 1:]
        n_tails = len(Rs)
        # minus_inf = -1024.
        minus_inf = -numpy.inf
        xp = cuda.get_array_module(x)

        if chainer.is_debug():
            _check_input_values(x, t, self.ignore_label)

        self.retain_inputs(tuple(six.moves.range(len(inputs))))

        cluster_hots = []
        for i in six.moves.range(1, n_tails + 1):
            lower, upper = self.cutoff[i], self.cutoff[i + 1]
            in_cluster = xp.logical_and(lower <= t, t < upper)
            if self.output_all:
                in_cluster = xp.ones(
                    in_cluster.shape, dtype=in_cluster.dtype)
            cluster_hots.append(in_cluster)
        self.cluster_hots = cluster_hots

        self.head = self.linear(x, head_W)
        self.ls_head = log_softmax._log_softmax(self.head)
        self.reduced_xs = []
        self.tails = []
        self.ls_tails = []
        for i, in_cluster in enumerate(cluster_hots, start=1):
            tail_idx = i - 1
            if xp.any(in_cluster):
                reduced_x = self.linear(x[in_cluster], Rs[tail_idx])
                self.reduced_xs.append(reduced_x)
                out = self.linear(reduced_x, Ws[tail_idx])
                self.tails.append(out)
                ls_out = log_softmax._log_softmax(out)
                self.ls_tails.append(ls_out)
            else:
                self.reduced_xs.append(None)
                self.tails.append(None)
                self.ls_tails.append(None)

        n_head_out = head_W.shape[0] - n_tails
        n_out = n_head_out + sum(W.shape[0] for W in Ws)
        shape = (x.shape[0], n_out)

        log_y = xp.full(shape, minus_inf, dtype=x.dtype)

        log_y[:, :n_head_out] = self.ls_head[:, :n_head_out]
        for i, (in_cluster, tail) in enumerate(
                zip(cluster_hots, self.ls_tails), start=1):
            if tail is None:
                continue
            lower, upper = self.cutoff[i], self.cutoff[i + 1]

            tail_main = self.ls_head[:, n_head_out + i - 1]
            tail_main_in = xp.broadcast_to(
                tail_main[in_cluster][:, None], tail.shape)
            log_y[xp.nonzero(in_cluster)[0], lower:upper] = tail_main_in + tail
            not_in_cluster = xp.logical_not(in_cluster)
            log_y[xp.nonzero(not_in_cluster)[0],
                  lower] = tail_main[not_in_cluster]

        return log_y,
Example #39
def forward_grad(self, rho=1e-3, decay=0.50, loss_scale=None):
    """test
    """
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []
    seen_set = set()

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)

    cur_decay = 1.0
    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = [
            i for i, x in enumerate(inputs) if x.requires_grad
        ]
        if not target_input_indexes:
            continue

        in_data = tuple([x.data for x in inputs])
        cuda.get_device_from_array(*in_data).use()
        if hasattr(func, 'with_frad') and func.with_frad:
            gW, gb = func.forward_grad(in_data, rho)
            gxs = [None, Variable(gW * cur_decay), Variable(gb * cur_decay)]
            cur_decay *= decay
        else:
            gxs = [None] * len(inputs)

        if is_debug:
            for gx in gxs:
                if gx is None:
                    continue
                gx_data = gx.data
                if gx_data.dtype.kind == 'f':
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        raise RuntimeError(
                            'NaN is detected on forward-grad computation of '
                            '{}'.format(func.label))

        for i, gx in enumerate(gxs):
            x = inputs[i]
            if x.creator_node is not None:
                add_cand(x.creator_node)

            if gx is None:
                continue

            if not x.requires_grad:
                continue

            _check_grad_type(func, x, gx.data)

            x_var = x.get_variable_or_none()
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale

        del gxs  # to reduce memory usage
        if initial_device is not None:
            initial_device.use()
Example #40
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale):
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)
    ret_dict = {}

    is_debug = chainer.is_debug()
    base_hooks = chainer.get_function_hooks().values()
    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        ys = [y() for y in func.outputs]  # access via weak ref
        gys = tuple([grads.pop(y) for y in ys])

        for node, gy in six.moves.zip(ys, gys):
            if node is not None:
                if node in input_nodes:
                    ret_dict[node] = gy

                if retain_grad:
                    y = node.get_variable_or_none()
                    if y is not None:
                        y.grad_var = gy
                        y._loss_scale = loss_scale

        # Collect the gradients w.r.t. the inputs
        input_indexes = []
        x_grads = collections.OrderedDict()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x not in x_grads:
                x_grads[x] = grads.get_as_list(x)
        if not input_indexes:
            continue
        input_indexes = tuple(input_indexes)

        # Do backward

        # Call pre-backward hooks
        if func._n_local_function_hooks != 0:
            local_hooks = collections.OrderedDict(chainer.get_function_hooks())
            local_hooks.update(func.local_function_hooks)
            hooks = local_hooks.values()  # avoid six for performance
        else:
            hooks = base_hooks

        in_data = [x.data for x in func.inputs]
        out_grad_data = [None if g is None else g.data for g in gys]

        with cuda.get_device_from_array(*in_data):
            for hook in hooks:
                hook.backward_preprocess(
                    func, tuple(in_data), tuple(out_grad_data))

            _backprop_utils.backprop_step(func, input_indexes, gys, x_grads,
                                          is_debug)

            # Call post-backward hooks
            for hook in hooks:
                hook.backward_postprocess(
                    func, tuple(in_data), tuple(out_grad_data))

        # Update grads
        for node, g in x_grads.items():
            if not g:  # gradient == None
                continue

            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)

    for x in input_nodes:
        if x not in ret_dict:
            ret_dict[x] = grads.pop(x)
    return ret_dict
Example #41
    def setUp(self):
        self.x = numpy.arange(10).reshape((2, 5)).astype('f')
        self.ind = numpy.array(self.indices, 'i')
        self.debug = chainer.is_debug()
        chainer.set_debug(True)
Example #42
    def forward(self, inputs):
        self.retain_inputs((0, 1))
        x, gamma, beta = inputs

        xp = backend.get_array_module(x)
        if self.running_mean is None:
            self.running_mean = xp.zeros_like(gamma, dtype=x.dtype)
            self.running_var = xp.zeros_like(gamma, dtype=x.dtype)

        self.axis = _compute_axis(x.ndim, gamma.ndim, self.axis)
        self.key_axis = _compute_key_axis(x.ndim, gamma.ndim, self.axis)

        if all(x.shape[i] == 1 for i in self.axis):
            if 0 in self.axis:
                warnings.warn(
                    'A batch with no more than one sample has been given'
                    ' to F.batch_normalization. F.batch_normalization'
                    ' will always output a zero tensor for such batches.'
                    ' This could be caused by incorrect configuration in'
                    ' your code (such as running evaluation while'
                    ' chainer.config.train=True),'
                    ' but could also happen in the last batch of training'
                    ' if non-repeating iterator is used.', UserWarning)
            else:
                warnings.warn(
                    'F.batch_normalization received a batch with single'
                    ' dimensions along all axes that are used for aggregating'
                    ' statistics. F.batch_normalization'
                    ' will always output a zero tensor for such batches.',
                    UserWarning)

        # TODO(niboshi): Refactor calculation of expander and axis into a
        # function and call it just before they are used.

        # expander inserts singleton dimensions to gamma and beta so that they
        # can be broadcasted with x.
        expander = [None for _ in range(x.ndim)]
        for i in self.key_axis:
            expander[i] = slice(None)
        expander = tuple(expander)
        self.expander = expander

        self.mode = _BNMode(x, gamma, self.key_axis)
        self.use_cudnn = self.mode.can_use_cudnn(xp)
        self.use_ideep = self.mode.can_use_ideep()

        if self.use_ideep:
            # TODO(niboshi): Refactor iDeep part into a separate method
            expand_dim = False
            if x.ndim == 2:
                expand_dim = True
                x = x[:, :, None, None]

            y, self.mean, self.var, self.inv_std = (
                intel64.ideep.batchNormalization.Forward(
                    intel64.ideep.array(x.astype(gamma.dtype, copy=False)),
                    intel64.ideep.array(gamma), intel64.ideep.array(beta),
                    None, None, self.eps))
            y = y.astype(x.dtype, copy=False)

            m = x.size // gamma.size
            adjust = m / max(m - 1., 1.)

            # Update running_mean
            if isinstance(self.running_mean, intel64.ideep.mdarray):
                self.running_mean.inplace_axpby(self.decay, (1 - self.decay),
                                                self.mean)
            else:
                self.running_mean *= self.decay
                self.running_mean += self.mean * (1 - self.decay)

            # Update running_var
            if isinstance(self.running_var, intel64.ideep.mdarray):
                self.running_var.inplace_axpby(self.decay, (1 - self.decay),
                                               self.var * adjust)
            else:
                self.running_var *= self.decay
                self.running_var += self.var * adjust * (1 - self.decay)

            if expand_dim:
                y = numpy.squeeze(y, axis=(2, 3))

        elif self.use_cudnn:
            # self.mean and self.inv_std are used as buffers to save
            # intermediate results computed during forward pass. These buffers
            # are used to speed-up backward pass.
            y, self.mean, self.inv_std = (
                cudnn.batch_normalization_forward_training(
                    x, gamma, beta, self.running_mean, self.running_var, None,
                    None, self.eps, self.decay, self.mode.is_for_conv2d,
                    self.mode.get_cudnn_mode(), chainer.is_debug()))
        else:
            # Generic CPU and GPU implementation

            gamma = gamma[expander]
            beta = beta[expander]
            self.mean = x.mean(axis=self.axis, dtype=gamma.dtype)
            var = x.var(axis=self.axis, dtype=gamma.dtype)
            if xp is numpy:
                self.inv_std = numpy.reciprocal(
                    numpy.sqrt(var + self.eps, dtype=gamma.dtype))
            else:
                self.inv_std = cuda.cupyx.rsqrt(var + self.eps,
                                                dtype=gamma.dtype)
            y = _apply_bn_fwd(xp, x, self.mean[expander],
                              self.inv_std[expander], gamma, beta)
            # Update running statistics
            m = x.size // gamma.size
            adjust = m / max(m - 1., 1.)  # unbiased estimation

            xp = backend.get_array_module(self.running_mean, self.running_var)
            if xp is chainerx:
                self.running_mean, self.running_var = backend.from_chx(
                    (self.running_mean, self.running_var))

            self.running_mean *= self.decay
            self.running_mean += (1 - self.decay) * self.mean
            self.running_var *= self.decay
            self.running_var += (1 - self.decay) * adjust * var

            if xp is chainerx:
                self.running_mean = backend.to_chx(self.running_mean)
                self.running_var = backend.to_chx(self.running_var)

        return y,
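
The generic branch above computes y = (x - mean) * inv_std * gamma + beta over the aggregation axes and refreshes the running statistics with an exponential moving average. A minimal sketch of the public entry point (assuming the usual F.batch_normalization keywords for running statistics):

import numpy as np
import chainer.functions as F

x = np.random.randn(8, 3).astype(np.float32)          # (batch, channel)
gamma = np.ones(3, dtype=np.float32)
beta = np.zeros(3, dtype=np.float32)
running_mean = np.zeros(3, dtype=np.float32)
running_var = np.zeros(3, dtype=np.float32)

y = F.batch_normalization(x, gamma, beta, eps=2e-5,
                          running_mean=running_mean,
                          running_var=running_var, decay=0.9)
# running_mean and running_var are updated in place, as in the forward above.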
Exemplo n.º 43
0
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            if self.data.ndim != 0:
                warnings.warn(
                    'Treating a scalar as a variable with only one element'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.', DeprecationWarning)
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple(
                [i for i, x in enumerate(inputs) if x.requires_grad])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            with cuda.get_device_from_array(*(in_data + out_grad_data)):
                for hook in hooks:
                    hook.backward_preprocess(func, in_data, out_grad_data)

                # Collect the current input gradients.
                target_inputs = [inputs[i] for i in target_input_indexes]
                # Keep the order for the portability, rather than
                # in_grad = {x: grads.get_as_list(x)
                #            for x in set(target_inputs)}
                in_grad = collections.OrderedDict()
                for x in target_inputs:
                    if x not in in_grad:
                        in_grad[x] = grads.get_as_list(x)

                _backprop_utils.backprop_step(func, target_input_indexes,
                                              out_grad, in_grad)

                for hook in hooks:
                    hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                # each grad is a list of variables
                # iter_gxs expands it as a sequence of variables.
                def iter_gxs(gxs):
                    for gx in gxs:
                        for gx_elem in gx:
                            yield gx_elem

                for gx in iter_gxs(in_grad.values()):
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        with cuda.get_device_from_array(gx_data):
                            xp = cuda.get_array_module(gx_data)
                            if xp.isnan(gx_data).any():
                                raise RuntimeError(
                                    'NaN is detected on backward computation '
                                    'of {}'.format(func.label))

            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None and y is not self.node:
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = gy if retain_grad else None

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, gx_elem.data)

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)

            del in_grad  # to reduce memory usage

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
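
A minimal sketch of the retain_grad behaviour implemented above; gradients of intermediate variables are kept only when the flag is set:

import numpy as np
import chainer

x = chainer.Variable(np.array(2.0, dtype=np.float32))
h = x * x                       # intermediate variable
y = h + 1.0
y.backward(retain_grad=True)
# x.grad -> 4.0 and h.grad -> 1.0; with the default retain_grad=False,
# h.grad would be None after the call.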
Exemplo n.º 44
0
    def setUp(self):
        self.default_debug = chainer.is_debug()
        chainer.set_debug(True)
Exemplo n.º 45
0
    def setUp(self):
        self.default_debug = chainer.is_debug()
        chainer.set_debug(True)

        self.x_data = numpy.random.uniform(-1, 1, (4, 3, 2))
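
The save/restore pattern in these fixtures can also be expressed as a scoped configuration change; a sketch assuming the 'debug' configuration key that chainer.is_debug() reads:

import chainer

with chainer.using_config('debug', True):
    assert chainer.is_debug()   # debug checks are active inside the block
# the previous value is restored automatically on exit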
Exemplo n.º 46
0
def _backprop_to_all(outputs, retain_grad, loss_scale):
    """Backprop to all input variables

    Args:
        outputs (list of tuple): each tuple is (y_node, y_grad_var).
            y_grad_var should not be None.
        retain_grad (bool): see docstring of Variable.backward
        loss_scale (float): see docstring of Variable.backward

    """
    OrderedDict = chainer.utils._collections.OrderedDict  # fix py2 memory leak

    cand_funcs = []
    seen_set = set()

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    grads = _backprop_utils.GradTable(accumulate_grad_inputs=True)

    leaf_nodes = set()

    for y, gy in outputs:
        grads.accumulate(y, gy)

        func = y.creator_node
        if func is None:  # leaf
            leaf_nodes.add(y)
        else:
            add_cand(func)

    # Fix F812 (Python 2)
    y = None
    del y

    is_debug = chainer.is_debug()
    base_hooks = chainer.get_function_hooks().values()
    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y)
                          if y is not None and y.creator_node is not None
                          else None
                          for y in outputs])
        if not target_input_indexes:
            continue

        in_data = [x.data for x in inputs]
        out_grad_array = [None if g is None else g.raw_array for g in out_grad]
        if func._n_local_function_hooks != 0:
            local_hooks = collections.OrderedDict(chainer.get_function_hooks())
            local_hooks.update(func.local_function_hooks)
            hooks = local_hooks.values()  # avoid six for performance
        else:
            hooks = base_hooks

        with chainer.using_device(
                backend.get_device_from_array(*(in_data + out_grad_array))):
            for hook in hooks:
                hook.backward_preprocess(
                    func, tuple(in_data), tuple(out_grad_array))

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad, is_debug)

            for hook in hooks:
                hook.backward_postprocess(
                    func, tuple(in_data), tuple(out_grad_array))

        if retain_grad:
            # The gradients of the outputs of `func` are final. Store them if
            # retain_grad=True.
            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None:
                    y._set_grad_var_if_available(gy)
            del gy  # to reduce memory usage
        del out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                if gx_elem is not None:
                    chainer.variable._check_grad_type(
                        func, x, True, gx_elem.raw_array)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._set_grad_var_without_check(gx)
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
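
The routine above starts from (output node, output gradient) pairs whose gradient must not be None; for a non-scalar output that means setting .grad before calling backward. A minimal sketch:

import numpy as np
import chainer

x = chainer.Variable(np.array([1.0, 2.0], dtype=np.float32))
y = 3.0 * x
y.grad = np.ones_like(y.data)   # initial output gradient, required for non-scalar y
y.backward()
# x.grad -> [3., 3.]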
Exemplo n.º 47
0
    def backward(self, retain_grad=False):
        """Runs error backpropagation (a.k.a. backprop) from this variable.

        On backprop, :meth:`FunctionNode.backward` is called on each
        :class:`FunctionNode` object appearing in the backward graph starting
        from this variable. The backward graph is represented by backward
        references from variable nodes to their creators, and from function
        nodes to their input variable nodes. The backprop stops at all root
        nodes. Some function nodes set ``None`` as gradients of some inputs,
        in which case further backprop does not take place at those inputs.

        This method uses :data:`grad` as the initial error array. The user
        can manually set a gradient array before calling this method. If
        :data:`data` contains only one element (i.e., it is a scalar) and
        :data:`grad` is ``None``, then this method automatically uses 1.0
        as the initial error. This is useful when starting backprop from a
        scalar loss value.

        Note that this method does not support *differentiable backprop*. Use
        :func:`grad` to compute the gradient of gradients.

        Args:
            retain_grad (bool): If ``True``, the gradient arrays of all
                intermediate variables are kept. Otherwise, :data:`grad` of
                the intermediate variables is set to ``None`` at an
                appropriate time, which may reduce the maximum memory
                consumption.

                In most model-training cases, the purpose of backprop is to
                compute gradients of parameters, not of all variables, so it
                is recommended to set this flag to ``False``.

        """
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = {}

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)

        def get_grad(node):
            if node is None:
                return None
            if node in grads:
                return grads[node]
            return node.grad_var

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            outputs = [y() for y in func.outputs]  # access via weak ref

            in_data = tuple([x.data for x in inputs])
            out_grad = tuple([get_grad(y) for y in outputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*in_data).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            #
            # Note (Tokui): When the same variable is passed to multiple input
            # slots (e.g. an expression like ``f(x, x)``), it makes the
            # gradient accumulation complicated since the back-propagated
            # gradients w.r.t. the first and second argument should be
            # accumulated to the current gradient w.r.t. the same variable.
            # In this case, the current implementation passes the current
            # gradient only to the first occurrence of the variable in the
            # input tuple and passes ``None`` to the rest of the occurrences.
            # For example, when the input variables are ``(x, x)``, the
            # input gradient passed to the ``backward_accumulate`` method is
            # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
            # See also the docstring of ``FunctionNode.backward_accumulate``.
            target_input_indexes = [
                i for i, x in enumerate(inputs) if x.requires_grad
            ]
            target_inputs = [inputs[i] for i in target_input_indexes]
            in_grad = []
            for i, index_i in enumerate(target_input_indexes):
                x = inputs[index_i]
                if x in target_inputs[:i]:
                    # Pass ``None`` for duplicated input variables except for
                    # the first occurrence (see the comment above).
                    gx = None
                elif x in grads:
                    gx = grads[x]
                elif x.creator_node is None:
                    x._check_old_style_gradient()
                    # accumulate the gradient only if the node is a leaf
                    gx = x.grad_var
                else:
                    gx = None
                in_grad.append(gx)

            gxs = func.backward_accumulate(target_input_indexes, out_grad,
                                           in_grad)

            assert len(gxs) == len(in_grad)
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                for gx in gxs:
                    if gx is None:
                        continue
                    gx_data = gx.data
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        msg = 'NaN is detected on backward computation'
                        raise RuntimeError(msg)

            if not retain_grad:
                for y in outputs:
                    if y is not None and y is not self.node:
                        grads[y] = None
                        y_var = y.get_variable()
                        if y_var is not None:
                            y_var._grad_var = None

            for i, gx in enumerate(gxs):
                if gx is None:
                    continue

                x = target_inputs[i]
                if not x.requires_grad:
                    continue

                _check_grad_type(func, x, gx.data)

                if x in target_inputs[:i]:
                    # Accumulate the duplicated gradients here. See the comment
                    # above the code that builds ``in_grad``.
                    cur_gx = grads[x]
                    grads[x] = gx if cur_gx is None else gx + cur_gx
                else:
                    grads[x] = gx

                x_var = x.get_variable()
                if x_var is not None:
                    x_var._grad_var = grads[x]

                if x.creator_node is not None:
                    add_cand(x.creator_node)

            del gxs  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()
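
The long comment above describes how gradients are accumulated when the same variable occupies several input slots, e.g. f(x, x). A minimal sketch of that case:

import numpy as np
import chainer

x = chainer.Variable(np.array([2.0], dtype=np.float32))
y = x * x                       # the same variable feeds both inputs of the multiply node
y.grad = np.ones_like(y.data)
y.backward()
# both occurrences contribute, so x.grad -> [4.] (i.e. 2 * x)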
Exemplo n.º 48
0
    def apply(self, inputs):
        """Computes output variables and grows the computational graph.

        Basic behavior is expressed in the documentation of
        :class:`FunctionNode`.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           reside on a GPU device, that device is made current before calling
           :meth:`forward`, so implementors do not need to take care of
           device selection in most cases.

        Args:
            inputs: Tuple of input variables. Each element can be either
                :class:`Variable`, :class:`numpy.ndarray`,
                or :class:`cupy.ndarray`. If the element is an ndarray, it is
                automatically wrapped with :class:`Variable`.

        Returns:
            A tuple of output :class:`Variable` objects.

        """
        input_vars = [
            x if isinstance(x, variable.Variable) else variable.Variable(
                x, requires_grad=False) for x in inputs
        ]
        in_data = tuple([x.data for x in input_vars])
        requires_grad = any([x.requires_grad for x in input_vars])

        if chainer.is_debug():
            self.stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks > 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        for hook in hooks:
            hook.forward_preprocess(self, in_data)

        # Forward propagation
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            outputs = self.forward(in_data)
            assert type(outputs) is tuple

        for hook in hooks:
            hook.forward_postprocess(self, in_data)

        # NaN check of output values
        if chainer.is_debug():
            if any(out.dtype.kind == 'f'
                   and cuda.get_array_module(out).isnan(out).any()
                   for out in outputs):
                msg = ('NaN is detected on forward computation of '
                       '{}'.format(self.label))
                raise RuntimeError(msg)

        ret = tuple([
            variable.Variable(y, requires_grad=requires_grad) for y in outputs
        ])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max([x.rank for x in input_vars]) if input_vars else 0
            # Add backward edges
            for i, y in enumerate(ret):
                y.creator_node = self
            self.inputs = tuple([x.node for x in input_vars])
            # Add forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            if self._input_indexes_to_retain is not None:
                for index in self._input_indexes_to_retain:
                    input_vars[index].retain_data()

            if self._output_indexes_to_retain is not None:
                retained_data = []
                for index in self._output_indexes_to_retain:
                    ret[index].retain_data()
                    retained_data.append(outputs[index])
                self._retained_output_data = tuple(retained_data)

        return ret
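
A minimal sketch of a FunctionNode subclass driven through apply(); the Square node is purely illustrative and not part of Chainer:

import numpy as np
import chainer

class Square(chainer.FunctionNode):
    def forward(self, inputs):
        x, = inputs
        self.retain_inputs((0,))            # keep x for the backward pass
        return x * x,

    def backward(self, target_input_indexes, grad_outputs):
        x, = self.get_retained_inputs()
        gy, = grad_outputs
        return 2.0 * x * gy,

x = chainer.Variable(np.array([3.0], dtype=np.float32))
y, = Square().apply((x,))
y.grad = np.ones_like(y.data)
y.backward()
# x.grad -> [6.]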
Exemplo n.º 49
0
    def __call__(self, *inputs):
        """Applies forward propagation with chaining backward references.

        Basic behavior is expressed in documentation of :class:`Function`
        class.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           reside on a GPU device, the appropriate device is selected before
           the :meth:`forward` method is called, so in most cases
           implementers do not need to take care of device selection.

        Args:
            inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or
                :class:`cupy.ndarray` objects.
                If the input is a :class:`numpy.ndarray` or a
                :class:`cupy.ndarray`, it is automatically wrapped with
                :class:`Variable`.

        Returns:
            One :class:`Variable` object or a tuple of multiple
            :class:`Variable` objects.

        """

        inputs = [
            x if isinstance(x, variable.Variable) else variable.Variable(
                x, requires_grad=False) for x in inputs
        ]
        in_data = tuple([x.data for x in inputs])
        requires_grad = any([x.requires_grad for x in inputs])

        if chainer.is_debug():
            self._stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        for hook in six.itervalues(hooks):
            hook.forward_preprocess(self, in_data)

        # Forward prop
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            outputs = self.forward(in_data)
            assert type(outputs) == tuple
        for hook in six.itervalues(hooks):
            hook.forward_postprocess(self, in_data)

        if chainer.is_debug():
            if any(out.dtype.kind == 'f'
                   and cuda.get_array_module(out).isnan(out).any()
                   for out in outputs):
                msg = 'NaN is detected on forward computation'
                raise RuntimeError(msg)

        ret = tuple([
            variable.Variable(y, requires_grad=requires_grad) for y in outputs
        ])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max([x.rank for x in inputs]) if inputs else 0
            # Backward edges
            for y in ret:
                y.set_creator(self)
            self.inputs = tuple([x.node for x in inputs])
            # Forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            input_indexes_to_retain = self._input_indexes_to_retain
            if input_indexes_to_retain is None:
                # input arrays are retained by default
                input_indexes_to_retain = six.moves.range(len(inputs))
            for index in input_indexes_to_retain:
                inputs[index].retain_data()
            del self._input_indexes_to_retain

            output_indexes_to_retain = self._output_indexes_to_retain
            if output_indexes_to_retain is not None:
                for index in output_indexes_to_retain:
                    ret[index].retain_data()
            del self._output_indexes_to_retain

        if len(ret) == 1:
            return ret[0]
        else:
            return ret
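
For contrast with the FunctionNode.apply path of the previous example, a minimal sketch of the old-style chainer.Function interface that this __call__ implements; the Double function is illustrative only:

import numpy as np
import chainer

class Double(chainer.Function):
    def forward(self, inputs):
        x, = inputs
        return 2 * x,

    def backward(self, inputs, grad_outputs):
        gy, = grad_outputs
        return 2 * gy,

x = chainer.Variable(np.array([1.5], dtype=np.float32))
y = Double()(x)                 # __call__ wraps the array and chains the graph
y.grad = np.ones_like(y.data)
y.backward()
# x.grad -> [2.]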
Exemplo n.º 50
0
    def _backward_main(self, retain_grad):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = {}

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)

        def get_grad(node):
            if node is None:
                return None
            if node in grads:
                return grads[node]
            return node.grad_var

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = [
                i for i, x in enumerate(inputs) if x.requires_grad
            ]
            if not target_input_indexes:
                continue
            outputs = [y() for y in func.outputs]  # access via weak ref

            in_data = tuple([x.data for x in inputs])
            out_grad = tuple([get_grad(y) for y in outputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*in_data).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            #
            # Note (Tokui): When the same variable is passed to multiple input
            # slots (e.g. an expression like ``f(x, x)``), it makes the
            # gradient accumulation complicated since the back-propagated
            # gradients w.r.t. the first and second argument should be
            # accumulated to the current gradient w.r.t. the same variable.
            # In this case, the current implementation passes the current
            # gradient only to the first occurrence of the variable in the
            # input tuple and passes ``None`` to the rest of the occurrences.
            # For example, when the input variables are ``(x, x)``, the
            # input gradient passed to the ``backward_accumulate`` method is
            # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
            # See also the docstring of ``FunctionNode.backward_accumulate``.
            target_inputs = [inputs[i] for i in target_input_indexes]
            in_grad = []
            for i, index_i in enumerate(target_input_indexes):
                x = inputs[index_i]
                if x in target_inputs[:i]:
                    # Pass ``None`` for duplicated input variables except for
                    # the first occurrence (see the comment above).
                    gx = None
                elif x in grads:
                    gx = grads[x]
                elif x.creator_node is None:
                    x._check_old_style_gradient()
                    # accumulate the gradient only if the node is a leaf
                    gx = x.grad_var
                else:
                    gx = None
                in_grad.append(gx)

            gxs = func.backward_accumulate(target_input_indexes, out_grad,
                                           in_grad)

            assert len(gxs) == len(in_grad)
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                for gx in gxs:
                    if gx is None:
                        continue
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        cuda.get_device_from_array(gx_data).use()
                        if cuda.get_array_module(gx_data).isnan(gx_data).any():
                            raise RuntimeError(
                                'NaN is detected on backward computation of '
                                '{}'.format(func.label))

            if not retain_grad:
                for y in outputs:
                    if y is not None and y is not self.node:
                        grads[y] = None
                        y_var = y.get_variable()
                        if y_var is not None:
                            y_var._grad_var = None

            for i, gx in enumerate(gxs):
                if gx is None:
                    continue

                x = target_inputs[i]
                if not x.requires_grad:
                    continue

                _check_grad_type(func, x, gx.data)

                if x in target_inputs[:i]:
                    # Accumulate the duplicated gradients here. See the comment
                    # above the code that builds ``in_grad``.
                    cur_gx = grads[x]
                    grads[x] = gx if cur_gx is None else gx + cur_gx
                else:
                    grads[x] = gx

                x_var = x.get_variable()
                if x_var is not None:
                    x_var._grad_var = grads[x]

                if x.creator_node is not None:
                    add_cand(x.creator_node)

            del gxs  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()
Exemplo n.º 51
0
    def setUp(self):
        self.original_debug = chainer.is_debug()
        chainer.set_debug(True)
        self.one = numpy.array([1], numpy.float32)
        self.f = chainer.FunctionNode()
Exemplo n.º 52
0
    def forward_gpu(self, inputs):
        class_weight = backend.from_chx(self.class_weight)

        self.retain_inputs((0, 1))
        x, t = inputs
        if x.ndim == t.ndim and x.shape == t.shape:
            self.soft_target = True
        cupy = cuda.cupy
        if chainer.is_debug() and not self.soft_target:
            _check_input_values(x, t, self.ignore_label)

        if x.size == 0:
            y = cupy.zeros(t.shape, dtype=x.dtype)
            if self.cache_score:
                self.y = y
            if self.reduce == 'mean':
                return y.sum(),
            else:
                return y,
        log_y = log_softmax._log_softmax(x)
        if self.cache_score:
            self.y = cupy.exp(log_y)

        if self.soft_target:
            return self._soft_target_loss(cupy, x, t, log_y)

        if class_weight is not None:
            shape = [1 if d != 1 else -1 for d in six.moves.range(x.ndim)]
            log_y *= cupy.broadcast_to(class_weight.reshape(shape), x.shape)

        log_y = cupy.rollaxis(log_y, 1, log_y.ndim)

        if self.reduce == 'mean':
            # Reduction is performed in a promoted dtype
            reduc_dtype = _reduction_dtype(x.dtype)
            if self.normalize:
                count = (t != self.ignore_label).sum(dtype=reduc_dtype)
                count = cupy.maximum(1, count)
                coeff = 1. / count
            else:
                coeff = cupy.array(1. / max(1, len(t)), dtype=reduc_dtype)
            self._coeff = coeff

            ret = cuda.reduce(
                'S t, raw T log_y, int32 n_channel, raw U coeff, '
                'S ignore_label', 'U out',
                't == ignore_label ? T(0) : log_y[_j * n_channel + t]',
                'a + b', 'out = static_cast<U>(a * -coeff[0])', '0',
                'crossent_fwd')(t, log_y.reduced_view(), log_y.shape[-1],
                                self._coeff, self.ignore_label)
            ret = ret.astype(log_y.dtype, copy=False)
        else:
            ret = cuda.elementwise(
                'S t, raw T log_y, int32 n_channel, T ignore', 'T out', '''
                if (t == ignore) {
                  out = 0;
                } else {
                  out = -log_y[i * n_channel + t];
                }
                ''', 'softmax_crossent_no_reduce_fwd')(t, log_y.reduced_view(),
                                                       log_y.shape[-1],
                                                       self.ignore_label)
            ret = ret.reshape(t.shape)
        return ret,
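
A minimal usage sketch of the public function this GPU kernel implements; -1 is the default ignore_label, and in debug mode _check_input_values validates the targets before the kernel runs:

import numpy as np
import chainer.functions as F

x = np.random.randn(4, 3).astype(np.float32)    # unnormalized scores for 3 classes
t = np.array([0, 2, -1, 1], dtype=np.int32)     # the -1 entry is ignored

loss = F.softmax_cross_entropy(x, t)                       # scalar, averaged over non-ignored rows
per_sample = F.softmax_cross_entropy(x, t, reduce='no')    # shape (4,), zero at the ignored row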
Exemplo n.º 53
0
    def __call__(self, *inputs):
        """Applies forward propagation with chaining backward references.

        Basic behavior is expressed in documentation of :class:`Function`
        class.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           reside on a GPU device, the appropriate device is selected before
           the :meth:`forward` method is called, so in most cases
           implementers do not need to take care of device selection.

        Args:
            inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or
                :class:`cupy.ndarray` objects. The volatile flags of all input
                variables must agree. If the input is a :class:`numpy.ndarray`
                or a :class:`cupy.ndarray`, it is automatically wrapped with
                :class:`Variable`.

        Returns:
            One :class:`Variable` object or a tuple of multiple
            :class:`Variable` objects.

        """

        inputs = [
            x if isinstance(x, chainer.Variable) else chainer.Variable(
                x, volatile=flag.AUTO) for x in inputs
        ]

        in_data = tuple([x.data for x in inputs])
        if chainer.is_debug():
            self._stack = traceback.extract_stack()

        if self.type_check_enable:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        for hook in six.itervalues(hooks):
            hook.forward_preprocess(self, in_data)
        # Forward prop
        with cuda.get_device(*in_data):
            outputs = self.forward(in_data)
            assert type(outputs) == tuple
        for hook in six.itervalues(hooks):
            hook.forward_postprocess(self, in_data)

        if chainer.is_debug():
            if any(out.dtype.kind == 'f'
                   and cuda.get_array_module(out).isnan(out).any()
                   for out in outputs):
                msg = 'NaN is detected on forward computation'
                raise RuntimeError(msg)

        out_v = flag.aggregate_flags([x.volatile for x in inputs])
        ret = tuple([variable.Variable(y, volatile=out_v) for y in outputs])

        if out_v == 'on':
            build_graph = False
        elif out_v == 'off':
            build_graph = True
        else:
            build_graph = getattr(_thread_local, 'default_backprop', True)

        if build_graph:
            # Topological ordering
            self.rank = max([x.rank for x in inputs]) if inputs else 0
            # Backward edges
            for y in ret:
                y.set_creator(self)
            self.inputs = inputs
            # Forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y) for y in ret])

        if len(ret) == 1:
            return ret[0]
        else:
            return ret
Exemplo n.º 54
0
def backprop_step(func, target_input_indexes, grad_outputs, grad_inputs):
    """Accumulates gradients of a FunctionNode

    This routine is used by :meth:`chainer.Variable.backward` and
    :func:`chainer.grad`.

    Args:
        func (~chainer.FunctionNode): The function for which gradients are
            accumulated.
        target_input_indexes (tuple of int): Sorted indices of the inputs
            that require gradients. It is guaranteed that this tuple contains
            at least one element.
        grad_outputs (tuple of Variable): Gradients w.r.t. the output
            variables. If the gradient w.r.t. an output variable is not
            given, the corresponding element is ``None``.
        grad_inputs (dict): References to the gradients w.r.t. the input
            variables.

    """
    is_debug = chainer.is_debug()
    if is_debug:
        assert isinstance(target_input_indexes, tuple)
        assert target_input_indexes == tuple(sorted(target_input_indexes))
        assert isinstance(grad_outputs, tuple)
    if func.backward_accumulate.__code__ \
            is not chainer.FunctionNode.backward_accumulate.__code__:
        # backward_accumulate is overridden
        grad_inputs_tuple = tuple([
            _pop_or_none(grad_inputs[func.inputs[i]])
            for i in target_input_indexes
        ])

        # Call backward_accumulate()
        try:
            gxs = func.backward_accumulate(target_input_indexes, grad_outputs,
                                           grad_inputs_tuple)
        except Exception as e:
            _reraise_with_stack(func, e)

    else:  # otherwise, backward should be overridden

        # Call backward()
        try:
            gxs = func.backward(target_input_indexes, grad_outputs)
        except Exception as e:
            _reraise_with_stack(func, e)

        if is_debug:
            for gx in gxs:
                if not (gx is None or isinstance(gx, chainer.Variable)):
                    raise ValueError(
                        func._get_error_message(
                            'type of gradients returned from backward is '
                            'incorrect: '
                            '{} != expected {}'.format(type(gx),
                                                       chainer.Variable)))

        len_gxs = len(gxs)
        if len_gxs == len(func.inputs):
            gxs = tuple([gxs[i] for i in target_input_indexes])
        elif len_gxs != len(target_input_indexes):
            msg = 'number of gradients returned from backward is incorrect: '
            if len(func.inputs) == len(target_input_indexes):
                msg += ('%s != expected %s' % (len_gxs, len(func.inputs)))
            else:
                msg += ('%s != expected %s or %s' %
                        (len_gxs, len(func.inputs), len(target_input_indexes)))
            raise ValueError(func._get_error_message(msg))

    for i, gx in six.moves.zip(target_input_indexes, gxs):
        if gx is not None:
            grad_inputs[func.inputs[i]].append(gx)

            if is_debug:
                node_x = func.inputs[i]
                g_input_list = grad_inputs[node_x]
                if gx.shape != node_x.shape:
                    raise ValueError(
                        func._get_error_message(
                            'shape of gradients returned from backward is '
                            'incorrect: '
                            'input-index={}, actual {} != expected {}'.format(
                                i, gx.shape, node_x.shape)))
                if gx is not None and g_input_list:
                    g_input = g_input_list[0]
                    if gx.shape != g_input.shape:
                        raise ValueError(
                            func._get_error_message(
                                'shape of gradients returned from backward is '
                                'incorrect: '
                                'input-index={}, actual {} != expected {}'.
                                format(i, gx.shape, g_input.shape)))
                    if gx.dtype != g_input.dtype:
                        raise ValueError(
                            func._get_error_message(
                                'dtype of gradients returned from backward is '
                                'incorrect: '
                                'input-index={}, actual {} != expected {}'.
                                format(i, gx.dtype, g_input.dtype)))
    del gxs

    if is_debug:
        # each grad is a list of variables
        # iter_gxs expands it as a sequence of variables.
        def iter_gxs(gxs):
            for gx in gxs:
                for gx_elem in gx:
                    yield gx_elem

        for gx in iter_gxs(grad_inputs.values()):
            if chainer.backend._contains_nan(gx.data):
                raise RuntimeError(
                    'NaN is detected on backward computation of {}'.format(
                        func.label))

    if not func.lazy_grad_sum:
        for gx in grad_inputs.values():
            _reduce(gx)
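
The debug-mode block above is what raises when a NaN appears among the accumulated input gradients. A minimal sketch that triggers it by feeding a NaN output gradient (assuming the 'debug' configuration key):

import numpy as np
import chainer

x = chainer.Variable(np.array([1.0], dtype=np.float32))
y = 2.0 * x
y.grad = np.array([np.nan], dtype=np.float32)   # poison the output gradient
with chainer.using_config('debug', True):
    try:
        y.backward()
    except RuntimeError as e:
        print(e)    # NaN is detected on backward computation of ...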