Example #1
    def check_backprop_step(self, gxs):
        flag_none = gxs[0] is None

        x1 = chainer.Variable(self.x1)
        x2 = chainer.Variable(self.x2)
        self.f.inputs = (x1.node, x2.node)
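        # Wrap each given gradient in a one-element list (or an empty list for
        # None) so that backprop_step can append new gradient contributions.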
        gxrefs = [[gx] if gx is not None else [] for gx in gxs]
        grad_outputs = (self.gy1, self.gy2)
        grad_inputs = dict(zip(self.f.inputs, gxrefs))
        _backprop_utils.backprop_step(self.f, (0, 1), grad_outputs,
                                      grad_inputs, True)
        if not chainer.configuration.config.lazy_grad_sum:
            # assert eager grad sum
            for gxref in gxrefs:
                self.assertLessEqual(len(gxref), 1)
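        # _reduce sums the collected gradient parts into a single variable,
        # or returns None if the list is empty.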
        gx1 = _backprop_utils._reduce(gxrefs[0])
        gx2 = _backprop_utils._reduce(gxrefs[1])
        if flag_none:
            numpy.testing.assert_array_equal(cuda.to_cpu(gx1.data),
                                             cuda.to_cpu(self.gx1.data))
            self.assertIsNone(gx2)
        else:
            numpy.testing.assert_array_equal(cuda.to_cpu(gx1.data),
                                             cuda.to_cpu(self.gx1_accum.data))
            numpy.testing.assert_array_equal(cuda.to_cpu(gx2.data),
                                             cuda.to_cpu(self.gx2_orig.data))
Example #2
    def check_backprop_step(self, gxs):
        flag_none = gxs[0] is None

        x1 = chainer.Variable(self.x1)
        x2 = chainer.Variable(self.x2)
        self.f.inputs = (x1.node, x2.node)
        gxrefs = [[gx] if gx is not None else [] for gx in gxs]
        grad_outputs = (self.gy1, self.gy2)
        grad_inputs = dict(zip(self.f.inputs, gxrefs))
        _backprop_utils.backprop_step(
            self.f, (0, 1), grad_outputs, grad_inputs, True)
        if not chainer.configuration.config.lazy_grad_sum:
            # assert eager grad sum
            for gxref in gxrefs:
                self.assertLessEqual(len(gxref), 1)
        gx1 = _backprop_utils._reduce(gxrefs[0])
        gx2 = _backprop_utils._reduce(gxrefs[1])
        if flag_none:
            numpy.testing.assert_array_equal(cuda.to_cpu(gx1.data),
                                             cuda.to_cpu(self.gx1.data))
            self.assertIsNone(gx2)
        else:
            numpy.testing.assert_array_equal(cuda.to_cpu(gx1.data),
                                             cuda.to_cpu(self.gx1_accum.data))
            numpy.testing.assert_array_equal(cuda.to_cpu(gx2.data),
                                             cuda.to_cpu(self.gx2_orig.data))
Example #3
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
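        # GradTable keeps, for each variable node, a list of gradient
        # contributions; they are summed only when the node is popped.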
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            if self.data.ndim != 0:
                warnings.warn(
                    'Treating a scalar as a variable with only one element'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.',
                    DeprecationWarning)
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()

        while cand_funcs:
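            # Functions are popped in order of decreasing rank, i.e. from the
            # output side of the graph toward the inputs.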
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple([
                i for i, x in enumerate(inputs) if x.requires_grad
            ])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*(in_data + out_grad_data)).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x) for x in set(target_inputs)}
            in_grad = collections.OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad)

            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                # each grad is a list of variables
                # iter_gxs expands it as a sequence of variables.
                def iter_gxs(gxs):
                    for gx in gxs:
                        for gx_elem in gx:
                            yield gx_elem

                for gx in iter_gxs(in_grad.values()):
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        cuda.get_device_from_array(gx_data).use()
                        if cuda.get_array_module(gx_data).isnan(gx_data).any():
                            raise RuntimeError(
                                'NaN is detected on backward computation of '
                                '{}'.format(func.label))

            for y, gy in six.moves.zip(outputs, out_grad):
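                # Keep the gradient on intermediate output variables only when
                # retain_grad is requested; otherwise release it.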
                if y is not None and y is not self.node:
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = gy if retain_grad else None

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, gx_elem.data)

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)

            del in_grad  # to reduce memory usage
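            # Restore the device that was current when backward started.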
            if initial_device is not None:
                initial_device.use()

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
Example #4
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return

        cand_funcs = []
        seen_set = set()
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize error by 1, if this is a loss variable
        if self.array.size == 1 and self._grad_var is None:
            if self.array.ndim != 0:
                warnings.warn(
                    'Treating a scalar as a variable with only one element'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.',
                    DeprecationWarning)
            with cuda.get_device_from_array(self.array) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.array)
                else:
                    self.grad = cuda.cupy.ones_like(self.array)
            if loss_scale is not None:
                self.grad *= loss_scale
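        # Seed the gradient table with this variable's current gradient.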
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple([
                i for i, x in enumerate(inputs) if x.requires_grad
            ])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_array = tuple(
                [None if g is None else g.array for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            with cuda.get_device_from_array(*(in_data + out_grad_array)):
                for hook in hooks:
                    hook.backward_preprocess(func, in_data, out_grad_array)

                # Collect the current input gradients.
                target_inputs = [inputs[i] for i in target_input_indexes]
                # Keep the order for the portability, rather than
                # in_grad = {x: grads.get_as_list(x)
                #            for x in set(target_inputs)}
                in_grad = collections.OrderedDict()
                for x in target_inputs:
                    if x not in in_grad:
                        in_grad[x] = grads.get_as_list(x)
                        # to reduce memory usage
                        x._set_grad_var_if_available(None)

                _backprop_utils.backprop_step(
                    func, target_input_indexes, out_grad, in_grad)

                for hook in hooks:
                    hook.backward_postprocess(func, in_data, out_grad_array)

            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None and y is not self.node:
                    y._set_grad_var_if_available(
                        gy if retain_grad else None)
            del gy, out_grad  # to reduce memory usage

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, gx_elem.array)
                del gx_elem  # to reduce memory usage

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)
            del gx, in_grad  # to reduce memory usage

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
Example #5
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale):
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)
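    # Gradients w.r.t. the requested input nodes are collected here and
    # returned to the caller.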
    ret_dict = {}

    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        ys = [y() for y in func.outputs]  # access via weak ref
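        # grads.pop returns the gradient accumulated so far for each output
        # (it may be None if no gradient has reached that output yet).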
        gys = tuple([grads.pop(y) for y in ys])

        for node, gy in six.moves.zip(ys, gys):
            if node is not None:
                if node in input_nodes:
                    ret_dict[node] = gy

                if retain_grad:
                    y = node.get_variable_or_none()
                    if y is not None:
                        y.grad_var = gy
                        y._loss_scale = loss_scale

        # Collect the gradients w.r.t. the inputs
        input_indexes = []
        x_grads = collections.OrderedDict()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x not in x_grads:
                x_grads[x] = grads.get_as_list(x)
        if not input_indexes:
            continue
        input_indexes = tuple(input_indexes)

        # Do backward

        # Call pre-backward hooks
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        in_data = tuple([x.data for x in func.inputs])
        out_grad_data = tuple([None if g is None else g.data for g in gys])

        with cuda.get_device_from_array(*in_data):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            _backprop_utils.backprop_step(func, input_indexes, gys, x_grads)

            # Call post-backward hooks
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

        # Update grads
        for node, g in x_grads.items():
            if not g:  # gradient == None
                continue

            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)

    for x in input_nodes:
        if x not in ret_dict:
            ret_dict[x] = grads.pop(x)
    return ret_dict
Example #6
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return

        # fix py2 memory leak
        OrderedDict = chainer.utils._collections.OrderedDict

        cand_funcs = []
        seen_set = set()
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize error by 1, if this is a loss variable
        if self.array.size == 1 and self._grad_var is None:
            if self.array.ndim != 0:
                warnings.warn(
                    'Treating a variable with only one element as a scalar'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.', DeprecationWarning)
            with cuda.get_device_from_array(self.array) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.array)
                else:
                    self.grad = cuda.cupy.ones_like(self.array)
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()
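        # Leaf nodes (no creator) are collected here; their gradients are
        # written back to the corresponding variables after the main loop.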

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple(
                [i for i, x in enumerate(inputs) if x.requires_grad])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_array = tuple(
                [None if g is None else g.array for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            with cuda.get_device_from_array(*(in_data + out_grad_array)):
                for hook in hooks:
                    hook.backward_preprocess(func, in_data, out_grad_array)

                # Collect the current input gradients.
                target_inputs = [inputs[i] for i in target_input_indexes]
                # Keep the order for the portability, rather than
                # in_grad = {x: grads.get_as_list(x)
                #            for x in set(target_inputs)}
                in_grad = OrderedDict()
                for x in target_inputs:
                    if x not in in_grad:
                        in_grad[x] = grads.get_as_list(x)
                        # to reduce memory usage
                        x._set_grad_var_if_available(None)

                _backprop_utils.backprop_step(func, target_input_indexes,
                                              out_grad, in_grad)

                for hook in hooks:
                    hook.backward_postprocess(func, in_data, out_grad_array)

            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None and y is not self.node:
                    y._set_grad_var_if_available(gy if retain_grad else None)
            del gy, out_grad  # to reduce memory usage

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, True, gx_elem, True)
                del gx_elem  # to reduce memory usage

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)
            del gx, in_grad  # to reduce memory usage

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
Example #7
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            if self.data.ndim != 0:
                warnings.warn(
                    'Treating a scalar as a variable with only one element'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.', DeprecationWarning)
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple(
                [i for i, x in enumerate(inputs) if x.requires_grad])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*(in_data + out_grad_data)).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x) for x in set(target_inputs)}
            in_grad = collections.OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(func, target_input_indexes, out_grad,
                                          in_grad)

            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                # each grad is a list of variables
                # iter_gxs expands it as a sequence of variables.
                def iter_gxs(gxs):
                    for gx in gxs:
                        for gx_elem in gx:
                            yield gx_elem

                for gx in iter_gxs(in_grad.values()):
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        cuda.get_device_from_array(gx_data).use()
                        if cuda.get_array_module(gx_data).isnan(gx_data).any():
                            raise RuntimeError(
                                'NaN is detected on backward computation of '
                                '{}'.format(func.label))

            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None and y is not self.node:
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = gy if retain_grad else None

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, gx_elem.data)

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)

            del in_grad  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
Example #8
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale):
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)
    ret_dict = {}

    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        ys = [y() for y in func.outputs]  # access via weak ref
        gys = tuple([grads.pop(y) for y in ys])

        for node, gy in six.moves.zip(ys, gys):
            if node is not None:
                if node in input_nodes:
                    ret_dict[node] = gy

                if retain_grad:
                    y = node.get_variable_or_none()
                    if y is not None:
                        y.grad_var = gy
                        y._loss_scale = loss_scale

        # Collect the gradients w.r.t. the inputs
        input_indexes = []
        x_grads = collections.OrderedDict()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x not in x_grads:
                x_grads[x] = grads.get_as_list(x)
        if not input_indexes:
            continue
        input_indexes = tuple(input_indexes)

        # Do backward

        # Call pre-backward hooks
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        in_data = tuple([x.data for x in func.inputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in gys])

        with cuda.get_device_from_array(*in_data):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            _backprop_utils.backprop_step(func, input_indexes, gys, x_grads)

            # Call post-backward hooks
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

        # Update grads
        for node, g in x_grads.items():
            if not g:  # gradient == None
                continue

            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)

    for x in input_nodes:
        if x not in ret_dict:
            ret_dict[x] = grads.pop(x)
    return ret_dict
Example #9
def _backprop_to_all(outputs, retain_grad, loss_scale):
    """Backprop to all input variables

    Args:
        outputs (list of tuple): each tuple is (y_node, y_grad_var).
            y_grad_var should not be None.
        retain_grad (bool): see docstring of Variable.backward
        loss_scale (float): see docstring of Variable.backward

    """
    OrderedDict = chainer.utils._collections.OrderedDict  # fix py2 memory leak

    cand_funcs = []
    seen_set = set()

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    grads = _backprop_utils.GradTable(accumulate_grad_inputs=True)

    leaf_nodes = set()

    for y, gy in outputs:
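        # Accumulate the provided output gradient and schedule the creator
        # function of each output for the backward pass.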
        grads.accumulate(y, gy)

        func = y.creator_node
        if func is None:  # leaf
            leaf_nodes.add(y)
        else:
            add_cand(func)

    # Fix F812 (Python 2)
    y = None
    del y

    is_debug = chainer.is_debug()
    base_hooks = chainer.get_function_hooks().values()
    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y)
                          if y is not None and y.creator_node is not None
                          else None
                          for y in outputs])
        if not target_input_indexes:
            continue

        in_data = [x.data for x in inputs]
        out_grad_array = [None if g is None else g.raw_array for g in out_grad]
        if func._n_local_function_hooks != 0:
            local_hooks = collections.OrderedDict(chainer.get_function_hooks())
            local_hooks.update(func.local_function_hooks)
            hooks = local_hooks.values()  # avoid six for performance
        else:
            hooks = base_hooks

        with chainer.using_device(
                backend.get_device_from_array(*(in_data + out_grad_array))):
            for hook in hooks:
                hook.backward_preprocess(
                    func, tuple(in_data), tuple(out_grad_array))

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad, is_debug)

            for hook in hooks:
                hook.backward_postprocess(
                    func, tuple(in_data), tuple(out_grad_array))

        if retain_grad:
            # The gradients of the outputs of `func` are final. Store them if
            # retain_grad=True.
            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None:
                    y._set_grad_var_if_available(gy)
            del gy  # to reduce memory usage
        del out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                if gx_elem is not None:
                    chainer.variable._check_grad_type(
                        func, x, True, gx_elem.raw_array)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
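        # Write the final accumulated gradient back to each leaf variable
        # that is still alive.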
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._set_grad_var_without_check(gx)
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()