示例#1
0
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, train, use_cudnn,
        use_bi_direction):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimention of hidden units.
        cx (chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimention of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this functions supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This functions returns a tuple concaining three elements,
            ``hy``, ``cy`` and ``ys``.
            - ``hy`` is an updated hidden states whose shape is same as ``hx``.
            - ``cy`` is an updated cell states whose shape is same as ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds hidden states of the last layer corresponding
              to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
              mini-batch size for time ``t``, and ``N`` is size of hidden
              units. Note that ``B_t`` is the same value as ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
       _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            rnn = NStepBiLSTM(n_layers, states, train=train)
        else:
            rnn = NStepLSTM(n_layers, states, train=train)

        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys

    else:
        direction = 2 if use_bi_direction else 1
        split_size = n_layers * direction
        hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        xs_next = xs
        hy = []
        cy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward LSTM
                # di=1, backward LSTM
                h_list = []
                c_list = []
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                c = cx[layer_idx]
                if di == 0:
                    xs_list = xs_next
                else:
                    xs_list = reversed(xs_next)
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                        c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                    else:
                        h_rest = None
                        c_rest = None

                    if layer != 0:
                        x = dropout.dropout(x, ratio=dropout_ratio,
                                            train=train)
                    lstm_in = linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) + \
                        linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    c_bar, h_bar = lstm.lstm(c, lstm_in)
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                        c = concat.concat([c_bar, c_rest], axis=0)
                    else:
                        h = h_bar
                        c = c_bar
                    h_list.append(h_bar)
                    c_list.append(c_bar)
                return h, c, h_list, c_list

            h, c, h_forward, c_forward = _one_directional_loop(di=0)
            hy.append(h)
            cy.append(c)

            if use_bi_direction:
                # BiLSTM
                h, c, h_backward, c_backward = _one_directional_loop(di=1)
                hy.append(h)
                cy.append(c)

                h_backward.reverse()
                # concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           zip(h_forward, h_backward)]
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        cy = stack.stack(cy)
        return hy, cy, tuple(ys)
示例#2
0
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction,
        **kwargs):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): The number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (~chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of the hidden units.
        cx (~chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of :class:`~chainer.Variable`): Weight matrices.
            ``ws[i]`` represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as they
            are multiplied with input variables, where ``I`` is the size of
            the input and ``N`` is the dimension of the hidden units. All
            other matrices are ``(N, N)``-shaped.
        bs (list of list of :class:`~chainer.Variable`): Bias vectors.
            ``bs[i]`` represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            The shape of each matrix is ``(N,)``.
        xs (list of :class:`~chainer.Variable`):
            A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``. The sequences must be transposed.
            :func:`~chainer.functions.transpose_sequence` can be used to
            transpose a list of :class:`~chainer.Variable`\\ s each
            representing a sequence.
            When sequences has different lengths, they must be
            sorted in descending order of their lengths before transposing.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This functions returns a tuple concaining three elements,
        ``hy``, ``cy`` and ``ys``.

            - ``hy`` is an updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is an updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds hidden states of the last layer corresponding
              to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
              the mini-batch size for time ``t``. Note that ``B_t`` is the same
              value as ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            (xs,)))
        if use_bi_direction:
            rnn = NStepBiLSTM
        else:
            rnn = NStepLSTM

        hy, cy, ys = rnn(n_layers, states, lengths)(*inputs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, cy, ys

    else:
        direction = 2 if use_bi_direction else 1
        split_size = n_layers * direction
        hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        xs_next = xs
        hy = []
        cy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward LSTM
                # di=1, backward LSTM
                h_list = []
                c_list = []
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                c = cx[layer_idx]
                if di == 0:
                    xs_list = xs_next
                else:
                    xs_list = reversed(xs_next)
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                        c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                    else:
                        h_rest = None
                        c_rest = None

                    if layer != 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)
                    lstm_in = linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) + \
                        linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    c_bar, h_bar = lstm.lstm(c, lstm_in)
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                        c = concat.concat([c_bar, c_rest], axis=0)
                    else:
                        h = h_bar
                        c = c_bar
                    h_list.append(h_bar)
                    c_list.append(c_bar)
                return h, c, h_list, c_list

            h, c, h_forward, c_forward = _one_directional_loop(di=0)
            hy.append(h)
            cy.append(c)

            if use_bi_direction:
                # BiLSTM
                h, c, h_backward, c_backward = _one_directional_loop(di=1)
                hy.append(h)
                cy.append(c)

                h_backward.reverse()
                # concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           zip(h_forward, h_backward)]
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        cy = stack.stack(cy)
        return hy, cy, tuple(ys)
示例#3
0
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction,
                    **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at  :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units. Because of bi-direction, the
            first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this function supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-direction GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    if kwargs:
        argument.check_unexpected_kwargs(
            kwargs,
            train='train argument is not supported anymore. '
            'Use chainer.using_config',
            use_cudnn='use_cudnn argument is not supported anymore. '
            'Use chainer.using_config')
        argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)

        w = n_step_rnn.cudnn_rnn_weight_concat(n_layers, states,
                                               use_bi_direction, 'gru', ws, bs)

        if use_bi_direction:
            rnn = NStepBiGRU
        else:
            rnn = NStepGRU

        hy, ys = rnn(n_layers, states, lengths)(hx, w, xs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, ys

    else:
        hy, _, ys = n_step_rnn.n_step_rnn_impl(_gru, n_layers, dropout_ratio,
                                               hx, None, ws, bs, xs,
                                               use_bi_direction)
        return hy, ys
示例#4
0
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction,
                    **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at  :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units. Because of bi-direction, the
            first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this functions supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-direction GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs,
        train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, ), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), xs))
        if use_bi_direction:
            rnn = NStepBiGRU(n_layers, states)
        else:
            rnn = NStepGRU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx,
                                   n_layers * direction,
                                   axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws]
        hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws]
        xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs]
        hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward GRU
                # di=1, backward GRU
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
                    gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
                    U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

                    r = sigmoid.sigmoid(W_r_x + U_r_h)
                    z = sigmoid.sigmoid(W_z_x + U_z_h)
                    h_bar = tanh.tanh(W_x + r * U_x)
                    h_bar = (1 - z) * h_bar + z * h
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward GRU
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward GRU
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [
                    concat.concat([hfi, hbi], axis=1)
                    for (hfi, hbi) in six.moves.zip(h_forward, h_backward)
                ]
                hy.append(h)
            else:
                # Uni-directional GRU
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
示例#5
0
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    use_bi_direction, **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at  :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units. Because of bi-direction, the
            first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this functions supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-direction GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, ),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            xs))
        if use_bi_direction:
            rnn = NStepBiGRU(n_layers, states)
        else:
            rnn = NStepGRU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]

        xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws]
        hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws]
        xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs]
        hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs]

        xs_next = xs
        hy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward GRU
                # di=1, backward GRU
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                h_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
                    gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
                    U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

                    r = sigmoid.sigmoid(W_r_x + U_r_h)
                    z = sigmoid.sigmoid(W_z_x + U_z_h)
                    h_bar = tanh.tanh(W_x + r * U_x)
                    h_bar = (1 - z) * h_bar + z * h
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                    else:
                        h = h_bar
                    h_list.append(h_bar)
                return h, h_list

            # Forward GRU
            h, h_forward = _one_directional_loop(di=0)
            hy.append(h)

            if use_bi_direction:
                # Backward GRU
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           six.moves.zip(h_forward, h_backward)]
                hy.append(h)
            else:
                # Uni-directional GRU
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        return hy, tuple(ys)
示例#6
0
def n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs,
                    use_bi_direction, **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at  :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimension of hidden units. Because of bi-direction, the
            first dimension length is ``2S``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimension of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this function supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-direction GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)

        w = n_step_rnn.cudnn_rnn_weight_concat(
            n_layers, states, use_bi_direction, 'gru', ws, bs)

        if use_bi_direction:
            rnn = NStepBiGRU
        else:
            rnn = NStepGRU

        hy, ys = rnn(n_layers, states, lengths)(hx, w, xs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, ys

    else:
        hy, _, ys = n_step_rnn.n_step_rnn_impl(
            _gru, n_layers, dropout_ratio, hx, None, ws, bs, xs,
            use_bi_direction)
        return hy, ys
示例#7
0
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction,
        **kwargs):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): The number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (~chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of the hidden units.
        cx (~chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of :class:`~chainer.Variable`): Weight matrices.
            ``ws[i]`` represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as they
            are multiplied with input variables, where ``I`` is the size of
            the input and ``N`` is the dimension of the hidden units. All
            other matrices are ``(N, N)``-shaped.
        bs (list of list of :class:`~chainer.Variable`): Bias vectors.
            ``bs[i]`` represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            The shape of each matrix is ``(N,)``.
        xs (list of :class:`~chainer.Variable`):
            A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``. The sequences must be transposed.
            :func:`~chainer.functions.transpose_sequence` can be used to
            transpose a list of :class:`~chainer.Variable`\\ s each
            representing a sequence.
            When sequences has different lengths, they must be
            sorted in descending order of their lengths before transposing.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This function returns a tuple containing three elements,
        ``hy``, ``cy`` and ``ys``.

            - ``hy`` is an updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is an updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds hidden states of the last layer corresponding
              to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
              the mini-batch size for time ``t``. Note that ``B_t`` is the same
              value as ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """
    if kwargs:
        argument.check_unexpected_kwargs(
            kwargs, train='train argument is not supported anymore. '
            'Use chainer.using_config',
            use_cudnn='use_cudnn argument is not supported anymore. '
            'Use chainer.using_config')
        argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)

        w = n_step_rnn.cudnn_rnn_weight_concat(
            n_layers, states, use_bi_direction, 'lstm', ws, bs)

        if use_bi_direction:
            rnn = NStepBiLSTM
        else:
            rnn = NStepLSTM

        hy, cy, ys = rnn(n_layers, states, lengths)(hx, cx, w, xs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, cy, ys

    else:
        return n_step_rnn.n_step_rnn_impl(
            _lstm, n_layers, dropout_ratio, hx, cx, ws, bs, xs,
            use_bi_direction)
示例#8
0
def n_step_lstm_base(
        n_layers, dropout_ratio, hx, cx, ws, bs, xs, use_bi_direction,
        **kwargs):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): The number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (~chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is the number of layers and
            is equal to ``n_layers``, ``B`` is the mini-batch size, and ``N``
            is the dimension of the hidden units.
        cx (~chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of :class:`~chainer.Variable`): Weight matrices.
            ``ws[i]`` represents the weights for the i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` corresponds to :math:`W_j` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` are ``(I, N)``-shape as they
            are multiplied with input variables, where ``I`` is the size of
            the input and ``N`` is the dimension of the hidden units. All
            other matrices are ``(N, N)``-shaped.
        bs (list of list of :class:`~chainer.Variable`): Bias vectors.
            ``bs[i]`` represents the biases for the i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` corresponds to :math:`b_j` in the equation.
            The shape of each matrix is ``(N,)``.
        xs (list of :class:`~chainer.Variable`):
            A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is the
            mini-batch size for time ``t``. The sequences must be transposed.
            :func:`~chainer.functions.transpose_sequence` can be used to
            transpose a list of :class:`~chainer.Variable`\\ s each
            representing a sequence.
            When sequences has different lengths, they must be
            sorted in descending order of their lengths before transposing.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This functions returns a tuple concaining three elements,
        ``hy``, ``cy`` and ``ys``.

            - ``hy`` is an updated hidden states whose shape is the same as
              ``hx``.
            - ``cy`` is an updated cell states whose shape is the same as
              ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds hidden states of the last layer corresponding
              to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
              the mini-batch size for time ``t``. Note that ``B_t`` is the same
              value as ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    argument.check_unexpected_kwargs(
        kwargs, train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    xp = cuda.get_array_module(hx, hx.data)

    if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        lengths = [len(x) for x in xs]
        xs = chainer.functions.concat(xs, axis=0)
        # flatten all input variables
        inputs = tuple(itertools.chain(
            (hx, cx),
            itertools.chain.from_iterable(ws),
            itertools.chain.from_iterable(bs),
            (xs,)))
        if use_bi_direction:
            rnn = NStepBiLSTM
        else:
            rnn = NStepLSTM

        hy, cy, ys = rnn(n_layers, states, lengths)(*inputs)
        sections = numpy.cumsum(lengths[:-1])
        ys = chainer.functions.split_axis(ys, sections, 0)
        return hy, cy, ys

    else:
        return n_step_rnn.n_step_rnn_impl(
            _lstm, n_layers, dropout_ratio, hx, cx, ws, bs, xs,
            use_bi_direction)
示例#9
0
def n_step_lstm_base(n_layers, dropout_ratio, hx, cx, ws, bs, xs, train,
                     use_cudnn, use_bi_direction):
    """Base function for Stack LSTM/BiLSTM functions.

    This function is used at :func:`chainer.functions.n_step_lstm` and
    :func:`chainer.functions.n_step_bilstm`.
    This function's behavior depends on following arguments,
    ``activation`` and ``use_bi_direction``.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimention of hidden units.
        cx (chainer.Variable): Variable holding stacked cell states.
            It has the same shape as ``hx``.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing eight matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 4`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing eight vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimention of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this functions supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        train (bool): If ``True``, this function executes dropout.
        use_cudnn (bool): If ``True``, this function uses cuDNN if available.
        use_bi_direction (bool): If ``True``, this function uses Bi-directional
            LSTM.

    Returns:
        tuple: This functions returns a tuple concaining three elements,
            ``hy``, ``cy`` and ``ys``.
            - ``hy`` is an updated hidden states whose shape is same as ``hx``.
            - ``cy`` is an updated cell states whose shape is same as ``cx``.
            - ``ys`` is a list of :class:`~chainer.Variable` . Each element
              ``ys[t]`` holds hidden states of the last layer corresponding
              to an input ``xs[t]``. Its shape is ``(B_t, N)`` where ``B_t`` is
              mini-batch size for time ``t``, and ``N`` is size of hidden
              units. Note that ``B_t`` is the same value as ``xs[t]``.

    .. seealso::

       :func:`chainer.functions.n_step_lstm`
       :func:`chainer.functions.n_step_bilstm`

    """

    xp = cuda.get_array_module(hx, hx.data)

    if use_cudnn and xp is not numpy and cuda.cudnn_enabled and \
       _cudnn_version >= 5000:
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, cx), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), xs))
        if use_bi_direction:
            rnn = NStepBiLSTM(n_layers, states, train=train)
        else:
            rnn = NStepLSTM(n_layers, states, train=train)

        ret = rnn(*inputs)
        hy, cy = ret[:2]
        ys = ret[2:]
        return hy, cy, ys

    else:
        direction = 2 if use_bi_direction else 1
        split_size = n_layers * direction
        hx = split_axis.split_axis(hx, split_size, axis=0, force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        cx = split_axis.split_axis(cx, split_size, axis=0, force_tuple=True)
        cx = [reshape.reshape(c, c.shape[1:]) for c in cx]

        xws = [_stack_weight([w[2], w[0], w[1], w[3]]) for w in ws]
        hws = [_stack_weight([w[6], w[4], w[5], w[7]]) for w in ws]
        xbs = [_stack_weight([b[2], b[0], b[1], b[3]]) for b in bs]
        hbs = [_stack_weight([b[6], b[4], b[5], b[7]]) for b in bs]

        xs_next = xs
        hy = []
        cy = []
        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward LSTM
                # di=1, backward LSTM
                h_list = []
                c_list = []
                layer_idx = direction * layer + di
                h = hx[layer_idx]
                c = cx[layer_idx]
                if di == 0:
                    xs_list = xs_next
                else:
                    xs_list = reversed(xs_next)
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                        c, c_rest = split_axis.split_axis(c, [batch], axis=0)
                    else:
                        h_rest = None
                        c_rest = None

                    if layer != 0:
                        x = dropout.dropout(x,
                                            ratio=dropout_ratio,
                                            train=train)
                    lstm_in = linear.linear(x, xws[layer_idx],
                                            xbs[layer_idx]) + \
                        linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    c_bar, h_bar = lstm.lstm(c, lstm_in)
                    if h_rest is not None:
                        h = concat.concat([h_bar, h_rest], axis=0)
                        c = concat.concat([c_bar, c_rest], axis=0)
                    else:
                        h = h_bar
                        c = c_bar
                    h_list.append(h_bar)
                    c_list.append(c_bar)
                return h, c, h_list, c_list

            h, c, h_forward, c_forward = _one_directional_loop(di=0)
            hy.append(h)
            cy.append(c)

            if use_bi_direction:
                # BiLSTM
                h, c, h_backward, c_backward = _one_directional_loop(di=1)
                hy.append(h)
                cy.append(c)

                h_backward.reverse()
                # concat
                xs_next = [
                    concat.concat([hfi, hbi], axis=1)
                    for (hfi, hbi) in zip(h_forward, h_backward)
                ]
            else:
                # Uni-directional RNN
                xs_next = h_forward

        ys = xs_next
        hy = stack.stack(hy)
        cy = stack.stack(cy)
        return hy, cy, tuple(ys)
示例#10
0
def n_step_gru_base(n_layers, dropout_ratio, h0, hx, ws, bs, xs, W, B,
                    use_bi_direction, **kwargs):
    """n_step_gru_base(n_layers, dropout_ratio, hx, ws, bs, xs, use_bi_direction)

    Base function for Stack GRU/BiGRU functions.

    This function is used at  :func:`chainer.functions.n_step_bigru` and
    :func:`chainer.functions.n_step_gru`.
    This function's behavior depends on argument ``use_bi_direction``.

    .. warning::

       ``train`` and ``use_cudnn`` arguments are not supported anymore since
       v2.
       Instead, use ``chainer.using_config('train', train)`` and
       ``chainer.using_config('use_cudnn', use_cudnn)`` respectively.
       See :func:`chainer.using_config`.

    Args:
        n_layers(int): Number of layers.
        dropout_ratio(float): Dropout ratio.
        hx (chainer.Variable): Variable holding stacked hidden states.
            Its shape is ``(S, B, N)`` where ``S`` is number of layers and is
            equal to ``n_layers``, ``B`` is mini-batch size, and ``N`` is
            dimention of hidden units.
        ws (list of list of chainer.Variable): Weight matrices. ``ws[i]``
            represents weights for i-th layer.
            Each ``ws[i]`` is a list containing six matrices.
            ``ws[i][j]`` is corresponding with ``W_j`` in the equation.
            Only ``ws[0][j]`` where ``0 <= j < 3`` is ``(I, N)`` shape as they
            are multiplied with input variables. All other matrices has
            ``(N, N)`` shape.
        bs (list of list of chainer.Variable): Bias vectors. ``bs[i]``
            represnents biases for i-th layer.
            Each ``bs[i]`` is a list containing six vectors.
            ``bs[i][j]`` is corresponding with ``b_j`` in the equation.
            Shape of each matrix is ``(N,)`` where ``N`` is dimention of
            hidden units.
        xs (list of chainer.Variable): A list of :class:`~chainer.Variable`
            holding input values. Each element ``xs[t]`` holds input value
            for time ``t``. Its shape is ``(B_t, I)``, where ``B_t`` is
            mini-batch size for time ``t``, and ``I`` is size of input units.
            Note that this functions supports variable length sequences.
            When sequneces has different lengths, sort sequences in descending
            order by length, and transpose the sorted sequence.
            :func:`~chainer.functions.transpose_sequence` transpose a list
            of :func:`~chainer.Variable` holding sequence.
            So ``xs`` needs to satisfy
            ``xs[t].shape[0] >= xs[t + 1].shape[0]``.
        activation (str): Activation function name.
            Please select ``tanh`` or ``relu``.
        use_bi_direction (bool): If ``True``, this function uses
            Bi-direction GRU.

    .. seealso::
       :func:`chainer.functions.n_step_rnn`
       :func:`chainer.functions.n_step_birnn`

    """  # NOQA
    argument.check_unexpected_kwargs(
        kwargs,
        train='train argument is not supported anymore. '
        'Use chainer.using_config',
        use_cudnn='use_cudnn argument is not supported anymore. '
        'Use chainer.using_config')
    argument.assert_kwargs_empty(kwargs)

    #xp = cuda.get_array_module(hx, hx.data)

    if False:
        #if xp is not numpy and chainer.should_use_cudnn('>=auto', 5000):
        states = get_random_state().create_dropout_states(dropout_ratio)
        # flatten all input variables
        inputs = tuple(
            itertools.chain((hx, ), itertools.chain.from_iterable(ws),
                            itertools.chain.from_iterable(bs), xs))
        if use_bi_direction:
            rnn = NStepBiGRU(n_layers, states)
        else:
            rnn = NStepGRU(n_layers, states)

        ret = rnn(*inputs)
        hy, = ret[:1]
        ys = ret[1:]
        return hy, ys

    else:
        direction = 2 if use_bi_direction else 1
        '''
        hx = split_axis.split_axis(hx, n_layers * direction, axis=0,
                                   force_tuple=True)
        hx = [reshape.reshape(h, h.shape[1:]) for h in hx]
        '''

        xws = [concat.concat([w[0], w[1], w[2]], axis=0) for w in ws]
        hws = [concat.concat([w[3], w[4], w[5]], axis=0) for w in ws]
        xbs = [concat.concat([b[0], b[1], b[2]], axis=0) for b in bs]
        hbs = [concat.concat([b[3], b[4], b[5]], axis=0) for b in bs]

        W1, W2, W3 = W
        B1, B2, B3 = B

        #hx:encoderの最上層の隠れ状態ベクトルの集合
        ht = hx
        '''
        print(type(ht), len(ht))
        print(type(ht[0]), len(ht[0]))
        print(type(ht[0][0]), len(ht[0][0]))
        
        print(type(W1), len(W1))
        print(type(W1[0]), len(W1[0]))
        '''
        phi_ht = list(map(lambda x: linear.linear(x, W1, B1), ht))

        xs_next = xs

        for layer in six.moves.range(n_layers):

            def _one_directional_loop(di):
                # di=0, forward GRU
                # di=1, backward GRU
                xs_list = xs_next if di == 0 else reversed(xs_next)
                layer_idx = direction * layer + di
                h = h0[layer_idx]

                # h:d_bar_s_1
                # h_bar:d_s
                '''
                print(len(xs_list))
                print(len(xs_list[0]))
                print(len(xs_list[0][0]))
                '''
                h_list = []
                h_bar_list = []
                c_s_list = []
                z_s_list = []
                for x in xs_list:
                    batch = x.shape[0]
                    if h.shape[0] > batch:
                        h, h_rest = split_axis.split_axis(h, [batch], axis=0)
                    else:
                        h_rest = None

                    if layer > 0:
                        x = dropout.dropout(x, ratio=dropout_ratio)

                    gru_x = linear.linear(x, xws[layer_idx], xbs[layer_idx])
                    gru_h = linear.linear(h, hws[layer_idx], hbs[layer_idx])

                    W_r_x, W_z_x, W_x = split_axis.split_axis(gru_x, 3, axis=1)
                    U_r_h, U_z_h, U_x = split_axis.split_axis(gru_h, 3, axis=1)

                    r = sigmoid.sigmoid(W_r_x + U_r_h)
                    z = sigmoid.sigmoid(W_z_x + U_z_h)
                    h_bar = tanh.tanh(W_x + r * U_x)
                    h_bar = (1 - z) * h_bar + z * h

                    phi_d = linear.linear(h_bar, W2, B2)
                    '''
                    print(type(phi_ht), len(phi_ht))
                    print(type(phi_ht[0]), len(phi_ht[0]))
                    print(type(phi_ht[0][0]), len(phi_ht[0][0]))
                    
                    print(type(phi_d), len(phi_d))
                    print(type(phi_d[0]), len(phi_d[0]), phi_d[0].shape)
                    '''
                    #phi_ht_len = [t.shape[1] for t in phi_ht]
                    #phi_ht_section = np.cumsum(phi_ht_len[:-1])
                    #concat_phi_ht  = F.concat(phi_ht, axis=1)
                    #concat_phi_d = [F.concat([phi_d[i]]*phi_ht_len[i], axis=0) for i in range(batch)]
                    #concat_phi_d = F.concat(concat_phi_d, axis=0)
                    #concat_phi_d = F.concat(F.transpose(phi_d), axis=0)

                    u_st = list(
                        map(
                            lambda x, y: reshape.reshape((linear.linear(
                                x, reshape.reshape(y, (1, len(y))))),
                                                         (len(x), )), phi_ht,
                            phi_d))  #(4)

                    sum_u = list(map(F.sum, u_st))
                    alpha_st = list(
                        map(lambda x, y: x / F.broadcast_to(y, x.shape), u_st,
                            sum_u))  #(3)
                    z_s = list(map(F.argmax, alpha_st))
                    z_s = list(map(lambda x: F.broadcast_to(x, (1, )), z_s))
                    z_s = F.concat(z_s, axis=0)
                    '''
                    print(type(alpha_st),len(alpha_st))
                    print(type(alpha_st[0]),len(alpha_st[0]))
                    
                    print(alpha_st[0].shape)
                    print(ht[0].shape)
                    '''
                    c_s = list(
                        map(
                            lambda x, y: F.sum(F.broadcast_to(
                                reshape.reshape(x,
                                                (x.shape[0], 1)), y.shape) * y,
                                               axis=0), alpha_st, ht))  #(2)

                    c_s_2d = list(
                        map(lambda x: reshape.reshape(x, (1, len(x))), c_s))
                    concat_c_s = F.concat(c_s_2d, axis=0)

                    c_s = list(
                        map(lambda x: F.broadcast_to(x, (1, len(x))), c_s))
                    c_s = F.concat(c_s, axis=0)
                    '''
                    print(type(c_s), len(c_s))
                    print(type(c_s[0]), len(c_s[0]), c_s[0].shape)
                    '''
                    h = F.relu(
                        linear.linear(F.concat([concat_c_s, h_bar], axis=1),
                                      W3, B3))

                    h_list.append(h)
                    h_bar_list.append(h_bar)
                    c_s_list.append(c_s)
                    z_s_list.append(z_s)

                    #単語数の違いを担保
                    if h_rest is not None:
                        h = concat.concat([h, h_rest], axis=0)
                        h_bar = concat.concat([h_bar, h_rest], axis=0)

                return h_list, h_bar_list, c_s_list, z_s_list

            # Forward GRU
            hs, h_bars, c_ss, z_ss = _one_directional_loop(di=0)
            '''
            if use_bi_direction:
                # Backward GRU
                h, h_backward = _one_directional_loop(di=1)
                h_backward.reverse()
                # Concat
                xs_next = [concat.concat([hfi, hbi], axis=1) for (hfi, hbi) in
                           six.moves.zip(h_forward, h_backward)]
                hy.append(h)
            else:
                # Uni-directional GRU
                xs_next = h_forward
             '''

        #ys = xs_next
        #hy = stack.stack(hy)
        #return hy, tuple(ys)
        return hs, h_bars, c_ss, z_ss