Example #1
import numpy as np
import cntk as C
from cntkx.layers import PositionwiseFeedForward  # assumed import path for the layer under test


def test_positionwise_feedforward():
    model_dim = 20
    inner_dim = 30

    a = C.sequence.input_variable(10)
    b = PositionwiseFeedForward(model_dim, inner_dim, 0.1)(a)

    assert b.shape == (model_dim, )

    n1 = np.random.random((3, 10)).astype(np.float32)  # sequence of length 3
    n2 = np.random.random((6, 10)).astype(np.float32)  # sequence of length 6

    b.eval({a: [n1, n2]})  # forward pass on a batch of two variable-length sequences
Example #2
def TransformerDecoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = True, max_seq_len: int = None,
                            mha1_key_init=default_override_or(C.glorot_uniform()), mha1_key_init_bias=default_override_or(0),
                            mha1_query_init=default_override_or(C.glorot_uniform()), mha1_query_init_bias=default_override_or(0),
                            mha1_value_init=default_override_or(C.glorot_uniform()), mha1_value_init_bias=default_override_or(0),
                            mha1_init=default_override_or(C.glorot_uniform()), mha1_init_bias=default_override_or(0),
                            mha1_initial_scale=1, mha1_initial_bias=0,
                            mha2_key_init=default_override_or(C.glorot_uniform()), mha2_key_init_bias=default_override_or(0),
                            mha2_query_init=default_override_or(C.glorot_uniform()), mha2_query_init_bias=default_override_or(0),
                            mha2_value_init=default_override_or(C.glorot_uniform()), mha2_value_init_bias=default_override_or(0),
                            mha2_init=default_override_or(C.glorot_uniform()), mha2_init_bias=default_override_or(0),
                            mha2_initial_scale=1, mha2_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()),
                            intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0):
    """ Decoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consists of two multi-head attention blocks followed by a position-wise feed-forward layer, with residual connections and layer normalization

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): hidden dimension of the final multi-head attention output
        intermediate_dim (int): hidden/intermediate dimension within the position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward layer
        obey_sequence_order (bool, defaults to True): do not let attention peek into future values
        max_seq_len (int): maximum possible sequence length, used to ensure that sequence order is obeyed
        mha1_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha1_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        mha2_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha2_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block1 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=obey_sequence_order, max_seq_len=max_seq_len,
                                         key_init=mha1_key_init, key_init_bias=mha1_key_init_bias,
                                         query_init=mha1_query_init, query_init_bias=mha1_query_init_bias,
                                         value_init=mha1_value_init, value_init_bias=mha1_value_init_bias,
                                         init=mha1_init, init_bias=mha1_init_bias,
                                         initial_scale=mha1_initial_scale, initial_bias=mha1_initial_bias)
    
    mha_block2 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=False, max_seq_len=None,
                                         key_init=mha2_key_init, key_init_bias=mha2_key_init_bias,
                                         query_init=mha2_query_init, query_init_bias=mha2_query_init_bias,
                                         value_init=mha2_value_init, value_init_bias=mha2_value_init_bias,
                                         init=mha2_init, init_bias=mha2_init_bias,
                                         initial_scale=mha2_initial_scale, initial_bias=mha2_initial_bias)

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias)

    layernorm = LayerNormalization(initial_scale, initial_bias)

    @C.Function
    def block(encoded, x):
        inner = mha_block1(x, x, x)  # masked self-attention over the decoder input
        inner = mha_block2(inner, encoded, encoded)  # attention over the encoder output
        output = layernorm(ResNetBlock(feed_forward)(inner))  # position-wise feed-forward with residual connection and layer norm
        return output

    return block
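
For illustration, a minimal usage sketch of the decoder block above. The import path, the hyper-parameter values, and the use of a single shared sequence axis with equal encoder/decoder lengths are assumptions, not part of the original example:

import numpy as np
import cntk as C
from cntkx.layers import TransformerDecoderBlock  # assumed import path

model_dim = 20

encoded = C.sequence.input_variable(model_dim)  # encoder output sequence
target = C.sequence.input_variable(model_dim)   # decoder input sequence

decoder_block = TransformerDecoderBlock(num_heads=2, model_dim=model_dim, intermediate_dim=40,
                                        dropout_rate=0.1, obey_sequence_order=True, max_seq_len=100)
decoded = decoder_block(encoded, target)  # block(encoded, x): self-attention on target, then attention over encoded

assert decoded.shape == (model_dim, )

e = np.random.random((6, model_dim)).astype(np.float32)
t = np.random.random((6, model_dim)).astype(np.float32)
decoded.eval({encoded: [e], target: [t]})
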
Example #3
def TransformerEncoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            mha_init=default_override_or(C.glorot_uniform()), mha_init_bias=default_override_or(0),
                            mha_initial_scale=1, mha_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()), intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Encoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consists of one multi-head attention block followed by a position-wise feed-forward layer, with a residual connection and layer normalization

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): hidden dimension of the final multi-head attention output
        intermediate_dim (int): hidden/intermediate dimension within the position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward layer
        obey_sequence_order (bool, defaults to None): do not let attention peek into future values
        max_seq_len (int): maximum possible sequence length, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block = MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                        key_init=key_init, key_init_bias=key_init_bias,
                                        query_init=query_init, query_init_bias=query_init_bias,
                                        value_init=value_init, value_init_bias=value_init_bias,
                                        init=mha_init, init_bias=mha_init_bias,
                                        initial_scale=mha_initial_scale, initial_bias=mha_initial_bias,
                                        name='SelfAttention')

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias, name='PWFF')

    layernorm = LayerNormalization(initial_scale, initial_bias, name='LayerNorm')

    @C.Function
    def block(x):
        self_attended = mha_block(x, C.alias(x), C.alias(x))  # self-attention: query, key and value are all x
        hidden = feed_forward(self_attended)
        output = layernorm(hidden + self_attended)  # residual connection followed by layer norm
        return output

    return _inject_name(block, name)  # consider change to BlockFunction
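
Likewise, a minimal usage sketch of the encoder block; the import path and the hyper-parameter values are assumptions:

import numpy as np
import cntk as C
from cntkx.layers import TransformerEncoderBlock  # assumed import path

model_dim = 16

a = C.sequence.input_variable(model_dim)
encoder_block = TransformerEncoderBlock(num_heads=2, model_dim=model_dim, intermediate_dim=32,
                                        dropout_rate=0.1)
b = encoder_block(a)  # block(x): self-attention, position-wise feed-forward, residual + layer norm

assert b.shape == (model_dim, )

n = np.random.random((5, model_dim)).astype(np.float32)
b.eval({a: [n]})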