def multihead_attention_test_helper(self_attention, cache):
            paddle.seed(2020)
            paddle.framework.random._manual_program_seed(2020)
            # self_attention|cross_attention, cache|No cache
            with fluid.dygraph.guard(fluid.CPUPlace()):

                # generate params for multi_head_attention
                batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params(
                    "attn", self_attention)
                for attn_mask_type in ['int64', 'float64']:
                    query, key, value, attn_mask, cache_dict = generate_query_key_value_cache(
                        self_attention, batch_size, num_heads, query_length,
                        embed_dim, attn_mask_type, key_length, value_length,
                        kdim, vdim, cache)
                    if cache and self_attention:
                        attn_mask = np.concatenate((attn_mask, attn_mask),
                                                   axis=3)
                    need_weight, param_attr, bias_attr = False, None, None
                    # call paddle's function
                    multi_head_attn = MultiHeadAttention(
                        embed_dim, num_heads, attn_dropout, kdim, vdim,
                        need_weight, param_attr, bias_attr)
                    # construct cache object
                    cache_obj = None
                    if cache_dict:
                        if 'k' in cache_dict and 'v' in cache_dict:
                            cache_obj = multi_head_attn.Cache(
                                paddle.to_tensor(cache_dict['k']),
                                paddle.to_tensor(cache_dict['v']))
                        elif 'static_k' in cache_dict and 'static_v' in cache_dict:
                            cache_obj = multi_head_attn.StaticCache(
                                paddle.to_tensor(cache_dict['static_k']),
                                paddle.to_tensor(cache_dict['static_v']))
                    if attn_mask is not None:
                        attn_output = multi_head_attn(
                            paddle.to_tensor(query), paddle.to_tensor(key),
                            paddle.to_tensor(value),
                            paddle.to_tensor(attn_mask), cache_obj)
                    else:
                        attn_output = multi_head_attn(paddle.to_tensor(query),
                                                      paddle.to_tensor(key),
                                                      paddle.to_tensor(value),
                                                      attn_mask, cache_obj)
                    attn_output = attn_output[0] if cache_dict else attn_output

                    # implementation by numpy
                    # compute q, k, v
                    q, k, v, _ = prepare_qkv(query, key, value, num_heads,
                                             embed_dim, self_attention,
                                             multi_head_attn, cache_dict)
                    # scale dot product attention
                    attn_heads = scaled_dot_product_attention(
                        q, k, v, embed_dim // num_heads, attn_mask,
                        multi_head_attn)
                    out_proj_weight = multi_head_attn.out_proj.weight.numpy()
                    reference = fc(attn_heads, out_proj_weight)

                    np.testing.assert_allclose(attn_output.numpy(),
                                               reference,
                                               atol=1e-6)
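
The numpy helpers referenced above (generate_basic_params, generate_query_key_value_cache, prepare_qkv, scaled_dot_product_attention and fc) are defined elsewhere in the test module. A minimal sketch of the two reference helpers used for the final comparison, assuming an additive float mask and dropout disabled, could look like this:

import numpy as np


def fc(x, weight):
    # Plain matmul against the output-projection weight; x is assumed to be
    # (batch, seq_len, embed_dim) and weight (embed_dim, embed_dim).
    return np.matmul(x, weight)


def softmax(x):
    # Numerically stable softmax over the last axis.
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)


def scaled_dot_product_attention(q, k, v, head_dim, attn_mask, multi_head_attn):
    # q, k, v: (batch, num_heads, seq_len, head_dim); multi_head_attn is kept
    # only for signature compatibility, and dropout is skipped in this sketch.
    scores = np.matmul(q, k.transpose((0, 1, 3, 2))) / np.sqrt(head_dim)
    if attn_mask is not None:
        scores = scores + attn_mask
    out = np.matmul(softmax(scores), v)
    batch_size, num_heads, seq_len, _ = out.shape
    # Merge heads back into (batch, seq_len, num_heads * head_dim).
    return out.transpose((0, 2, 1, 3)).reshape(
        (batch_size, seq_len, num_heads * head_dim))
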
Example #2
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1,
                 activation="relu", normalize_before=False,
                 weight_attr=None, bias_attr=None):
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.nhead = nhead
        self.self_attn = MultiHeadAttention(d_model, nhead, dropout=dropout,
                                            weight_attr=weight_attrs[0], bias_attr=bias_attrs[0])
        self.multihead_attn = MultiHeadAttention(d_model, nhead, dropout=dropout,
                                             weight_attr=weight_attrs[1], bias_attr=bias_attrs[1])
        self.linear1 = Linear(d_model, dim_feedforward,
                              weight_attrs[2], bias_attr=bias_attrs[2])
        self.dropout = Dropout(dropout, mode="upscale_in_train")
        self.linear2 = Linear(dim_feedforward, d_model,
                              weight_attrs[2], bias_attr=bias_attrs[2])
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
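
For reference, a minimal usage sketch of a decoder layer built with this constructor (shown here with paddle.nn.TransformerDecoderLayer; the tensor shapes are illustrative assumptions):

import paddle
from paddle.nn import TransformerDecoderLayer

# Illustrative shapes: batch=2, target length=4, source length=6, d_model=8.
tgt = paddle.rand((2, 4, 8))
memory = paddle.rand((2, 6, 8))

decoder_layer = TransformerDecoderLayer(d_model=8, nhead=2, dim_feedforward=32)
# tgt_mask and memory_mask are optional; None attends to every position.
out = decoder_layer(tgt, memory)
print(out.shape)  # [2, 4, 8]
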
Example #3
    def test_transformer_encoder_layer_attr_1(self):
        with fluid.dygraph.guard(fluid.CPUPlace()):
            paddle.seed(2020)
            paddle.framework.random._manual_program_seed(2020)

            ffn_fc1_act = "relu"
            # 1.generate basic params
            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
                mode="encoder_layer")
            # 2.generate input for encoder
            src = np.random.rand(batch_size, sequence_length,
                                 d_model).astype("float32")
            src_mask = np.zeros((batch_size, n_head, sequence_length,
                                 sequence_length)).astype("float32")
            src_mask[0][0][0][0] = -np.inf

            for cache in [True, False]:
                # 3.compute encoder output with paddle
                encoder_layer = TransformerEncoderLayer(
                    d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
                    attn_dropout, act_dropout)
                cache_objs = None
                if cache:
                    cache_objs = encoder_layer.gen_cache(paddle.to_tensor(src))

                encoder_output = encoder_layer(paddle.to_tensor(src),
                                               paddle.to_tensor(src_mask),
                                               cache_objs)
                encoder_output = (encoder_output[0].numpy()
                                  if cache else encoder_output.numpy())

                # 4.compute reference output with numpy
                residual = src
                # paddle self attention
                self_attn = MultiHeadAttention(d_model,
                                               n_head,
                                               dropout=attn_dropout)
                attn_output = self_attn(paddle.to_tensor(src),
                                        paddle.to_tensor(src),
                                        paddle.to_tensor(src),
                                        paddle.to_tensor(src_mask), cache_objs)
                attn_output = (attn_output[0].numpy()
                               if cache else attn_output.numpy())

                src = attn_output + residual
                src_norm = layer_norm(src, d_model, encoder_layer.norm1)
                residual = src_norm

                ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act)
                src = residual + ffn_output
                src = layer_norm(src, d_model, encoder_layer.norm2)

                np.testing.assert_allclose(encoder_output,
                                           src,
                                           rtol=1e-5,
                                           atol=1e-6)
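
The layer_norm and ffn reference helpers used in the numpy branch above are also defined elsewhere in the test module; a possible sketch, assuming Paddle's (in_features, out_features) Linear weight layout and dropout disabled, is:

import numpy as np


def layer_norm(x, d_model, norm_layer, epsilon=1e-5):
    # Reference LayerNorm over the last axis, reusing the Paddle layer's
    # learned scale and bias; d_model is kept only to match the call sites.
    weight = norm_layer.weight.numpy()
    bias = norm_layer.bias.numpy()
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mean) / np.sqrt(var + epsilon) * weight + bias


def ffn(x, layer, activation="relu"):
    # Position-wise feed-forward: linear1 -> activation -> linear2; dropout
    # between the two linears is omitted in this sketch.
    w1, b1 = layer.linear1.weight.numpy(), layer.linear1.bias.numpy()
    w2, b2 = layer.linear2.weight.numpy(), layer.linear2.bias.numpy()
    hidden = np.matmul(x, w1) + b1
    if activation == "relu":
        hidden = np.maximum(hidden, 0)
    return np.matmul(hidden, w2) + b2
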
Example #4
    def test_transformer_decoder_layer(self):
        with fluid.dygraph.guard(fluid.CPUPlace()):
            paddle.seed(2020)
            paddle.framework.random._manual_program_seed(2020)
            activation = "relu"
            normalize_before = False
            batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params(
                mode="decoder_layer")
            tgt = np.random.rand(batch_size, target_length,
                                 d_model).astype("float32")
            memory = np.random.rand(batch_size, source_length,
                                    d_model).astype("float32")
            tgt_mask = np.zeros((batch_size, n_head, target_length,
                                 target_length)).astype("float32")
            tgt_mask[0][0][0][0] = -1e9
            memory_mask = np.zeros((batch_size, n_head, target_length,
                                    source_length)).astype("float32")
            memory_mask[0][0][0][0] = -1e9
            for cache in [True, False]:
                self_attn = MultiHeadAttention(d_model,
                                               n_head,
                                               dropout=attn_dropout)
                cross_attn = MultiHeadAttention(d_model,
                                                n_head,
                                                dropout=attn_dropout)

                # paddle decoderlayer:
                decoder_layer = TransformerDecoderLayer(
                    d_model, n_head, dim_feedforward, dropout, activation,
                    attn_dropout, act_dropout, normalize_before)
                cache_objs = None
                if cache:
                    cache_objs = decoder_layer.gen_cache(
                        paddle.to_tensor(memory))

                decoder_output = decoder_layer(paddle.to_tensor(tgt),
                                               paddle.to_tensor(memory),
                                               paddle.to_tensor(tgt_mask),
                                               paddle.to_tensor(memory_mask),
                                               cache_objs)

                decoder_output = (decoder_output[0].numpy()
                                  if cache else decoder_output.numpy())

                # numpy:
                residual = tgt
                # self-attn
                self_attn_cache = (cache_objs[0]
                                   if cache_objs is not None else None)
                tgt = self_attn(paddle.to_tensor(tgt),
                                paddle.to_tensor(tgt),
                                paddle.to_tensor(tgt),
                                paddle.to_tensor(tgt_mask), self_attn_cache)

                tgt = tgt[0].numpy() if cache else tgt.numpy()

                tgt = residual + tgt
                # postprocess
                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1)
                residual = tgt_norm
                # cross-attn
                cross_attn_cache = (cache_objs[1]
                                    if cache_objs is not None else None)
                tgt = cross_attn(paddle.to_tensor(tgt_norm),
                                 paddle.to_tensor(memory),
                                 paddle.to_tensor(memory),
                                 paddle.to_tensor(memory_mask),
                                 cross_attn_cache)
                tgt = tgt[0].numpy() if cache else tgt.numpy()

                # postprocess
                tgt = tgt + residual
                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2)
                residual = tgt_norm
                # FFN
                ffn_output = ffn(tgt_norm, decoder_layer, activation)
                # post process
                tgt = residual + ffn_output
                tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3)

                np.testing.assert_allclose(decoder_output,
                                           tgt_norm,
                                           rtol=1e-5,
                                           atol=1e-6)
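
The cache path exercised above is what enables step-by-step decoding. A hedged sketch of that usage pattern with paddle.nn.TransformerDecoderLayer (shapes and the random "token embeddings" are illustrative):

import paddle
from paddle.nn import TransformerDecoderLayer

# Illustrative shapes: batch=2, source length=6, d_model=8.
memory = paddle.rand((2, 6, 8))
decoder_layer = TransformerDecoderLayer(d_model=8, nhead=2, dim_feedforward=32)

# gen_cache returns (incremental self-attention cache, static cross-attention
# cache), matching cache_objs[0] / cache_objs[1] in the test above.
cache = decoder_layer.gen_cache(memory)
for step in range(3):
    # One target position per step; in a real decoder this would be the
    # embedding of the token generated at the previous step.
    tgt_step = paddle.rand((2, 1, 8))
    # With a cache, forward returns (output, updated_cache).
    out, cache = decoder_layer(tgt_step, memory, None, None, cache)
print(out.shape)  # [2, 1, 8]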