def multihead_attention_test_helper(self_attention, cache):
    paddle.seed(2020)
    paddle.framework.random._manual_program_seed(2020)
    # self_attention|cross_attention, cache|no cache
    with fluid.dygraph.guard(fluid.CPUPlace()):
        # generate params for multi_head_attention
        batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params(
            "attn", self_attention)
        for attn_mask_type in ['int64', 'float64']:
            query, key, value, attn_mask, cache_dict = generate_query_key_value_cache(
                self_attention, batch_size, num_heads, query_length,
                embed_dim, attn_mask_type, key_length, value_length, kdim,
                vdim, cache)
            if cache and self_attention:
                attn_mask = np.concatenate((attn_mask, attn_mask), axis=3)
            need_weight, param_attr, bias_attr = False, None, None
            # call paddle's function
            multi_head_attn = MultiHeadAttention(embed_dim, num_heads,
                                                 attn_dropout, kdim, vdim,
                                                 need_weight, param_attr,
                                                 bias_attr)
            # construct cache object; check both keys explicitly
            cache_obj = None
            if cache_dict:
                if 'k' in cache_dict and 'v' in cache_dict:
                    cache_obj = multi_head_attn.Cache(
                        paddle.to_tensor(cache_dict['k']),
                        paddle.to_tensor(cache_dict['v']))
                elif 'static_k' in cache_dict and 'static_v' in cache_dict:
                    cache_obj = multi_head_attn.StaticCache(
                        paddle.to_tensor(cache_dict['static_k']),
                        paddle.to_tensor(cache_dict['static_v']))
            if attn_mask is not None:
                attn_output = multi_head_attn(paddle.to_tensor(query),
                                              paddle.to_tensor(key),
                                              paddle.to_tensor(value),
                                              paddle.to_tensor(attn_mask),
                                              cache_obj)
            else:
                attn_output = multi_head_attn(paddle.to_tensor(query),
                                              paddle.to_tensor(key),
                                              paddle.to_tensor(value),
                                              attn_mask, cache_obj)
            attn_output = attn_output[0] if cache_dict else attn_output

            # reference implementation in numpy
            # compute q, k, v
            q, k, v, _ = prepare_qkv(query, key, value, num_heads, embed_dim,
                                     self_attention, multi_head_attn,
                                     cache_dict)
            # scaled dot-product attention
            attn_heads = scaled_dot_product_attention(
                q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn)
            out_proj_weight = multi_head_attn.out_proj.weight.numpy()
            reference = fc(attn_heads, out_proj_weight)

            np.testing.assert_allclose(
                attn_output.numpy(), reference, atol=1e-6)
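# --- Hedged sketch of the numpy reference helpers used above ---
# `prepare_qkv`, `scaled_dot_product_attention`, and `fc` are defined
# elsewhere in this test file; the `_sketch` versions below are illustrative
# reconstructions of the math the helper relies on, not the real definitions.
import numpy as np

def softmax_sketch(x):
    # numerically stable softmax over the last axis
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def fc_sketch(x, weight):
    # output projection: [batch, seq, embed_dim] x [embed_dim, embed_dim]
    return np.matmul(x, weight)

def scaled_dot_product_attention_sketch(q, k, v, d_key, attn_mask=None):
    # q, k, v: [batch, num_heads, seq_len, head_dim]
    product = np.matmul(q, k.transpose((0, 1, 3, 2))) / np.sqrt(d_key)
    if attn_mask is not None:
        product = product + attn_mask  # additive mask, e.g. -inf to block
    weights = softmax_sketch(product)
    out = np.matmul(weights, v)
    # merge heads back into [batch, seq_len, num_heads * head_dim]
    b, h, s, d = out.shape
    return out.transpose((0, 2, 1, 3)).reshape((b, s, h * d))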
def __init__(self,
             d_model,
             nhead,
             dim_feedforward,
             dropout=0.1,
             activation="relu",
             normalize_before=False,
             weight_attr=None,
             bias_attr=None):
    self._config = locals()
    self._config.pop("self")
    self._config.pop("__class__", None)  # py3

    super(TransformerDecoderLayer, self).__init__()
    self.normalize_before = normalize_before
    weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
    bias_attrs = _convert_param_attr_to_list(bias_attr, 3)
    self.nhead = nhead
    self.self_attn = MultiHeadAttention(d_model,
                                        nhead,
                                        dropout=dropout,
                                        weight_attr=weight_attrs[0],
                                        bias_attr=bias_attrs[0])
    self.multihead_attn = MultiHeadAttention(d_model,
                                             nhead,
                                             dropout=dropout,
                                             weight_attr=weight_attrs[1],
                                             bias_attr=bias_attrs[1])
    self.linear1 = Linear(d_model, dim_feedforward, weight_attrs[2],
                          bias_attr=bias_attrs[2])
    self.dropout = Dropout(dropout, mode="upscale_in_train")
    self.linear2 = Linear(dim_feedforward, d_model, weight_attrs[2],
                          bias_attr=bias_attrs[2])
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout, mode="upscale_in_train")
    self.dropout2 = Dropout(dropout, mode="upscale_in_train")
    self.dropout3 = Dropout(dropout, mode="upscale_in_train")
    self.activation = getattr(F, activation)
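# Minimal usage sketch for the layer defined above, assuming the public
# paddle.nn.TransformerDecoderLayer API; shapes are illustrative only.
import paddle
from paddle.nn import TransformerDecoderLayer

dec_layer = TransformerDecoderLayer(d_model=128, nhead=2, dim_feedforward=512)
tgt = paddle.rand((2, 4, 128))     # [batch_size, target_length, d_model]
memory = paddle.rand((2, 6, 128))  # [batch_size, source_length, d_model]
output = dec_layer(tgt, memory)    # -> tensor of shape [2, 4, 128]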
def test_transformer_encoder_layer_attr_1(self):
    with fluid.dygraph.guard(fluid.CPUPlace()):
        paddle.seed(2020)
        paddle.framework.random._manual_program_seed(2020)
        ffn_fc1_act = "relu"
        # 1. generate basic params
        batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params(
            mode="encoder_layer")
        # 2. generate input for encoder
        src = np.random.rand(batch_size, sequence_length,
                             d_model).astype("float32")
        src_mask = np.zeros((batch_size, n_head, sequence_length,
                             sequence_length)).astype("float32")
        src_mask[0][0][0][0] = -np.inf

        for cache in [True, False]:
            # 3. paddle computation
            encoder_layer = TransformerEncoderLayer(
                d_model, n_head, dim_feedforward, dropout, ffn_fc1_act,
                attn_dropout, act_dropout)
            cache_objs = None
            if cache:
                cache_objs = encoder_layer.gen_cache(paddle.to_tensor(src))

            encoder_output = encoder_layer(paddle.to_tensor(src),
                                           paddle.to_tensor(src_mask),
                                           cache_objs)
            encoder_output = encoder_output[0].numpy(
            ) if cache else encoder_output.numpy()

            # 4. numpy reference
            residual = src
            # paddle self attention
            self_attn = MultiHeadAttention(d_model,
                                           n_head,
                                           dropout=attn_dropout)
            attn_output = self_attn(paddle.to_tensor(src),
                                    paddle.to_tensor(src),
                                    paddle.to_tensor(src),
                                    paddle.to_tensor(src_mask), cache_objs)
            attn_output = attn_output[0].numpy(
            ) if cache else attn_output.numpy()

            src = attn_output + residual
            src_norm = layer_norm(src, d_model, encoder_layer.norm1)
            residual = src_norm

            ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act)
            src = residual + ffn_output
            src = layer_norm(src, d_model, encoder_layer.norm2)

            np.testing.assert_allclose(
                encoder_output, src, rtol=1e-5, atol=1e-6)
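# --- Hedged sketch of the `layer_norm` and `ffn` reference helpers ---
# Both are defined elsewhere in this test file; these `_sketch` versions only
# illustrate the math the tests rely on, reading the learned parameters
# straight from the paddle layers.
import numpy as np

def layer_norm_sketch(x, d_model, norm_layer, epsilon=1e-5):
    # normalize over the feature dimension with the layer's scale and bias
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + epsilon)
    return x_hat * norm_layer.weight.numpy() + norm_layer.bias.numpy()

def ffn_sketch(x, layer, activation="relu"):
    # position-wise feed-forward: linear1 -> activation -> linear2
    h = np.matmul(x, layer.linear1.weight.numpy()) + layer.linear1.bias.numpy()
    if activation == "relu":
        h = np.maximum(h, 0.0)
    return np.matmul(h, layer.linear2.weight.numpy()) + layer.linear2.bias.numpy()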
def test_transformer_decoder_layer(self):
    with fluid.dygraph.guard(fluid.CPUPlace()):
        paddle.seed(2020)
        activation = "relu"
        normalize_before = False
        batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params(
            mode="decoder_layer")
        tgt = np.random.rand(batch_size, target_length,
                             d_model).astype("float32")
        memory = np.random.rand(batch_size, source_length,
                                d_model).astype("float32")
        tgt_mask = np.zeros((batch_size, n_head, target_length,
                             target_length)).astype("float32")
        tgt_mask[0][0][0][0] = -1e9
        memory_mask = np.zeros((batch_size, n_head, target_length,
                                source_length)).astype("float32")
        memory_mask[0][0][0][0] = -1e9

        for cache in [True, False]:
            self_attn = MultiHeadAttention(d_model,
                                           n_head,
                                           dropout=attn_dropout)
            cross_attn = MultiHeadAttention(d_model,
                                            n_head,
                                            dropout=attn_dropout)

            # paddle decoder layer
            decoder_layer = TransformerDecoderLayer(
                d_model, n_head, dim_feedforward, dropout, activation,
                attn_dropout, act_dropout, normalize_before)
            cache_objs = None
            if cache:
                cache_objs = decoder_layer.gen_cache(paddle.to_tensor(memory))

            decoder_output = decoder_layer(paddle.to_tensor(tgt),
                                           paddle.to_tensor(memory),
                                           paddle.to_tensor(tgt_mask),
                                           paddle.to_tensor(memory_mask),
                                           cache_objs)
            decoder_output = decoder_output[0].numpy(
            ) if cache else decoder_output.numpy()

            # numpy reference
            residual = tgt
            # self attention
            self_attn_cache = cache_objs[0] if cache_objs is not None else None
            tgt = self_attn(paddle.to_tensor(tgt), paddle.to_tensor(tgt),
                            paddle.to_tensor(tgt), paddle.to_tensor(tgt_mask),
                            self_attn_cache)
            tgt = tgt[0].numpy() if cache else tgt.numpy()

            tgt = residual + tgt
            # postprocess
            tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1)
            residual = tgt_norm
            # cross attention
            cross_attn_cache = cache_objs[1] if cache_objs is not None else None
            tgt = cross_attn(paddle.to_tensor(tgt_norm),
                             paddle.to_tensor(memory),
                             paddle.to_tensor(memory),
                             paddle.to_tensor(memory_mask), cross_attn_cache)
            tgt = tgt[0].numpy() if cache else tgt.numpy()
            # postprocess
            tgt = tgt + residual
            tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2)
            residual = tgt_norm
            # FFN
            ffn_output = ffn(tgt_norm, decoder_layer, activation)
            # postprocess
            tgt = residual + ffn_output
            tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3)

            np.testing.assert_allclose(
                decoder_output, tgt_norm, rtol=1e-5, atol=1e-6)
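# Hedged sketch of the incremental-decoding path exercised by the cache
# branch above, assuming the public paddle.nn.TransformerDecoderLayer API;
# sizes are illustrative only.
import paddle
from paddle.nn import TransformerDecoderLayer

layer = TransformerDecoderLayer(d_model=64, nhead=2, dim_feedforward=256)
layer.eval()  # disable dropout so repeated runs are deterministic
memory = paddle.rand((1, 8, 64))
# gen_cache returns (incremental self-attn cache, static cross-attn cache)
cache = layer.gen_cache(memory)
step = paddle.rand((1, 1, 64))  # decode one target position at a time
# with a cache, the layer returns (output, updated_cache)
output, cache = layer(step, memory, None, None, cache)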