def test_attention_layer(self):
  hidden_size = 64
  num_heads = 4
  dropout = 0.5
  layer = attention_layer.SelfAttention(hidden_size, num_heads, dropout)
  # The layer should round-trip its constructor arguments through get_config().
  self.assertDictEqual(
      layer.get_config(), {
          "hidden_size": hidden_size,
          "num_heads": num_heads,
          "attention_dropout": dropout,
      })
  length = 2
  x = tf.ones([1, length, hidden_size])
  bias = tf.ones([1])
  # Start with an empty cache; the layer appends new keys/values to it in place.
  cache = {
      "k": tf.zeros([1, 0, hidden_size]),
      "v": tf.zeros([1, 0, hidden_size]),
  }
  y = layer(x, bias, training=True, cache=cache)
  self.assertEqual(y.shape, (1, length, hidden_size))
  # After the call the cache holds keys/values for all `length` positions.
  self.assertEqual(cache["k"].shape, (1, length, hidden_size))
  self.assertEqual(cache["v"].shape, (1, length, hidden_size))
Example 2
  def build(self, input_shape):
    """Builds the encoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      # Create sublayers for each layer.
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      feed_forward_network = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])

      # Wrap each sublayer so layer normalization, dropout, and the residual
      # connection are applied uniformly around it.
      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params),
          PrePostProcessingWrapper(feed_forward_network, params)
      ])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params["hidden_size"])
    super(EncoderStack, self).build(input_shape)
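For reference, a sketch of the hyperparameter dictionary this build() reads. The values are illustrative, not a recommended configuration, and the wrapper classes may read additional keys (e.g. layer_postprocess_dropout):

params = {
    "num_hidden_layers": 6,   # number of [self-attention, feed-forward] blocks
    "hidden_size": 512,       # model width; must be divisible by num_heads
    "num_heads": 8,
    "attention_dropout": 0.1,
    "filter_size": 2048,      # inner width of the feed-forward network
    "relu_dropout": 0.1,
}
# encoder = EncoderStack(params)  # build() then runs on the first call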
Example 3
  def build(self, input_shape):
    """Builds the decoder stack."""
    params = self.params
    for _ in range(params["num_hidden_layers"]):
      # Create sublayers for each layer: self-attention over the decoder
      # input, attention over the encoder output, and a feed-forward network.
      self_attention_layer = attention_layer.SelfAttention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      enc_dec_attention_layer = attention_layer.Attention(
          params["hidden_size"], params["num_heads"],
          params["attention_dropout"])
      feed_forward_network = ffn_layer.FeedForwardNetwork(
          params["hidden_size"], params["filter_size"], params["relu_dropout"])

      self.layers.append([
          PrePostProcessingWrapper(self_attention_layer, params),
          PrePostProcessingWrapper(enc_dec_attention_layer, params),
          PrePostProcessingWrapper(feed_forward_network, params)
      ])
    # Create final layer normalization layer.
    self.output_normalization = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype="float32")
    super(DecoderStack, self).build(input_shape)
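Each entry of self.layers built above pairs three sublayers: masked self-attention over the decoder input, attention over the encoder output, and a feed-forward network. A self-contained sketch of that block layout, using tf.keras.layers.MultiHeadAttention as a stand-in for the repo's SelfAttention/Attention classes (shapes and hyperparameters are illustrative; use_causal_mask needs a reasonably recent TensorFlow):

import tensorflow as tf

hidden_size, num_heads, filter_size = 64, 4, 256
batch, tgt_len, src_len = 2, 5, 7

self_attn = tf.keras.layers.MultiHeadAttention(num_heads, hidden_size // num_heads)
cross_attn = tf.keras.layers.MultiHeadAttention(num_heads, hidden_size // num_heads)
ffn = tf.keras.Sequential([
    tf.keras.layers.Dense(filter_size, activation="relu"),
    tf.keras.layers.Dense(hidden_size),
])

decoder_inputs = tf.random.uniform([batch, tgt_len, hidden_size])
encoder_outputs = tf.random.uniform([batch, src_len, hidden_size])

# Same ordering as each entry of self.layers above.
x = self_attn(decoder_inputs, decoder_inputs, use_causal_mask=True)  # masked self-attention
x = cross_attn(x, encoder_outputs)                                   # attend to encoder output
x = ffn(x)                                                           # position-wise feed-forward
print(x.shape)  # (2, 5, 64)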