Example #1
    def test_padded_decode(self):
        """Test with a mask tensor."""
        num_heads, head_size = 2, 2
        from_seq_length = 4
        # TPU decoding should pre-allocate the entire sequence.
        batch_size = 3
        init_decode_length = from_seq_length

        # Directly tests the keras layer.
        cache = _create_cache(batch_size, init_decode_length, num_heads,
                              head_size)
        layer = attention.CachedAttention(num_heads=num_heads,
                                          key_size=head_size)

        # Generate data for the input (non-mask) tensors.
        from_data = tf.zeros((batch_size, from_seq_length, 8),
                             dtype=np.float32)
        decode_loop_step = 2
        mask_data = np.random.randint(2,
                                      size=(batch_size, from_seq_length,
                                            from_seq_length),
                                      dtype=np.int32)
        # Call the layer directly, since the Keras functional API cannot
        # consume these inputs (the cache dict and decode_loop_step) correctly.
        masked_output_data, cache = layer([from_data, from_data],
                                          mask_data,
                                          cache,
                                          decode_loop_step=decode_loop_step)
        self.assertEqual(masked_output_data.shape, (3, 4, 8))
        self.assertEqual(cache["value"].shape, (3, 4, 2, 2))
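
This snippet (and the one in Example #2 below) calls a _create_cache helper that is not shown and assumes numpy, tensorflow, and the attention module are already imported. The following is a minimal sketch consistent with the shapes asserted above (a dict of zero-initialized "key" and "value" tensors of shape [batch, init_decode_length, num_heads, head_size]); the official.nlp.modeling.layers import path is an assumption:

import numpy as np
import tensorflow as tf

from official.nlp.modeling.layers import attention  # assumed import path


def _create_cache(batch_size, init_decode_length, num_heads, head_size):
    # Zero-initialized key/value buffers. init_decode_length is 0 in the
    # GPU/CPU case and the full sequence length for padded TPU decoding.
    return {
        "key": tf.zeros(
            [batch_size, init_decode_length, num_heads, head_size],
            dtype=tf.float32),
        "value": tf.zeros(
            [batch_size, init_decode_length, num_heads, head_size],
            dtype=tf.float32),
    }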
Example #2
    def test_masked_attention(self):
        """Test with a mask tensor."""
        num_heads, head_size = 2, 2
        # Create a 3-dimensional input (the first dimension is implicit).
        from_seq_length = 4
        batch_size = 3
        # GPU/CPU case.
        init_decode_length = 0
        # Directly tests the keras layer.
        cache = _create_cache(batch_size, init_decode_length, num_heads,
                              head_size)
        layer = attention.CachedAttention(num_heads=num_heads,
                                          key_size=head_size)

        # Generate data for the input (non-mask) tensors.
        from_data = tf.zeros((batch_size, from_seq_length, 8),
                             dtype=np.float32)
        # Call the layer with a random attention mask; with high probability
        # at least one element is masked.
        mask_data = np.random.randint(2,
                                      size=(batch_size, from_seq_length,
                                            from_seq_length))
        masked_output_data, cache = layer([from_data, from_data], mask_data,
                                          cache)
        self.assertEqual(masked_output_data.shape, (3, 4, 8))
        self.assertEqual(cache["value"].shape, (3, 4, 2, 2))

        # Tests inputs without cache.
        masked_output_data, cache = layer([from_data, from_data, mask_data])
        self.assertEqual(masked_output_data.shape, (3, 4, 8))
        self.assertIsNone(cache)
Example #3
    def build(self, input_shape):
        # Self attention.
        self.self_attention = attention.CachedAttention(
            num_heads=self.num_attention_heads,
            key_size=self.attention_head_size,
            dropout=self.attention_probs_dropout_prob,
            kernel_initializer=self._kernel_initializer,
            name="self_attention")
        self.self_attention_output_dense = dense_einsum.DenseEinsum(
            output_shape=self.hidden_size,
            num_summed_dimensions=2,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="self_attention_output")
        self.self_attention_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.self_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
        # Encoder-decoder attention.
        self.encdec_attention = self._cross_attention_cls(
            num_heads=self.num_attention_heads,
            key_size=self.attention_head_size,
            dropout=self.attention_probs_dropout_prob,
            output_shape=self.hidden_size,
            kernel_initializer=self._kernel_initializer,
            name="attention/encdec")

        self.encdec_attention_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.encdec_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))

        # Feed-forward projection.
        self.intermediate_dense = dense_einsum.DenseEinsum(
            output_shape=self.intermediate_size,
            activation=None,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="intermediate")
        self.intermediate_activation_layer = tf.keras.layers.Activation(
            self.intermediate_activation)
        self.output_dense = dense_einsum.DenseEinsum(
            output_shape=self.hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="output")
        self.output_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm", axis=-1, epsilon=1e-12)
        super(TransformerDecoderLayer, self).build(input_shape)
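
The build() above only constructs the sublayers. For orientation, the sketch below shows how such a decoder block typically wires them together in call(), using the standard post-layer-norm Transformer decoder ordering (self-attention, cross-attention into the encoder memory, feed-forward network, each with a residual connection). The input packing, the mask names, and the cross-attention calling convention are assumptions for illustration, not the library's actual implementation; the self-attention call follows the convention shown in Examples #1 and #2:

    def call(self, inputs, cache=None, decode_loop_step=None):
        # Assumed packing: decoder input, encoder memory, cross-attention mask,
        # and the (causal) self-attention mask.
        input_tensor, memory, attention_mask, self_attention_mask = inputs

        # Masked self-attention; the per-head output is projected back to
        # hidden_size, then residual connection and layer norm.
        self_attention_output, cache = self.self_attention(
            [input_tensor, input_tensor],
            self_attention_mask,
            cache,
            decode_loop_step=decode_loop_step)
        self_attention_output = self.self_attention_output_dense(
            self_attention_output)
        self_attention_output = self.self_attention_dropout(
            self_attention_output)
        self_attention_output = self.self_attention_layer_norm(
            input_tensor + self_attention_output)

        # Cross-attention into the encoder memory, residual and layer norm.
        attention_output = self.encdec_attention(
            [self_attention_output, memory, attention_mask])
        attention_output = self.encdec_attention_dropout(attention_output)
        attention_output = self.encdec_attention_layer_norm(
            self_attention_output + attention_output)

        # Position-wise feed-forward network, residual and layer norm.
        intermediate_output = self.intermediate_dense(attention_output)
        intermediate_output = self.intermediate_activation_layer(
            intermediate_output)
        layer_output = self.output_dense(intermediate_output)
        layer_output = self.output_dropout(layer_output)
        layer_output = self.output_layer_norm(attention_output + layer_output)
        return layer_output, cache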
Example #4
    def build(self, input_shape):
        target_tensor_shape = tf.TensorShape(input_shape[0])
        if len(target_tensor_shape.as_list()) != 3:
            raise ValueError(
                "TransformerLayer expects a three-dimensional input of "
                "shape [batch, sequence, width].")
        hidden_size = target_tensor_shape[2]
        if hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, self.num_attention_heads))
        self.attention_head_size = int(hidden_size / self.num_attention_heads)
        common_kwargs = dict(bias_initializer=self._bias_initializer,
                             kernel_regularizer=self._kernel_regularizer,
                             bias_regularizer=self._bias_regularizer,
                             activity_regularizer=self._activity_regularizer,
                             kernel_constraint=self._kernel_constraint,
                             bias_constraint=self._bias_constraint)
        # Self attention.
        self.self_attention = attention.CachedAttention(
            num_heads=self.num_attention_heads,
            key_dim=self.attention_head_size,
            dropout=self.attention_dropout_rate,
            use_bias=self._use_bias,
            kernel_initializer=self._attention_initializer,
            name="self_attention",
            **common_kwargs)
        self.self_attention_output_dense = tf.keras.layers.experimental.EinsumDense(
            "abc,cd->abd",
            output_shape=(None, hidden_size),
            bias_axes="d",
            kernel_initializer=self._kernel_initializer,
            name="output",
            **common_kwargs)
        self.self_attention_dropout = tf.keras.layers.Dropout(
            rate=self.dropout_rate)
        self.self_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon))
        # Encoder-decoder attention.
        self.encdec_attention = self._cross_attention_cls(
            num_heads=self.num_attention_heads,
            key_dim=self.attention_head_size,
            dropout=self.attention_dropout_rate,
            output_shape=hidden_size,
            use_bias=self._use_bias,
            kernel_initializer=self._attention_initializer,
            name="attention/encdec",
            **common_kwargs)

        self.encdec_attention_dropout = tf.keras.layers.Dropout(
            rate=self.dropout_rate)
        self.encdec_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm",
            axis=-1,
            epsilon=self._norm_epsilon))

        # Feed-forward projection.
        self.intermediate_dense = tf.keras.layers.experimental.EinsumDense(
            "abc,cd->abd",
            output_shape=(None, self.intermediate_size),
            bias_axes="d",
            kernel_initializer=self._kernel_initializer,
            name="intermediate",
            **common_kwargs)
        self.intermediate_activation_layer = tf.keras.layers.Activation(
            self.intermediate_activation)
        self._intermediate_dropout_layer = tf.keras.layers.Dropout(
            rate=self._intermediate_dropout)
        self.output_dense = tf.keras.layers.experimental.EinsumDense(
            "abc,cd->abd",
            output_shape=(None, hidden_size),
            bias_axes="d",
            kernel_initializer=self._kernel_initializer,
            name="output",
            **common_kwargs)
        self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
        self.output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm", axis=-1, epsilon=self._norm_epsilon)
        super().build(input_shape)
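
Example #4 replaces the custom dense_einsum.DenseEinsum layers with tf.keras.layers.experimental.EinsumDense (moved to tf.keras.layers.EinsumDense in later TensorFlow releases). The equation "abc,cd->abd" with bias_axes="d" is a position-wise dense projection over the last axis; a small standalone check, with shapes chosen arbitrarily for illustration:

import tensorflow as tf

# Contract the width axis (c) against a [width, units] kernel, leaving the
# batch (a) and sequence (b) axes untouched; the bias is added along "d".
projection = tf.keras.layers.experimental.EinsumDense(
    "abc,cd->abd", output_shape=(None, 16), bias_axes="d")
x = tf.zeros([3, 4, 8])      # [batch, sequence, width]
print(projection(x).shape)   # (3, 4, 16)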
Example #5
    def build(self, input_shape):
        target_tensor_shape = tf.TensorShape(input_shape[0])
        if len(target_tensor_shape) != 3:
            raise ValueError(
                "TransformerLayer expects a three-dimensional input of "
                "shape [batch, sequence, width].")
        hidden_size = target_tensor_shape[2]
        if hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, self.num_attention_heads))
        self.attention_head_size = int(hidden_size / self.num_attention_heads)
        # Self attention.
        self.self_attention = attention.CachedAttention(
            num_heads=self.num_attention_heads,
            key_size=self.attention_head_size,
            dropout=self.attention_dropout_rate,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="self_attention")
        self.self_attention_output_dense = dense_einsum.DenseEinsum(
            output_shape=hidden_size,
            num_summed_dimensions=2,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="self_attention_output")
        self.self_attention_dropout = tf.keras.layers.Dropout(
            rate=self.dropout_rate)
        self.self_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
        # Encoder-decoder attention.
        self.encdec_attention = self._cross_attention_cls(
            num_heads=self.num_attention_heads,
            key_size=self.attention_head_size,
            dropout=self.attention_dropout_rate,
            output_shape=hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="attention/encdec")

        self.encdec_attention_dropout = tf.keras.layers.Dropout(
            rate=self.dropout_rate)
        self.encdec_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))

        # Feed-forward projection.
        self.intermediate_dense = dense_einsum.DenseEinsum(
            output_shape=self.intermediate_size,
            activation=None,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="intermediate")
        self.intermediate_activation_layer = tf.keras.layers.Activation(
            self.intermediate_activation)
        self.output_dense = dense_einsum.DenseEinsum(
            output_shape=hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            kernel_regularizer=self._kernel_regularizer,
            bias_regularizer=self._bias_regularizer,
            activity_regularizer=self._activity_regularizer,
            kernel_constraint=self._kernel_constraint,
            bias_constraint=self._bias_constraint,
            name="output")
        self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
        self.output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm", axis=-1, epsilon=1e-12)
        super(TransformerDecoderLayer, self).build(input_shape)
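
Examples #3 and #5 come from an older revision of the codebase in which CachedAttention takes the per-head size as key_size and the projections use the custom dense_einsum.DenseEinsum layer, while Example #4 follows the tf.keras.layers.MultiHeadAttention argument naming (key_dim, use_bias, EinsumDense projections). Only one of the two spellings below is accepted by any given release; the argument names are taken from the snippets above and the values are arbitrary:

# Older revision (Examples #1, #2, #3, #5): per-head size passed as key_size.
attn_old = attention.CachedAttention(num_heads=8, key_size=64)

# Newer revision (Example #4): the same quantity is passed as key_dim.
attn_new = attention.CachedAttention(num_heads=8, key_dim=64)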