def build(self, input_shape):
     """Builds the layer."""
     # Layers for linearly projecting the queries, keys, and values.
     size_per_head = self.hidden_size // self.num_heads
     self.query_dense_layer = layers.DenseEinsum(
         output_shape=(self.num_heads, size_per_head),
         kernel_initializer="glorot_uniform",
         use_bias=False,
         name="query")
     self.key_dense_layer = layers.DenseEinsum(
         output_shape=(self.num_heads, size_per_head),
         kernel_initializer="glorot_uniform",
         use_bias=False,
         name="key")
     self.value_dense_layer = layers.DenseEinsum(
         output_shape=(self.num_heads, size_per_head),
         kernel_initializer="glorot_uniform",
         use_bias=False,
         name="value")
     self.output_dense_layer = layers.DenseEinsum(
         output_shape=self.hidden_size,
         num_summed_dimensions=2,
         kernel_initializer="glorot_uniform",
         use_bias=False,
         name="output_transform")
     super(Attention, self).build(input_shape)
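
The DenseEinsum projections above map [batch, length, hidden_size] activations into per-head queries, keys, and values, and the output_transform layer folds the heads back into hidden_size. Below is a minimal sketch of those two einsum contractions with made-up shapes (hidden_size=512, num_heads=8); it is illustrative only, not code from the example itself.

import tensorflow as tf

# Illustrative shapes; the example above derives size_per_head the same way.
batch, length, hidden_size, num_heads = 2, 16, 512, 8
size_per_head = hidden_size // num_heads  # 64

x = tf.random.normal([batch, length, hidden_size])

# Input projection: output_shape=(num_heads, size_per_head), no bias.
qkv_kernel = tf.random.normal([hidden_size, num_heads, size_per_head])
queries = tf.einsum("abc,cde->abde", x, qkv_kernel)          # [2, 16, 8, 64]

# Output projection: num_summed_dimensions=2 contracts over (heads, depth).
out_kernel = tf.random.normal([num_heads, size_per_head, hidden_size])
attended = tf.random.normal([batch, length, num_heads, size_per_head])
outputs = tf.einsum("abcd,cde->abe", attended, out_kernel)   # [2, 16, 512]
print(queries.shape, outputs.shape)
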
Example #2
    def build(self, input_shape):
        """Builds the decoder block sublayers."""
        # Self attention.
        self.self_attention = layers.CachedAttention(
            num_heads=self.num_attention_heads,
            key_size=self.attention_head_size,
            dropout_rate=self.attention_probs_dropout_prob,
            kernel_initializer=self._kernel_initializer,
            name="self_attention")
        self.self_attention_output_dense = layers.DenseEinsum(
            output_shape=self.hidden_size,
            num_summed_dimensions=2,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="self_attention_output")
        self.self_attention_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.self_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
        # Encoder-decoder attention.
        self.encdec_attention = self._cross_attention_cls(
            num_heads=self.num_attention_heads,
            key_size=self.attention_head_size,
            dropout_rate=self.attention_probs_dropout_prob,
            output_shape=self.hidden_size,
            kernel_initializer=self._kernel_initializer,
            name="attention/encdec")
        # TODO(hongkuny): Remove when checkpoint backward compatibility is resolved.
        # pylint: disable=protected-access
        self.self_attention.build(input_shape)
        self.self_attention_output_dense = self.self_attention._output_dense
        self.encdec_attention.build(input_shape)
        self.encdec_attention_output_dense = self.encdec_attention._output_dense

        self.encdec_attention_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.encdec_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))

        # Feed-forward projection.
        self.intermediate_dense = layers.DenseEinsum(
            output_shape=self.intermediate_size,
            activation=None,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="intermediate")
        self.intermediate_activation_layer = tf.keras.layers.Activation(
            self.intermediate_activation)
        self.output_dense = layers.DenseEinsum(
            output_shape=self.hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="output")
        self.output_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm", axis=-1, epsilon=1e-12)
        super(TransformerDecoderBlock, self).build(input_shape)
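
For context (an assumption about the call-side wiring, not part of this build method), the dense, dropout, and layer-norm sublayers created above are typically composed as projection -> dropout -> residual add -> LayerNorm around each attention block:

import tensorflow as tf

hidden_size = 768
output_dense = tf.keras.layers.Dense(hidden_size)          # stand-in for the DenseEinsum above
dropout = tf.keras.layers.Dropout(0.1)
layer_norm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=1e-12)

target = tf.random.normal([2, 8, hidden_size])             # hypothetical decoder input
attention_output = tf.random.normal([2, 8, hidden_size])   # hypothetical attention output

x = dropout(output_dense(attention_output), training=True)
x = layer_norm(x + target)                                 # residual connection, then LayerNorm
print(x.shape)  # (2, 8, 768)
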
Example #3
    def build(self, unused_input_shapes):
        """Builds the decoder block sublayers."""
        # Self attention.
        self.self_attention = layers.CachedAttention(
            num_heads=self.num_attention_heads,
            head_size=self.attention_head_size,
            dropout_rate=self.attention_probs_dropout_prob,
            kernel_initializer=self._kernel_initializer,
            name="self_attention")
        self.self_attention_output_dense = layers.DenseEinsum(
            output_shape=self.hidden_size,
            num_summed_dimensions=2,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="self_attention_output")
        self.self_attention_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.self_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="self_attention_layer_norm", axis=-1, epsilon=1e-12))
        # Encoder-decoder attention.
        self.encdec_attention = self._cross_attention_cls(
            num_heads=self.num_attention_heads,
            head_size=self.attention_head_size,
            dropout_rate=self.attention_probs_dropout_prob,
            kernel_initializer=self._kernel_initializer,
            name="attention/encdec")
        self.encdec_attention_output_dense = layers.DenseEinsum(
            output_shape=self.hidden_size,
            num_summed_dimensions=2,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="attention/encdec_output")
        self.encdec_attention_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.encdec_attention_layer_norm = (tf.keras.layers.LayerNormalization(
            name="attention/encdec_output_layer_norm", axis=-1, epsilon=1e-12))

        # Feed-forward projection.
        self.intermediate_dense = layers.DenseEinsum(
            output_shape=self.intermediate_size,
            activation=None,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="intermediate")
        self.intermediate_activation_layer = tf.keras.layers.Activation(
            self.intermediate_activation)
        self.output_dense = layers.DenseEinsum(
            output_shape=self.hidden_size,
            kernel_initializer=self._kernel_initializer,
            bias_initializer=self._bias_initializer,
            name="output")
        self.output_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)
        self.output_layer_norm = tf.keras.layers.LayerNormalization(
            name="output_layer_norm", axis=-1, epsilon=1e-12)
        super(TransformerDecoderBlock, self).build(unused_input_shapes)
Example #4
    def build(self, input_shape):
        """Builds the layer."""
        # Layers for linearly projecting the queries, keys, and values.
        size_per_head = self.hidden_size // self.num_heads

        def _glorot_initializer(fan_in, fan_out):
            limit = math.sqrt(6.0 / (fan_in + fan_out))
            return tf.keras.initializers.RandomUniform(minval=-limit,
                                                       maxval=limit)

        attention_initializer = _glorot_initializer(input_shape.as_list()[-1],
                                                    self.hidden_size)
        self.query_dense_layer = layers.DenseEinsum(
            output_shape=(self.num_heads, size_per_head),
            kernel_initializer=attention_initializer,
            use_bias=False,
            name="query")
        self.key_dense_layer = layers.DenseEinsum(
            output_shape=(self.num_heads, size_per_head),
            kernel_initializer=attention_initializer,
            use_bias=False,
            name="key")
        self.value_dense_layer = layers.DenseEinsum(
            output_shape=(self.num_heads, size_per_head),
            kernel_initializer=attention_initializer,
            use_bias=False,
            name="value")

        output_initializer = _glorot_initializer(self.hidden_size,
                                                 self.hidden_size)
        self.output_dense_layer = layers.DenseEinsum(
            output_shape=self.hidden_size,
            num_summed_dimensions=2,
            kernel_initializer=output_initializer,
            use_bias=False,
            name="output_transform")
        super(Attention, self).build(input_shape)
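
A quick check of the helper above, with illustrative fan-in/fan-out values: the Glorot limit is sqrt(6 / (fan_in + fan_out)), and RandomUniform samples stay inside [-limit, limit).

import math
import tensorflow as tf

fan_in, fan_out = 512, 512                                  # hypothetical values
limit = math.sqrt(6.0 / (fan_in + fan_out))
print(round(limit, 4))                                      # 0.0765

init = tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit)
sample = init(shape=(4, 4))
print(bool(tf.reduce_max(tf.abs(sample)) <= limit))         # True
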
Example #5
 def build(self, unused_input_shapes):
     self._query_dense = layers.DenseEinsum(
         output_shape=(self._num_heads, self._head_size),
         kernel_initializer=self._kernel_initializer,
         bias_initializer=self._bias_initializer,
         kernel_regularizer=self._kernel_regularizer,
         bias_regularizer=self._bias_regularizer,
         activity_regularizer=self._activity_regularizer,
         kernel_constraint=self._kernel_constraint,
         bias_constraint=self._bias_constraint,
         dtype=self.dtype,
         name="encdocatt_query")
     self._key_dense = layers.DenseEinsum(
         output_shape=(self._num_heads, self._head_size),
         kernel_initializer=self._kernel_initializer,
         bias_initializer=self._bias_initializer,
         kernel_regularizer=self._kernel_regularizer,
         bias_regularizer=self._bias_regularizer,
         activity_regularizer=self._activity_regularizer,
         kernel_constraint=self._kernel_constraint,
         bias_constraint=self._bias_constraint,
         dtype=self.dtype,
         name="encdocatt_key")
     super(DocAttention, self).build(unused_input_shapes)
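
Once queries and keys are projected to [batch, length, num_heads, head_size], as the two DenseEinsum layers above do, attention logits are usually a scaled dot product over head_size. A hedged, self-contained sketch (shapes are illustrative and not taken from DocAttention):

import tensorflow as tf

batch, from_len, to_len, num_heads, head_size = 2, 4, 6, 8, 16
q = tf.random.normal([batch, from_len, num_heads, head_size])
k = tf.random.normal([batch, to_len, num_heads, head_size])

# [B, N, F, T]: one score per head for every (from, to) position pair.
logits = tf.einsum("BFNH,BTNH->BNFT", q, k) / tf.math.sqrt(float(head_size))
weights = tf.nn.softmax(logits, axis=-1)
print(weights.shape)  # (2, 8, 4, 6)
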
Example #6
    def __init__(
            self,
            vocab_size,
            embedding_width=128,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'embedding_width': embedding_width,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))
        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embeddings = layers.DenseEinsum(
                output_shape=hidden_size,
                kernel_initializer=initializer,
                name='embedding_projection')(embeddings)

        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        shared_layer = layers.Transformer(
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            intermediate_activation=activation,
            dropout_rate=dropout_rate,
            attention_dropout_rate=attention_dropout_rate,
            kernel_initializer=initializer,
            name='transformer')
        for _ in range(num_layers):
            data = shared_layer([data, attention_mask])

        first_token_tensor = (tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        super(AlbertTransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=[data, cls_output],
                             **kwargs)
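
Usage sketch for the encoder defined above. The constructor arguments mirror the __init__ signature shown; the import path is an assumption based on the TF Model Garden layout and may differ between releases.

import tensorflow as tf
# Assumed module path; adjust to wherever AlbertTransformerEncoder lives in your checkout.
from official.nlp.modeling.networks import albert_transformer_encoder

encoder = albert_transformer_encoder.AlbertTransformerEncoder(
    vocab_size=30000, embedding_width=128, hidden_size=768,
    num_layers=2, num_attention_heads=12, sequence_length=32,
    type_vocab_size=2, intermediate_size=3072)

word_ids = tf.zeros([1, 32], dtype=tf.int32)
mask = tf.ones([1, 32], dtype=tf.int32)
type_ids = tf.zeros([1, 32], dtype=tf.int32)

# Outputs follow the functional model above: [sequence_output, cls_output].
sequence_output, cls_output = encoder([word_ids, mask, type_ids])
print(sequence_output.shape, cls_output.shape)  # (1, 32, 768) (1, 768)
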