예제 #1
0
 def _build_embedding_layer(self):
     embedding_width = self._config_dict[
         'embedding_width'] or self._config_dict['hidden_size']
     return layers.OnDeviceEmbedding(
         vocab_size=self._config_dict['vocab_size'],
         embedding_width=embedding_width,
         initializer=self._config_dict['initializer'],
         name='word_embeddings')
예제 #2
0
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            embedding_layer=None,
            **kwargs):
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            embedding_layer_inst = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            embedding_layer_inst = embedding_layer
        word_embeddings = embedding_layer_inst(word_ids)

        # Always uses dynamic slicing for simplicity.
        position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = position_embedding_layer(word_embeddings)
        type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        embeddings = embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = embedding_projection(embeddings)
        else:
            embedding_projection = None

        transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        last_encoder_output = encoder_outputs[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_encoder_output[:, 0, :]
        pooler_layer = tf.keras.layers.Dense(units=hidden_size,
                                             activation='tanh',
                                             kernel_initializer=initializer,
                                             name='pooler_transform')
        cls_output = pooler_layer(first_token_tensor)

        outputs = dict(
            sequence_output=encoder_outputs[-1],
            pooled_output=cls_output,
            encoder_outputs=encoder_outputs,
        )

        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super(BertEncoder, self).__init__(inputs=[word_ids, mask, type_ids],
                                          outputs=outputs,
                                          **kwargs)

        config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
        }

        # We are storing the config dict as a namedtuple here to ensure checkpoint
        # compatibility with an earlier version of this model which did not track
        # the config dict attribute. TF does not track immutable attrs which
        # do not contain Trackables, so by creating a config namedtuple instead of
        # a dict we avoid tracking it.
        config_cls = collections.namedtuple('Config', config_dict.keys())
        self._config = config_cls(**config_dict)
        self._pooler_layer = pooler_layer
        self._transformer_layers = transformer_layers
        self._embedding_norm_layer = embedding_norm_layer
        self._embedding_layer = embedding_layer_inst
        self._position_embedding_layer = position_embedding_layer
        self._type_embedding_layer = type_embedding_layer
        if embedding_projection is not None:
            self._embedding_projection = embedding_projection
예제 #3
0
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            **kwargs):
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
        }

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = self._build_embedding_layer()
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = self._type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        embeddings = self._embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = self._embedding_projection(embeddings)

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        last_enocder_output = encoder_outputs[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_enocder_output[:, 0, :]
        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        outputs = dict(
            sequence_output=encoder_outputs[-1],
            pooled_output=cls_output,
            encoder_outputs=encoder_outputs,
        )
        super(BertEncoder, self).__init__(inputs=[word_ids, mask, type_ids],
                                          outputs=outputs,
                                          **kwargs)