Example #1
    def _attention_scores(self, query, key, mask=None):
        """Calculates attention scores as a query-key dot product.

    Args:
      query: Query tensor of shape `[batch_size, sequence_length, dim]`.
      key: Key tensor of shape `[batch_size, sequence_length, dim]`.
      mask: Optional mask tensor of shape `[batch_size, sequence_length]`.

    Returns:
      Tensor of shape `[batch_size, sequence_length, sequence_length]`.
    """
        scores = tf.linalg.matmul(query, key, transpose_b=True)

        if mask is not None:
            mask = layers.SelfAttentionMask()(scores, mask)
            # Prevent pointing to self (zeros down the diagonal).
            diagonal_mask = tf.linalg.diag(tf.zeros(
                (tf.shape(mask)[0], self._seq_length)),
                                           padding_value=1)
            diagonal_mask = tf.cast(diagonal_mask, tf.float32)
            mask = tf.math.multiply(diagonal_mask, mask)
            # Since this is applied before the softmax, push masked positions
            # to a very large negative value.
            mask_add = -1e9 * (1. - mask)
            scores = scores * mask + mask_add

        return scores
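The masking pattern above can be reproduced with plain TensorFlow ops. The sketch below is only an illustration under assumed shapes; it mimics what `layers.SelfAttentionMask` plus the diagonal mask accomplish, not the library's exact implementation.

import tensorflow as tf

# Hypothetical shapes, chosen only for illustration.
batch_size, seq_length, dim = 2, 4, 8
query = tf.random.normal((batch_size, seq_length, dim))
key = tf.random.normal((batch_size, seq_length, dim))
padding_mask = tf.constant([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=tf.float32)

scores = tf.linalg.matmul(query, key, transpose_b=True)  # [batch, seq, seq]

# Broadcast the 2-D padding mask to a [batch, seq, seq] attention mask.
attention_mask = tf.ones((batch_size, seq_length, 1)) * padding_mask[:, tf.newaxis, :]
# Zero the diagonal so a position cannot attend to itself.
attention_mask *= 1.0 - tf.eye(seq_length, batch_shape=[batch_size])

# Additive mask: push disallowed positions to a large negative value pre-softmax.
scores = scores * attention_mask + -1e9 * (1.0 - attention_mask)
attention_weights = tf.nn.softmax(scores, axis=-1)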
Example #2
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        embedding_width = 768
        dropout_rate = 0.1
        initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=30522,
            embedding_width=embedding_width,
            initializer=initializer,
            name="word_embeddings",
        )

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=512,
            name="position_embedding",
        )
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=2,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name="type_embeddings",
        )
        self._add = tf.keras.layers.Add()
        self._layer_norm = tf.keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)
        self._dropout = tf.keras.layers.Dropout(rate=dropout_rate)

        self._attention_mask = layers.SelfAttentionMask()
        self._transformer_layers = []
        for i in range(12):
            layer = layers.Transformer(
                num_attention_heads=12,
                intermediate_size=3072,
                intermediate_activation=activations.gelu,
                dropout_rate=dropout_rate,
                attention_dropout_rate=0.1,
                output_range=None,
                kernel_initializer=initializer,
                name="transformer/layer_%d" % i,
            )
            self._transformer_layers.append(layer)

        self._lambda = tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))
        self._pooler_layer = tf.keras.layers.Dense(
            units=embedding_width,
            activation="tanh",
            kernel_initializer=initializer,
            name="pooler_transform",
        )
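A call() for the layers constructed above would typically wire them in embedding -> mask -> transformer stack -> pooler order. The following sketch is an assumed forward pass, not code from the library; the call conventions (SelfAttentionMask on [data, mask], Transformer on [data, attention_mask]) simply follow the other examples in this collection.

    def call(self, word_ids, mask, type_ids):
        # Sum word, position and type embeddings, then normalize and drop out.
        word_embeddings = self._embedding_layer(word_ids)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        type_embeddings = self._type_embedding_layer(type_ids)
        embeddings = self._add(
            [word_embeddings, position_embeddings, type_embeddings])
        data = self._dropout(self._layer_norm(embeddings))

        # Broadcast the 2-D padding mask into a 3-D self-attention mask.
        attention_mask = self._attention_mask([data, mask])

        # Each transformer layer consumes [data, attention_mask].
        for transformer in self._transformer_layers:
            data = transformer([data, attention_mask])

        # Pool the [CLS] position through the dense tanh pooler.
        pooled_output = self._pooler_layer(self._lambda(data))
        return data, pooled_output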
Example #3
 def __init__(self,
              emb_dim=768,
              num_layers=6,
              num_heads=12,
              mlp_dim=3072,
              mlp_act=activations.approximate_gelu,
              output_dropout=0.1,
              attention_dropout=0.1,
              mlp_dropout=0.1,
              norm_first=True,
              norm_input=False,
              norm_output=True,
              causal=False,
              trainable_posemb=False,
              posemb_init=initializers.HarmonicEmbeddings(scale_factor=1e-4,
                                                          max_freq=1.0),
              aaemb_init=tf.initializers.RandomNormal(stddev=1.0),
              kernel_init=tf.initializers.GlorotUniform(),
              aaemb_scale_factor=None,
              max_len=1024,
              **kwargs):
     super().__init__(**kwargs)
     self._causal = causal
     self.posemb_layer = nlp_layers.PositionEmbedding(
         max_length=max_len,
         initializer=posemb_init,
         trainable=trainable_posemb,
         name='embeddings/positional')
     self.aaemb_layer = nlp_layers.OnDeviceEmbedding(
         vocab_size=len(self._vocab),
         embedding_width=emb_dim,
         initializer=aaemb_init,
         scale_factor=aaemb_scale_factor,
         name='embeddings/aminoacid')
     layer_norm_cls = functools.partial(tf.keras.layers.LayerNormalization,
                                        axis=-1,
                                        epsilon=1e-12)
     self._input_norm_layer = (layer_norm_cls(
         name='embeddings/layer_norm') if norm_input else None)
     self._output_norm_layer = (layer_norm_cls(
         name='output/layer_norm') if norm_output else None)
     self._dropout_layer = tf.keras.layers.Dropout(
         rate=output_dropout, name='embeddings/dropout')
     self._attention_mask = nlp_layers.SelfAttentionMask()
     self._transformer_layers = []
     for i in range(num_layers):
         self._transformer_layers.append(
             nlp_layers.TransformerEncoderBlock(
                 num_attention_heads=num_heads,
                 inner_dim=mlp_dim,
                 inner_activation=mlp_act,
                 output_dropout=output_dropout,
                 attention_dropout=attention_dropout,
                 inner_dropout=mlp_dropout,
                 kernel_initializer=kernel_init,
                 norm_first=norm_first,
                 name=f'transformer/layer_{i}'))
Example #4
 def __init__(self, vocab_size, hidden_size):
   super().__init__()
   self.inputs = [
       tf.keras.layers.Input(
           shape=(None,), dtype=tf.int32, name="input_word_ids"),
       tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
   ]
   self.attention_mask = layers.SelfAttentionMask()
   self.embedding_layer = layers.OnDeviceEmbedding(
       vocab_size=vocab_size,
       embedding_width=hidden_size,
       initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
       name="word_embeddings")
Example #5
  def call(self, sequence_data, input_mask):
    """Compute inner-products of hidden vectors with sampled element embeddings.

    Args:
      sequence_data: A [batch_size, seq_length, num_hidden] tensor.
      input_mask: A [batch_size, seq_length] binary mask to separate the input
        from the padding.

    Returns:
      A [batch_size, seq_length] tensor.
    """
    attention_mask = layers.SelfAttentionMask()([sequence_data, input_mask])
    data = sequence_data
    for hidden_layer in self.hidden_layers:
      data = hidden_layer([data, attention_mask])
    rtd_logits = self.rtd_head(self.dense(data))
    return tf.squeeze(rtd_logits, axis=-1)
Example #6
    def call(self,
             inputs: tf.Tensor,
             training: bool = True,
             list_mask: Optional[tf.Tensor] = None) -> tf.Tensor:
        """Calls the document interaction layer to apply cross-document attention.

    Args:
      inputs: A tensor of shape [batch_size, list_size, feature_dims].
      training: Whether in training or inference mode.
      list_mask: A boolean tensor of shape [batch_size, list_size], which is
        True for a valid example and False for an invalid one. If this is `None`,
        then all examples are treated as valid.

    Returns:
      A tensor of shape [batch_size, list_size, head_size].
    """
        batch_size = tf.shape(inputs)[0]
        list_size = tf.shape(inputs)[1]
        if list_mask is None:
            list_mask = tf.ones(shape=(batch_size, list_size), dtype=tf.bool)
        input_tensor = self._input_projection(inputs, training=training)

        list_mask = tf.cast(list_mask, dtype=tf.int32)
        attention_mask = nlp_modeling_layers.SelfAttentionMask()(
            [list_mask, list_mask])

        for attention_layer, dropout_layer, norm_layer in self._attention_layers:
            output = attention_layer(query=input_tensor,
                                     value=input_tensor,
                                     attention_mask=attention_mask,
                                     training=training)
            output = dropout_layer(output, training=training)
            # Apply a residual connection here, similar to the logic in Transformer.
            input_tensor = norm_layer(output + input_tensor, training=training)

        return input_tensor
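A hypothetical invocation of this call() could look as follows; din_layer stands for an instance of the enclosing document-interaction class, and all shapes are illustrative.

import tensorflow as tf

batch_size, list_size, feature_dims = 4, 10, 16
inputs = tf.random.normal((batch_size, list_size, feature_dims))
# Valid-list lengths per batch element; padded positions become False.
list_mask = tf.sequence_mask([10, 7, 3, 1], maxlen=list_size)
# din_layer: hypothetical instance of the layer whose call() is shown above.
outputs = din_layer(inputs, training=False, list_mask=list_mask)
# outputs has shape [batch_size, list_size, head_size].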
Example #7
  def __init__(self,
               vocab_size,
               type_vocab_size,
               hidden_size,
               max_seq_length,
               initializer,
               dropout_rate,
               use_position_id=False,
               pack_multiple_sequences=False,
               **kwargs):
    initializer = tf.keras.initializers.get(initializer)
    config_dict = {
        'vocab_size': vocab_size,
        'type_vocab_size': type_vocab_size,
        'hidden_size': hidden_size,
        'max_seq_length': max_seq_length,
        'initializer': tf.keras.initializers.serialize(initializer),
        'dropout_rate': dropout_rate,
        'use_position_id': use_position_id,
        'pack_multiple_sequences': pack_multiple_sequences,
    }

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')
    inputs = {
        'input_word_ids': word_ids,
        'input_mask': mask,
        'input_type_ids': type_ids,
    }
    if use_position_id:
      position_ids = tf.keras.layers.Input(
          shape=(None,), dtype=tf.int32, name='position_ids')
      inputs['position_ids'] = position_ids
    else:
      position_ids = None

    if pack_multiple_sequences:
      sub_seq_mask = PackedSequenceMask()(word_ids)
    else:
      sub_seq_mask = None

    embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=vocab_size,
        embedding_width=hidden_size,
        initializer=initializer,
        name='word_embeddings')
    word_embeddings = embedding_layer(word_ids)

    # Always uses dynamic slicing for simplicity.
    position_embedding_layer = PositionEmbeddingWithSubSeqMask(
        initializer=initializer,
        use_dynamic_slicing=True,
        max_sequence_length=max_seq_length,
        name='position_embedding')
    position_embeddings = position_embedding_layer(
        word_embeddings, position_ids, sub_seq_mask)

    type_embeddings = (
        layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

    embeddings = tf.keras.layers.Add()(
        [word_embeddings, position_embeddings, type_embeddings])
    embeddings = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)(
            embeddings)
    embeddings = tf.keras.layers.Dropout(
        rate=dropout_rate, dtype=tf.float32)(
            embeddings)

    attention_mask = layers.SelfAttentionMask()([embeddings, mask])
    if sub_seq_mask is not None:
      attention_mask = tf.keras.layers.Lambda(
          lambda x: x[0] * tf.cast(x[1], x[0].dtype))(
              [attention_mask, sub_seq_mask])

    outputs = [embeddings, attention_mask]
    super(PackedSequenceEmbedding, self).__init__(
        inputs=inputs, outputs=outputs, **kwargs)
    # TF does not track immutable attrs which do not contain Trackables,
    # so by creating a config namedtuple instead of a dict we avoid tracking it.
    config_cls = collections.namedtuple('Config', config_dict.keys())
    self._config = config_cls(**config_dict)
    self._embedding_layer = embedding_layer
    self._position_embedding_layer = position_embedding_layer
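The Lambda near the end simply multiplies the broadcast padding mask by the boolean sub-sequence mask so that tokens from different packed sequences cannot attend to each other. Below is a standalone sketch of that combination in plain TensorFlow, using an assumed [batch, seq, seq] sub-sequence mask as a stand-in for the output of PackedSequenceMask.

import tensorflow as tf

batch_size, seq_length = 2, 6
# [batch, seq, seq] padding-derived mask, as produced by SelfAttentionMask.
attention_mask = tf.ones((batch_size, seq_length, seq_length), tf.float32)

# Hypothetical sub-sequence ids: tokens sharing an id belong to the same
# packed sequence (a stand-in for what PackedSequenceMask derives from ids).
seq_ids = tf.constant([[0, 0, 0, 1, 1, 1],
                       [0, 0, 1, 1, 1, 1]])
sub_seq_mask = tf.equal(seq_ids[:, :, tf.newaxis], seq_ids[:, tf.newaxis, :])

# Same combination as the Lambda above.
attention_mask = attention_mask * tf.cast(sub_seq_mask, attention_mask.dtype)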
Example #8
    def __init__(self,
                 word_vocab_size=30522,
                 word_embed_size=128,
                 type_vocab_size=2,
                 max_sequence_length=512,
                 num_blocks=24,
                 hidden_size=512,
                 num_attention_heads=4,
                 intermediate_size=512,
                 intermediate_act_fn='relu',
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 intra_bottleneck_size=128,
                 initializer_range=0.02,
                 use_bottleneck_attention=False,
                 key_query_shared_bottleneck=True,
                 num_feedforward_networks=4,
                 normalization_type='no_norm',
                 classifier_activation=False,
                 **kwargs):
        """Class initialization.

    Arguments:
      word_vocab_size: Number of words in the vocabulary.
      word_embed_size: Word embedding size.
      type_vocab_size: Number of word types.
      max_sequence_length: Maximum length of input sequence.
      num_blocks: Number of transformer blocks in the encoder model.
      hidden_size: Hidden size for the transformer block.
      num_attention_heads: Number of attention heads in the transformer block.
      intermediate_size: The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: Dropout probability for the hidden layers.
      attention_probs_dropout_prob: Dropout probability of the attention
        probabilities.
      intra_bottleneck_size: Size of bottleneck.
      initializer_range: The stddev of the truncated_normal_initializer for
        initializing all weight matrices.
      use_bottleneck_attention: Use attention inputs from the bottleneck
        transformation. If true, the following `key_query_shared_bottleneck`
        will be ignored.
      key_query_shared_bottleneck: Whether to share linear transformation for
        keys and queries.
      num_feedforward_networks: Number of stacked feed-forward networks.
      normalization_type: The normalization type; only 'no_norm' and
        'layer_norm' are supported. 'no_norm' represents the element-wise linear
        transformation for the student model, as suggested by the original
        MobileBERT paper. 'layer_norm' is used for the teacher model.
      classifier_activation: Whether to use the tanh activation for the final
        representation of the [CLS] token in fine-tuning.
      **kwargs: Other keyword arguments.
    """
        self._self_setattr_tracking = False
        initializer = tf.keras.initializers.TruncatedNormal(
            stddev=initializer_range)

        # layer instantiation
        self.embedding_layer = layers.MobileBertEmbedding(
            word_vocab_size=word_vocab_size,
            word_embed_size=word_embed_size,
            type_vocab_size=type_vocab_size,
            output_embed_size=hidden_size,
            max_sequence_length=max_sequence_length,
            normalization_type=normalization_type,
            initializer=initializer,
            dropout_rate=hidden_dropout_prob)

        self._transformer_layers = []
        for layer_idx in range(num_blocks):
            transformer = layers.MobileBertTransformer(
                hidden_size=hidden_size,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_act_fn=intermediate_act_fn,
                hidden_dropout_prob=hidden_dropout_prob,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                intra_bottleneck_size=intra_bottleneck_size,
                use_bottleneck_attention=use_bottleneck_attention,
                key_query_shared_bottleneck=key_query_shared_bottleneck,
                num_feedforward_networks=num_feedforward_networks,
                normalization_type=normalization_type,
                initializer=initializer,
                name=f'transformer_layer_{layer_idx}')
            self._transformer_layers.append(transformer)

        # input tensor
        input_ids = tf.keras.layers.Input(shape=(None, ),
                                          dtype=tf.int32,
                                          name='input_word_ids')
        input_mask = tf.keras.layers.Input(shape=(None, ),
                                           dtype=tf.int32,
                                           name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')
        self.inputs = [input_ids, input_mask, type_ids]
        attention_mask = layers.SelfAttentionMask()([input_ids, input_mask])

        # build the computation graph
        all_layer_outputs = []
        all_attention_scores = []
        embedding_output = self.embedding_layer(input_ids, type_ids)
        all_layer_outputs.append(embedding_output)
        prev_output = embedding_output

        for layer_idx in range(num_blocks):
            layer_output, attention_score = self._transformer_layers[
                layer_idx](prev_output,
                           attention_mask,
                           return_attention_scores=True)
            all_layer_outputs.append(layer_output)
            all_attention_scores.append(attention_score)
            prev_output = layer_output
        first_token = tf.squeeze(prev_output[:, 0:1, :], axis=1)

        if classifier_activation:
            self._pooler_layer = tf.keras.layers.experimental.EinsumDense(
                'ab,bc->ac',
                output_shape=hidden_size,
                activation=tf.tanh,
                bias_axes='c',
                kernel_initializer=initializer,
                name='pooler')
            first_token = self._pooler_layer(first_token)
        else:
            self._pooler_layer = None

        outputs = dict(sequence_output=prev_output,
                       pooled_output=first_token,
                       encoder_outputs=all_layer_outputs,
                       attention_scores=all_attention_scores)

        super(MobileBERTEncoder, self).__init__(inputs=self.inputs,
                                                outputs=outputs,
                                                **kwargs)
Example #9
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            output_range=None,
            embedding_width=None,
            net2net_ratio=None,
            net2net_layers=None,
            lightatt_layers=None,
            input_pool_name=None,
            input_pool_size=None,
            **kwargs):
        """Bi-directional Transformer-based encoder network.

    This network implements a bi-directional Transformer-based encoder as
    described in "BERT: Pre-training of Deep Bidirectional Transformers for
    Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
    embedding lookups and transformer layers, but not the masked language model
    or classification task networks.

    The default values for this object are taken from the BERT-Base
    implementation
    in "BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding".

    Arguments:
      vocab_size: The size of the token vocabulary.
      hidden_size: The size of the transformer hidden layers.
      num_layers: The number of transformer layers.
      num_attention_heads: The number of attention heads for each transformer.
        The hidden size must be divisible by the number of attention heads.
      sequence_length: The sequence length that this encoder expects. If None,
        the sequence length is dynamic; if an integer, the encoder will require
        sequences padded to this length.
      max_sequence_length: The maximum sequence length that this encoder can
        consume. If None, max_sequence_length uses the value from sequence
        length. This determines the variable shape for positional embeddings.
      type_vocab_size: The number of types that the 'type_ids' input can take.
      intermediate_size: The intermediate size for the transformer layers.
      activation: The activation to use for the transformer layers.
      dropout_rate: The dropout rate to use for the transformer layers.
      attention_dropout_rate: The dropout rate to use for the attention layers
        within the transformer layers.
      initializer: The initializer to use for all weights in this encoder.
      return_all_encoder_outputs: Whether to output sequence embedding outputs
        of all encoder transformer layers.
      output_range: The sequence output range, [0, output_range), obtained by
        slicing the target sequence of the last transformer layer. `None` means
        the entire target sequence will attend to the source sequence, which
        yields the full output.
      embedding_width: The width of the word embeddings. If the embedding width
        is not equal to hidden size, embedding parameters will be factorized
        into two matrices in the shape of ['vocab_size', 'embedding_width'] and
        ['embedding_width', 'hidden_size'] ('embedding_width' is usually much
        smaller than 'hidden_size').
      net2net_ratio: Net2net ratio for the small fully connected matrices.
      net2net_layers: Number of layers with net2net treatment.
      lightatt_layers: Number of layers with light attention.
      input_pool_name: Name of the input pooling method, if any.
      input_pool_size: Pool size of the input pooling layer.
      **kwargs: Other keyword arguments.
    """
        super(TransformerEncoder, self).__init__()

        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self.net2net_ratio = net2net_ratio
        self.net2net_layers = net2net_layers
        self.lightatt_layers = lightatt_layers
        self.input_pool_name = input_pool_name
        self.input_pool_size = input_pool_size

        if embedding_width is None:
            embedding_width = hidden_size
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
            'output_range': output_range,
            'embedding_width': embedding_width,
            'net2net_ratio': net2net_ratio,
            'net2net_layers': net2net_layers,
            'lightatt_layers': lightatt_layers,
            'input_pool_name': input_pool_name,
            'input_pool_size': input_pool_size,
        }

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = position_embedding.PositionEmbedding(
            embed_dim=hidden_size,
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length,
            name='position_embedding')
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._dropout_layer = tf.keras.layers.Dropout(rate=dropout_rate)

        self._embedding_projection_layer = tf.keras.layers.experimental.EinsumDense(
            '...x,xy->...y',
            output_shape=hidden_size,
            bias_axes='y',
            kernel_initializer=initializer,
            name='embedding_projection')

        self._self_attention_mask_layer = layers.SelfAttentionMask()

        self._transformer_layers = []
        logging.info('Building transformer layers.')
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None

            group_size = num_layers // net2net_layers if net2net_layers is not None else None
            layer_net2net_ratio = None if (
                net2net_layers is None
                or i % group_size != 0) else net2net_ratio

            group_size = num_layers // lightatt_layers if lightatt_layers is not None else None
            use_lightatt = (lightatt_layers is not None
                            and i % group_size == group_size - 1)

            logging.info('layer %d: net2net_ratio=%s, use_lightatt=%s', i,
                         layer_net2net_ratio, use_lightatt)
            layer = transformer_layer.TransformerLayer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i,
                use_lightatt=use_lightatt,
                net2net_ratio=layer_net2net_ratio)
            self._transformer_layers.append(layer)
        logging.info('Finished building transformer layers.')

        self._squeeze_layer = tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')

        nocls = input_pool_name != 'concat'
        input_pool_size = 1 if input_pool_name is None else input_pool_size
        self._mask_resolution_layer = resolution_layer.MaskPoolLayer(
            input_pool_size, nocls=nocls, name='mask_resolution')
        self._embed_resolution_layer = resolution_layer.EmbedPoolLayer(
            hidden_size,
            input_pool_size,
            input_pool_name,
            name='embed_resolution')
Example #10
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            pool_type='max',
            pool_stride=2,
            unpool_length=0,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            embedding_layer=None,
            norm_first=False,
            **kwargs):
        super().__init__(**kwargs)
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        if isinstance(pool_stride, int):
            # TODO(b/197133196): Pooling layer can be shared.
            pool_strides = [pool_stride] * num_layers
        else:
            if len(pool_stride) != num_layers:
                raise ValueError(
                    'Lengths of pool_stride and num_layers are not equal.')
            pool_strides = pool_stride
        # TODO(crickwu): explore tf.keras.layers.serialize method.
        if pool_type == 'max':
            pool_cls = tf.keras.layers.MaxPooling1D
        elif pool_type == 'avg':
            pool_cls = tf.keras.layers.AveragePooling1D
        else:
            raise ValueError('pool_type not supported.')
        self._att_input_pool_layers = []
        for layer_pool_stride in pool_strides:
            att_input_pool_layer = pool_cls(pool_size=layer_pool_stride,
                                            strides=layer_pool_stride,
                                            padding='same',
                                            name='att_input_pool_layer')
            self._att_input_pool_layers.append(att_input_pool_layer)

        self._pool_strides = pool_strides  # This is a list here.
        self._unpool_length = unpool_length

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'pool_type': pool_type,
            'pool_stride': pool_stride,
            'unpool_length': unpool_length,
        }
Example #11
    def __init__(
            self,
            vocab_size: int,
            attention_window: Union[List[int], int] = 512,
            global_attention_size: int = 0,
            pad_token_id: int = 1,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: Callable[..., Any] = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            **kwargs):
        super().__init__(**kwargs)
        # Longformer args
        self._attention_window = attention_window
        self._global_attention_size = global_attention_size
        self._pad_token_id = pad_token_id

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = LongformerEncoderBlock(
                global_attention_size=global_attention_size,
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                attention_window=attention_window[i],
                layer_id=i,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=initializer,
                name=f'transformer/layer_{i}')
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'attention_window': attention_window,
            'global_attention_size': global_attention_size,
            'pad_token_id': pad_token_id,
        }
        self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                           input_mask=tf.keras.Input(shape=(None, ),
                                                     dtype=tf.int32),
                           input_type_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32))
Example #12
    def __init__(
            self,
            pooled_output_dim,
            pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            embedding_cls=None,
            embedding_cfg=None,
            embedding_data=None,
            num_hidden_instances=1,
            hidden_cls=layers.Transformer,
            hidden_cfg=None,
            return_all_layer_outputs=False,
            dict_outputs=False,
            **kwargs):
        self._self_setattr_tracking = False
        self._hidden_cls = hidden_cls
        self._hidden_cfg = hidden_cfg
        self._num_hidden_instances = num_hidden_instances
        self._pooled_output_dim = pooled_output_dim
        self._pooler_layer_initializer = pooler_layer_initializer
        self._embedding_cls = embedding_cls
        self._embedding_cfg = embedding_cfg
        self._embedding_data = embedding_data
        self._return_all_layer_outputs = return_all_layer_outputs
        self._dict_outputs = dict_outputs
        self._kwargs = kwargs

        if embedding_cls:
            if inspect.isclass(embedding_cls):
                self._embedding_network = embedding_cls(
                    **embedding_cfg) if embedding_cfg else embedding_cls()
            else:
                self._embedding_network = embedding_cls
            inputs = self._embedding_network.inputs
            embeddings, attention_mask = self._embedding_network(inputs)
        else:
            self._embedding_network = None
            seq_length = embedding_cfg.get('seq_length', None)
            word_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                             dtype=tf.int32,
                                             name='input_word_ids')
            mask = tf.keras.layers.Input(shape=(seq_length, ),
                                         dtype=tf.int32,
                                         name='input_mask')
            type_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                             dtype=tf.int32,
                                             name='input_type_ids')
            inputs = [word_ids, mask, type_ids]

            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=embedding_cfg['vocab_size'],
                embedding_width=embedding_cfg['hidden_size'],
                initializer=embedding_cfg['initializer'],
                name='word_embeddings')

            word_embeddings = self._embedding_layer(word_ids)

            # Always uses dynamic slicing for simplicity.
            self._position_embedding_layer = keras_nlp.PositionEmbedding(
                initializer=embedding_cfg['initializer'],
                max_length=embedding_cfg['max_seq_length'],
                name='position_embedding')
            position_embeddings = self._position_embedding_layer(
                word_embeddings)

            self._type_embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=embedding_cfg['type_vocab_size'],
                embedding_width=embedding_cfg['hidden_size'],
                initializer=embedding_cfg['initializer'],
                use_one_hot=True,
                name='type_embeddings')
            type_embeddings = self._type_embedding_layer(type_ids)

            embeddings = tf.keras.layers.Add()(
                [word_embeddings, position_embeddings, type_embeddings])

            self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
                name='embeddings/layer_norm',
                axis=-1,
                epsilon=1e-12,
                dtype=tf.float32)
            embeddings = self._embedding_norm_layer(embeddings)

            embeddings = (tf.keras.layers.Dropout(
                rate=embedding_cfg['dropout_rate'])(embeddings))

            attention_mask = layers.SelfAttentionMask()([embeddings, mask])

        data = embeddings

        layer_output_data = []
        self._hidden_layers = []
        for _ in range(num_hidden_instances):
            if inspect.isclass(hidden_cls):
                layer = hidden_cls(
                    **hidden_cfg) if hidden_cfg else hidden_cls()
            else:
                layer = hidden_cls
            data = layer([data, attention_mask])
            layer_output_data.append(data)
            self._hidden_layers.append(layer)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                layer_output_data[-1]))
        self._pooler_layer = tf.keras.layers.Dense(
            units=pooled_output_dim,
            activation='tanh',
            kernel_initializer=pooler_layer_initializer,
            name='cls_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        if dict_outputs:
            outputs = dict(
                sequence_output=layer_output_data[-1],
                pooled_output=cls_output,
                encoder_outputs=layer_output_data,
            )
        elif return_all_layer_outputs:
            outputs = [layer_output_data, cls_output]
        else:
            outputs = [layer_output_data[-1], cls_output]

        super(EncoderScaffold, self).__init__(inputs=inputs,
                                              outputs=outputs,
                                              **kwargs)

        logging.info('EncoderScaffold configs: %s', self.get_config())
Example #13
  def test_serialize_deserialize(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57

    # Build an embedding network to swap in for the default network. This one
    # will have 2 inputs (mask and word_ids) instead of 3, and won't use
    # positional embeddings.

    word_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
    mask = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_mask")
    embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=vocab_size,
        embedding_width=hidden_size,
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="word_embeddings")
    word_embeddings = embedding_layer(word_ids)
    attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])
    network = tf.keras.Model([word_ids, mask],
                             [word_embeddings, attention_mask])

    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
    }

    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        pooled_output_dim=hidden_size,
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cls=network,
        embedding_data=embedding_layer.embeddings)

    # Create another network object from the first object's config.
    new_network = encoder_scaffold.EncoderScaffold.from_config(
        test_network.get_config())

    # Validate that the config can be forced to JSON.
    _ = new_network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(test_network.get_config(), new_network.get_config())

    # Create a model based off of the old and new networks:
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    data, pooled = new_network([word_ids, mask])
    new_model = tf.keras.Model([word_ids, mask], [data, pooled])

    data, pooled = test_network([word_ids, mask])
    model = tf.keras.Model([word_ids, mask], [data, pooled])

    # Copy the weights between models.
    new_model.set_weights(model.get_weights())

    # Invoke the models.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    data, cls = model.predict([word_id_data, mask_data])
    new_data, new_cls = new_model.predict([word_id_data, mask_data])

    # The output should be equal.
    self.assertAllEqual(data, new_data)
    self.assertAllEqual(cls, new_cls)

    # We should not be able to get a reference to the embedding data.
    with self.assertRaisesRegex(RuntimeError, ".*does not have a reference.*"):
      new_network.get_embedding_table()
Example #14
    def __init__(
            self,
            vocab_size,
            embedding_width=128,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'embedding_width': embedding_width,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
        }

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = keras_nlp.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))
        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embeddings = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')(embeddings)

        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        shared_layer = keras_nlp.TransformerEncoderBlock(
            num_attention_heads=num_attention_heads,
            inner_dim=intermediate_size,
            inner_activation=activation,
            output_dropout=dropout_rate,
            attention_dropout=attention_dropout_rate,
            kernel_initializer=initializer,
            name='transformer')
        for _ in range(num_layers):
            data = shared_layer([data, attention_mask])

        first_token_tensor = (tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        super(AlbertTransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=[data, cls_output],
                             **kwargs)
Example #15
    def __init__(
            self,
            vocab_size: int,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: _Activation = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            pool_type: str = _MAX,
            pool_stride: int = 2,
            unpool_length: int = 0,
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            transformer_cls: Union[
                str, tf.keras.layers.Layer] = layers.TransformerEncoderBlock,
            share_rezero: bool = True,
            **kwargs):
        super().__init__(**kwargs)
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=tf_utils.clone_initializer(initializer),
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=tf_utils.clone_initializer(initializer),
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=tf_utils.clone_initializer(initializer),
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=tf_utils.clone_initializer(initializer),
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        # Will raise an error if the string is not supported.
        if isinstance(transformer_cls, str):
            transformer_cls = _str2transformer_cls[transformer_cls]
        for i in range(num_layers):
            layer = transformer_cls(
                num_attention_heads=num_attention_heads,
                intermediate_size=inner_dim,
                inner_dim=inner_dim,
                intermediate_activation=inner_activation,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=tf_utils.clone_initializer(initializer),
                share_rezero=share_rezero,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=tf_utils.clone_initializer(initializer),
            name='pooler_transform')
        if isinstance(pool_stride, int):
            # TODO(b/197133196): Pooling layer can be shared.
            pool_strides = [pool_stride] * num_layers
        else:
            if len(pool_stride) != num_layers:
                raise ValueError(
                    'Lengths of pool_stride and num_layers are not equal.')
            pool_strides = pool_stride
        # TODO(crickwu): explore tf.keras.layers.serialize method.
        if pool_type == _MAX:
            pool_cls = tf.keras.layers.MaxPooling1D
        elif pool_type == _AVG:
            pool_cls = tf.keras.layers.AveragePooling1D
        elif pool_type == _TRUNCATED_AVG:
            # TODO(b/203665205): unpool_length should be implemented.
            if unpool_length != 0:
                raise ValueError(
                    'unpool_length is not yet supported by truncated_avg.')
        else:
            raise ValueError('pool_type not supported.')

        if pool_type in (_MAX, _AVG):
            self._att_input_pool_layers = []
            for layer_pool_stride in pool_strides:
                att_input_pool_layer = pool_cls(pool_size=layer_pool_stride,
                                                strides=layer_pool_stride,
                                                padding='same',
                                                name='att_input_pool_layer')
                self._att_input_pool_layers.append(att_input_pool_layer)

        self._max_sequence_length = max_sequence_length
        self._pool_strides = pool_strides  # This is a list here.
        self._unpool_length = unpool_length
        self._pool_type = pool_type

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'pool_type': pool_type,
            'pool_stride': pool_stride,
            'unpool_length': unpool_length,
            'transformer_cls': _transformer_cls2str.get(transformer_cls,
                                                        str(transformer_cls)),
        }

        self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                           input_mask=tf.keras.Input(shape=(None, ),
                                                     dtype=tf.int32),
                           input_type_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32))
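The constructor above normalizes pool_stride so that a single int is broadcast to every layer, while a per-layer sequence must match num_layers exactly. A minimal standalone sketch of that normalization (the helper name is hypothetical, not part of the original code):

def normalize_pool_strides(pool_stride, num_layers):
    """Returns a per-layer stride list, mirroring the constructor logic above."""
    if isinstance(pool_stride, int):
        return [pool_stride] * num_layers
    if len(pool_stride) != num_layers:
        raise ValueError('Lengths of pool_stride and num_layers are not equal.')
    return list(pool_stride)

assert normalize_pool_strides(2, 4) == [2, 2, 2, 2]
assert normalize_pool_strides([1, 2, 2, 4], 4) == [1, 2, 2, 4]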
Example #16
0
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            float_dtype='float32',
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'float_dtype': float_dtype,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate,
                                              dtype=tf.float32)(embeddings))

        if float_dtype == 'float16':
            embeddings = tf.cast(embeddings, tf.float16)

        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        for i in range(num_layers):
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                kernel_initializer=initializer,
                dtype=float_dtype,
                name='transformer/layer_%d' % i)
            data = layer([data, attention_mask])

        first_token_tensor = (tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        super(TransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=[data, cls_output],
                             **kwargs)
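Assuming the TransformerEncoder class above is importable (its module path is not shown in this snippet, so the import below is an assumption), a minimal smoke-test invocation with dummy data might look like this; the constructor arguments follow the signature above.

import numpy as np

# Hypothetical import path; substitute wherever TransformerEncoder is defined.
# from modeling.networks import TransformerEncoder

encoder = TransformerEncoder(
    vocab_size=100,
    hidden_size=32,
    num_layers=2,
    num_attention_heads=2,
    sequence_length=16,
    intermediate_size=64)
word_ids = np.random.randint(100, size=(2, 16), dtype='int32')
mask = np.ones((2, 16), dtype='int32')
type_ids = np.zeros((2, 16), dtype='int32')
sequence_output, cls_output = encoder([word_ids, mask, type_ids])
# sequence_output: (2, 16, 32), cls_output: (2, 32)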
Example #17
0
  def __init__(
      self,
      num_output_classes,
      classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=0.02),
      embedding_cls=None,
      embedding_cfg=None,
      embedding_data=None,
      num_hidden_instances=1,
      hidden_cls=layers.Transformer,
      hidden_cfg=None,
      **kwargs):
    self._self_setattr_tracking = False
    self._hidden_cls = hidden_cls
    self._hidden_cfg = hidden_cfg
    self._num_hidden_instances = num_hidden_instances
    self._num_output_classes = num_output_classes
    self._classification_layer_initializer = classification_layer_initializer
    self._embedding_cls = embedding_cls
    self._embedding_cfg = embedding_cfg
    self._embedding_data = embedding_data
    self._kwargs = kwargs

    if embedding_cls:
      if inspect.isclass(embedding_cls):
        self._embedding_network = embedding_cls(embedding_cfg)
      else:
        self._embedding_network = embedding_cls
      inputs = self._embedding_network.inputs
      embeddings, mask = self._embedding_network(inputs)
    else:
      self._embedding_network = None
      word_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_word_ids')
      mask = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_mask')
      type_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_type_ids')
      inputs = [word_ids, mask, type_ids]

      self._embedding_layer = layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],
          name='word_embeddings')

      word_embeddings = self._embedding_layer(word_ids)

      # Always uses dynamic slicing for simplicity.
      self._position_embedding_layer = layers.PositionEmbedding(
          initializer=embedding_cfg['initializer'],
          use_dynamic_slicing=True,
          max_sequence_length=embedding_cfg['max_seq_length'])
      position_embeddings = self._position_embedding_layer(word_embeddings)

      type_embeddings = (
          layers.OnDeviceEmbedding(
              vocab_size=embedding_cfg['type_vocab_size'],
              embedding_width=embedding_cfg['hidden_size'],
              initializer=embedding_cfg['initializer'],
              use_one_hot=True,
              name='type_embeddings')(type_ids))

      embeddings = tf.keras.layers.Add()(
          [word_embeddings, position_embeddings, type_embeddings])
      embeddings = (
          tf.keras.layers.LayerNormalization(
              name='embeddings/layer_norm',
              axis=-1,
              epsilon=1e-12,
              dtype=tf.float32)(embeddings))
      embeddings = (
          tf.keras.layers.Dropout(
              rate=embedding_cfg['dropout_rate'])(embeddings))

    attention_mask = layers.SelfAttentionMask()([embeddings, mask])
    data = embeddings

    for _ in range(num_hidden_instances):
      if inspect.isclass(hidden_cls):
        layer = self._hidden_cls(**hidden_cfg)
      else:
        layer = self._hidden_cls
      data = layer([data, attention_mask])

    first_token_tensor = (
        tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data)
    )
    cls_output = tf.keras.layers.Dense(
        units=num_output_classes,
        activation='tanh',
        kernel_initializer=classification_layer_initializer,
        name='cls_transform')(
            first_token_tensor)

    super(EncoderScaffold, self).__init__(
        inputs=inputs, outputs=[data, cls_output], **kwargs)
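The scaffold above treats hidden_cls as either a class (a fresh layer is built from hidden_cfg for every hidden instance) or an already-built layer (the same instance, and therefore the same weights, is reused). A self-contained sketch of that dispatch with a stand-in Dense layer; the helper name is illustrative only.

import inspect
import tensorflow as tf

def build_stack(hidden_cls, hidden_cfg, num_hidden_instances):
    """Mirrors the class-or-instance dispatch in the loop above."""
    stack = []
    for _ in range(num_hidden_instances):
        if inspect.isclass(hidden_cls):
            stack.append(hidden_cls(**hidden_cfg))  # new weights per instance
        else:
            stack.append(hidden_cls)  # one shared instance, shared weights
    return stack

per_layer = build_stack(tf.keras.layers.Dense, {'units': 8}, 3)
shared = build_stack(tf.keras.layers.Dense(8), {}, 3)
assert len(set(map(id, per_layer))) == 3
assert len(set(map(id, shared))) == 1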
Example #18
0
    def __init__(
            self,
            vocab_size,
            hidden_size=768,  # FIXME: hidden_size per head should be even!
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            embedding_layer=None,
            norm_first=False,
            **kwargs):
        if 'intermediate_size' in kwargs:
            inner_dim = kwargs['intermediate_size']
            del kwargs['intermediate_size']
        if 'activation' in kwargs:
            inner_activation = kwargs['activation']
            del kwargs['activation']
        if 'dropout_rate' in kwargs:
            output_dropout = kwargs['dropout_rate']
            del kwargs['dropout_rate']
        if 'attention_dropout_rate' in kwargs:
            attention_dropout = kwargs['attention_dropout_rate']
            del kwargs['attention_dropout_rate']

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            embedding_layer_inst = layers.on_device_embedding.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            embedding_layer_inst = embedding_layer
        word_embeddings = embedding_layer_inst(word_ids)

        # Roformer does not need a position embedding layer
        type_embedding_layer = layers.on_device_embedding.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = type_embedding_layer(type_ids)

        # Roformer does not have absolute position embedding
        embeddings = tf.keras.layers.Add()([word_embeddings, type_embeddings])

        embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        embeddings = embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = embedding_projection(embeddings)
        else:
            embedding_projection = None

        transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = roformer_encoder_block.RoformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                q_max_sequence_length=max_sequence_length,
                kv_max_sequence_length=max_sequence_length,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='roformer/layer_%d' % i)
            transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        last_encoder_output = encoder_outputs[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_encoder_output[:, 0, :]
        pooler_layer = tf.keras.layers.Dense(units=hidden_size,
                                             activation='tanh',
                                             kernel_initializer=initializer,
                                             name='pooler_transform')
        cls_output = pooler_layer(first_token_tensor)

        outputs = dict(
            sequence_output=encoder_outputs[-1],
            pooled_output=cls_output,
            encoder_outputs=encoder_outputs,
        )

        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super(RoformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=outputs,
                             **kwargs)

        config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
        }

        # We are storing the config dict as a namedtuple here to ensure checkpoint
        # compatibility with an earlier version of this model which did not track
        # the config dict attribute. TF does not track immutable attrs which
        # do not contain Trackables, so by creating a config namedtuple instead of
        # a dict we avoid tracking it.
        config_cls = collections.namedtuple('Config', config_dict.keys())
        self._config = config_cls(**config_dict)
        self._pooler_layer = pooler_layer
        self._transformer_layers = transformer_layers
        self._embedding_norm_layer = embedding_norm_layer
        self._embedding_layer = embedding_layer_inst
        # Roformer has no absolute position embedding layer.
        self._position_embedding_layer = None
        self._type_embedding_layer = type_embedding_layer
        if embedding_projection is not None:
            self._embedding_projection = embedding_projection
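The encoder above skips absolute position embeddings because RoformerEncoderBlock injects position information through rotary embeddings inside attention, which is also why the per-head size should be even. The NumPy sketch below illustrates the rotary rotation itself; it is an illustration of the general technique, not the exact implementation used by RoformerEncoderBlock.

import numpy as np

def apply_rotary(x, base=10000.0):
    """Rotary position embedding for x of shape [batch, seq, dim] with even dim."""
    _, seq, dim = x.shape
    assert dim % 2 == 0, 'rotary embedding needs an even feature dimension'
    half = dim // 2
    inv_freq = 1.0 / (base ** (np.arange(half) / half))   # [half]
    angles = np.arange(seq)[:, None] * inv_freq[None, :]  # [seq, half]
    cos, sin = np.cos(angles), np.sin(angles)
    x_even, x_odd = x[..., 0::2], x[..., 1::2]
    out = np.empty_like(x)
    out[..., 0::2] = x_even * cos - x_odd * sin
    out[..., 1::2] = x_even * sin + x_odd * cos
    return out

x = np.random.randn(2, 4, 8)
print(apply_rotary(x).shape)  # (2, 4, 8)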
Example #19
0
    def __init__(
            self,
            vocab_size: int,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: _Activation = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            token_loss_init_value: float = 10.0,
            token_loss_beta: float = 0.995,
            token_keep_k: int = 256,
            token_allow_list: Tuple[int, ...] = (100, 101, 102, 103),
            token_deny_list: Tuple[int, ...] = (0, ),
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            with_dense_inputs: bool = False,
            **kwargs):
        # Pops kwargs that are used in V1 implementation.
        if 'dict_outputs' in kwargs:
            kwargs.pop('dict_outputs')
        if 'return_all_encoder_outputs' in kwargs:
            kwargs.pop('return_all_encoder_outputs')
        if 'intermediate_size' in kwargs:
            inner_dim = kwargs.pop('intermediate_size')
        if 'activation' in kwargs:
            inner_activation = kwargs.pop('activation')
        if 'dropout_rate' in kwargs:
            output_dropout = kwargs.pop('dropout_rate')
        if 'attention_dropout_rate' in kwargs:
            attention_dropout = kwargs.pop('attention_dropout_rate')
        super().__init__(**kwargs)

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=tf_utils.clone_initializer(initializer),
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=tf_utils.clone_initializer(initializer),
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=tf_utils.clone_initializer(initializer),
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=tf_utils.clone_initializer(initializer),
                name='embedding_projection')

        # The first 999 tokens are special tokens such as [PAD], [CLS], [SEP].
        # We want to always mask [PAD] and never mask [CLS] or [SEP].
        init_importance = tf.constant(token_loss_init_value,
                                      shape=(vocab_size,))
        if token_allow_list:
            init_importance = tf.tensor_scatter_nd_update(
                tensor=init_importance,
                indices=[[x] for x in token_allow_list],
                updates=[1.0e4 for x in token_allow_list])
        if token_deny_list:
            init_importance = tf.tensor_scatter_nd_update(
                tensor=init_importance,
                indices=[[x] for x in token_deny_list],
                updates=[-1.0e4 for x in token_deny_list])
        self._token_importance_embed = layers.TokenImportanceWithMovingAvg(
            vocab_size=vocab_size,
            init_importance=init_importance,
            moving_average_beta=token_loss_beta)

        self._token_separator = layers.SelectTopK(top_k=token_keep_k)
        self._transformer_layers = []
        self._num_layers = num_layers
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=tf_utils.clone_initializer(initializer),
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=tf_utils.clone_initializer(initializer),
            name='pooler_transform')

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'token_loss_init_value': token_loss_init_value,
            'token_loss_beta': token_loss_beta,
            'token_keep_k': token_keep_k,
            'token_allow_list': token_allow_list,
            'token_deny_list': token_deny_list,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'with_dense_inputs': with_dense_inputs,
        }
        if with_dense_inputs:
            self.inputs = dict(
                input_word_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_inputs=tf.keras.Input(shape=(None, embedding_width),
                                            dtype=tf.float32),
                dense_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
            )
        else:
            self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                             dtype=tf.int32),
                               input_mask=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                               input_type_ids=tf.keras.Input(shape=(None, ),
                                                             dtype=tf.int32))
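The scatter updates above seed per-token importance so that allow-listed special tokens are effectively always kept by SelectTopK and deny-listed ones (e.g. [PAD]) are effectively always dropped. The same initialization on a toy vocabulary, as a standalone snippet:

import tensorflow as tf

vocab_size = 10
token_allow_list = (1, 2)  # stand-ins for [CLS], [SEP]
token_deny_list = (0,)     # stand-in for [PAD]

init_importance = tf.constant(10.0, shape=(vocab_size,))
init_importance = tf.tensor_scatter_nd_update(
    tensor=init_importance,
    indices=[[x] for x in token_allow_list],
    updates=[1.0e4 for _ in token_allow_list])   # huge importance: always kept
init_importance = tf.tensor_scatter_nd_update(
    tensor=init_importance,
    indices=[[x] for x in token_deny_list],
    updates=[-1.0e4 for _ in token_deny_list])   # huge negative: always dropped
print(init_importance.numpy())  # [-1e4, 1e4, 1e4, 10, 10, ..., 10]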
Example #20
0
    def __init__(self,
                 network,
                 bert_config,
                 initializer='glorot_uniform',
                 seq_length=128,
                 use_pointing=True,
                 is_training=True):
        """Creates Felix Tagger.

    Setting up all of the layers needed for call.

    Args:
      network: An encoder network, which should output a sequence of hidden
               states.
      bert_config: A config file which in addition to the  BertConfig values
      also includes: num_classes, hidden_dropout_prob, and query_transformer.
      initializer: The initializer (if any) to use in the classification
                   networks. Defaults to a Glorot uniform initializer.
      seq_length:  Maximum sequence length.
      use_pointing: Whether a pointing network is used.
      is_training: The model is being trained.
    """

        super(FelixTagger, self).__init__()
        self._network = network
        self._seq_length = seq_length
        self._bert_config = bert_config
        self._use_pointing = use_pointing
        self._is_training = is_training

        self._tag_logits_layer = tf.keras.layers.Dense(
            self._bert_config.num_classes)
        if not self._use_pointing:
            return

        # An arbitrary heuristic: tag embedding width is ceil(sqrt(num_classes)).
        self._tag_embedding_layer = tf.keras.layers.Embedding(
            self._bert_config.num_classes,
            int(math.ceil(math.sqrt(self._bert_config.num_classes))),
            input_length=seq_length)

        self._position_embedding_layer = layers.PositionEmbedding(
            max_length=seq_length)
        self._edit_tagged_sequence_output_layer = tf.keras.layers.Dense(
            self._bert_config.hidden_size, activation=activations.gelu)

        if self._bert_config.query_transformer:
            self._self_attention_mask_layer = layers.SelfAttentionMask()
            self._transformer_query_layer = layers.TransformerEncoderBlock(
                num_attention_heads=self._bert_config.num_attention_heads,
                inner_dim=self._bert_config.intermediate_size,
                inner_activation=activations.gelu,
                output_dropout=self._bert_config.hidden_dropout_prob,
                attention_dropout=self._bert_config.hidden_dropout_prob,
                output_range=seq_length,
            )

        self._query_embeddings_layer = tf.keras.layers.Dense(
            self._bert_config.query_size)

        self._key_embeddings_layer = tf.keras.layers.Dense(
            self._bert_config.query_size)
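The tag embedding above sizes its width with a square-root heuristic on the number of tag classes. A compact sketch of just that sizing rule (the helper name is hypothetical):

import math
import tensorflow as tf

def make_tag_embedding(num_classes, seq_length):
    """Embedding whose width is ceil(sqrt(num_classes)), as in the tagger above."""
    width = int(math.ceil(math.sqrt(num_classes)))
    return tf.keras.layers.Embedding(num_classes, width, input_length=seq_length)

layer = make_tag_embedding(num_classes=37, seq_length=128)
print(layer(tf.zeros((2, 128), dtype=tf.int32)).shape)  # (2, 128, 7)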
Example #21
0
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            output_range=None,
            embedding_width=None,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
            'output_range': output_range,
            'embedding_width': embedding_width,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = self._type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = self._embedding_projection(embeddings)

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                encoder_outputs[-1]))
        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        if return_all_encoder_outputs:
            outputs = [encoder_outputs, cls_output]
        else:
            outputs = [encoder_outputs[-1], cls_output]

        super(TransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=outputs,
                             **kwargs)
Example #22
0
    def test_network_invocation(self):
        hidden_size = 32
        sequence_length = 21
        vocab_size = 57

        # Build an embedding network to swap in for the default network. This one
        # will have 2 inputs (mask and word_ids) instead of 3, and won't use
        # positional embeddings.

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name="input_word_ids")
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name="input_mask")
        embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            name="word_embeddings")
        word_embeddings = embedding_layer(word_ids)
        attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])
        network = tf.keras.Model([word_ids, mask],
                                 [word_embeddings, attention_mask])

        hidden_cfg = {
            "num_attention_heads": 2,
            "intermediate_size": 3072,
            "intermediate_activation": activations.gelu,
            "dropout_rate": 0.1,
            "attention_dropout_rate": 0.1,
            "kernel_initializer": tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
        }

        # Create a small EncoderScaffold for testing.
        test_network = encoder_scaffold.EncoderScaffold(
            num_hidden_instances=3,
            pooled_output_dim=hidden_size,
            pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            hidden_cfg=hidden_cfg,
            embedding_cls=network,
            embedding_data=embedding_layer.embeddings)

        # Create the inputs (note that the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        data, pooled = test_network([word_ids, mask])

        # Create a model based off of this network:
        model = tf.keras.Model([word_ids, mask], [data, pooled])

        # Invoke the model. We can't validate the output data here (the model is too
        # complex) but this will catch structural runtime errors.
        batch_size = 3
        word_id_data = np.random.randint(vocab_size,
                                         size=(batch_size, sequence_length))
        mask_data = np.random.randint(2, size=(batch_size, sequence_length))
        _ = model.predict([word_id_data, mask_data])

        # Test that we can get the embedding data that we passed to the object. This
        # is necessary to support standard language model training.
        self.assertIs(embedding_layer.embeddings,
                      test_network.get_embedding_table())
Example #23
0
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_sizes=(16, ),
            num_float_features=0,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            bert_init_ckpt=None,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        num_type_features = len(type_vocab_sizes)
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_sizes': type_vocab_sizes,
            'num_type_features': num_type_features,
            'num_float_features': num_float_features,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        all_inputs = [word_ids, mask]
        if num_type_features:
            type_ids = tf.keras.layers.Input(shape=(sequence_length,
                                                    num_type_features),
                                             dtype=tf.int32,
                                             name='input_type_ids')
            all_inputs.append(type_ids)
        if num_float_features:
            float_features = tf.keras.layers.Input(shape=(sequence_length,
                                                          num_float_features),
                                                   dtype=tf.float32,
                                                   name='float_features')
            all_inputs.append(float_features)

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = modeling.layers.PositionEmbedding(
            initializer=initializer, max_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        all_embeddings = [word_embeddings, position_embeddings]

        if num_type_features:
            type_embeddings = [(layers.OnDeviceEmbedding(
                vocab_size=type_vocab_sizes[idx],
                embedding_width=hidden_size,
                initializer=initializer,
                use_one_hot=True,
                name='type_embeddings_{}'.format(idx))(type_ids[..., idx]))
                               for idx in range(num_type_features)]
            all_embeddings += type_embeddings

        if num_float_features:
            float_embeddings = [
                (
                    tf.keras.layers.Dense(
                        hidden_size, name='float_features_{}'.format(idx))(
                            # Expanding the last dim here is important.
                            float_features[..., idx, None]))
                for idx in range(num_float_features)
            ]
            all_embeddings += float_embeddings

        embeddings = tf.keras.layers.Add()(all_embeddings)
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        encoder_outputs = []
        for i in range(num_layers):
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                kernel_initializer=initializer,
                name='model/layer_with_weights-%d' % (i + 4))
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                encoder_outputs[-1]))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        if return_all_encoder_outputs:
            outputs = [encoder_outputs, cls_output]
        else:
            outputs = [encoder_outputs[-1], cls_output]
        super(TransformerEncoder, self).__init__(inputs=all_inputs,
                                                 outputs=outputs,
                                                 **kwargs)

        if bert_init_ckpt and learner_flags.INIT_CHECKPOINT.value is None:
            self.init_weights(bert_init_ckpt)
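The float-feature branch above embeds each scalar feature by expanding its last axis to [batch, seq, 1] and applying a per-feature Dense projection to hidden_size before summing. The same pattern in isolation, with toy shapes:

import tensorflow as tf

batch, seq_len, num_float_features, hidden_size = 2, 8, 3, 16
float_features = tf.random.uniform((batch, seq_len, num_float_features))

float_embeddings = [
    tf.keras.layers.Dense(hidden_size, name='float_features_%d' % idx)(
        float_features[..., idx, None])  # expand last dim -> [batch, seq, 1]
    for idx in range(num_float_features)
]
summed = tf.keras.layers.Add()(float_embeddings)
print(summed.shape)  # (2, 8, 16)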
Example #24
0
    def __init__(
            self,
            vocab_size: int,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: _Activation = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            with_dense_inputs: bool = False,
            **kwargs):
        # Pops kwargs that are used in V1 implementation.
        if 'dict_outputs' in kwargs:
            kwargs.pop('dict_outputs')
        if 'return_all_encoder_outputs' in kwargs:
            kwargs.pop('return_all_encoder_outputs')
        if 'intermediate_size' in kwargs:
            inner_dim = kwargs.pop('intermediate_size')
        if 'activation' in kwargs:
            inner_activation = kwargs.pop('activation')
        if 'dropout_rate' in kwargs:
            output_dropout = kwargs.pop('dropout_rate')
        if 'attention_dropout_rate' in kwargs:
            attention_dropout = kwargs.pop('attention_dropout_rate')
        super().__init__(**kwargs)

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'with_dense_inputs': with_dense_inputs,
        }
        if with_dense_inputs:
            self.inputs = dict(
                input_word_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_inputs=tf.keras.Input(shape=(None, embedding_width),
                                            dtype=tf.float32),
                dense_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
            )
        else:
            self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                             dtype=tf.int32),
                               input_mask=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                               input_type_ids=tf.keras.Input(shape=(None, ),
                                                             dtype=tf.int32))
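The constructor above begins by remapping legacy keyword names (intermediate_size, activation, dropout_rate, attention_dropout_rate) onto the newer inner_dim/inner_activation/output_dropout/attention_dropout arguments and by discarding V1-only flags before calling super(). A standalone sketch of that shim as a plain function (the function name is hypothetical):

def remap_legacy_kwargs(kwargs, inner_dim, inner_activation, output_dropout,
                        attention_dropout):
    """Mirrors the V1-to-V2 keyword-compatibility handling above."""
    kwargs.pop('dict_outputs', None)
    kwargs.pop('return_all_encoder_outputs', None)
    inner_dim = kwargs.pop('intermediate_size', inner_dim)
    inner_activation = kwargs.pop('activation', inner_activation)
    output_dropout = kwargs.pop('dropout_rate', output_dropout)
    attention_dropout = kwargs.pop('attention_dropout_rate', attention_dropout)
    return kwargs, inner_dim, inner_activation, output_dropout, attention_dropout

kwargs, inner_dim, *_ = remap_legacy_kwargs(
    {'intermediate_size': 1024, 'dict_outputs': True}, 3072, 'gelu', 0.1, 0.1)
assert inner_dim == 1024 and not kwargs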
Example #25
0
    def __init__(
            self,
            vocab_size,
            embedding_width=128,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            dict_outputs=False,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))
        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embeddings = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')(embeddings)

        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        shared_layer = layers.TransformerEncoderBlock(
            num_attention_heads=num_attention_heads,
            inner_dim=intermediate_size,
            inner_activation=activation,
            output_dropout=dropout_rate,
            attention_dropout=attention_dropout_rate,
            kernel_initializer=initializer,
            name='transformer')
        encoder_outputs = []
        for _ in range(num_layers):
            data = shared_layer([data, attention_mask])
            encoder_outputs.append(data)

        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = data[:, 0, :]
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)
        if dict_outputs:
            outputs = dict(
                sequence_output=data,
                encoder_outputs=encoder_outputs,
                pooled_output=cls_output,
            )
        else:
            outputs = [data, cls_output]

        # b/164516224
        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super(AlbertEncoder, self).__init__(inputs=[word_ids, mask, type_ids],
                                            outputs=outputs,
                                            **kwargs)
        config_dict = {
            'vocab_size': vocab_size,
            'embedding_width': embedding_width,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
        }

        # We are storing the config dict as a namedtuple here to ensure checkpoint
        # compatibility with an earlier version of this model which did not track
        # the config dict attribute. TF does not track immutable attrs which
        # do not contain Trackables, so by creating a config namedtuple instead of
        # a dict we avoid tracking it.
        config_cls = collections.namedtuple('Config', config_dict.keys())
        self._config = config_cls(**config_dict)
        self._embedding_layer = embedding_layer
        self._position_embedding_layer = position_embedding_layer
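The distinctive piece of this ALBERT-style encoder is that a single `shared_layer` is built once and then applied `num_layers` times, so every transformer block reuses the same weights. Below is a minimal sketch of that idea using plain `tf.keras.layers.Dense` stand-ins with illustrative sizes (none of these names come from the snippet above); reusing one layer instance keeps the parameter count constant no matter how deep the stack gets.

import tensorflow as tf

dim, num_layers = 8, 4
inputs = tf.keras.Input(shape=(None, dim))

# Shared stack: one layer object called repeatedly, so its weights are
# created (and counted) only once.
shared_block = tf.keras.layers.Dense(dim, name='shared_block')
x = inputs
for _ in range(num_layers):
    x = shared_block(x)
shared_model = tf.keras.Model(inputs, x)

# Unshared stack: a fresh layer per step, so parameters grow linearly
# with depth.
y = inputs
for i in range(num_layers):
    y = tf.keras.layers.Dense(dim, name='block_%d' % i)(y)
unshared_model = tf.keras.Model(inputs, y)

print(shared_model.count_params())    # 72  = 8*8 + 8
print(unshared_model.count_params())  # 288 = 4 * 72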
Example #26
    embeddings = (
        tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = self._embedding_projection(embeddings)

    self._transformer_layers = []
    data = embeddings
    attention_mask = layers.SelfAttentionMask()([data, mask])
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = layers.Transformer(
          num_attention_heads=num_attention_heads,
          intermediate_size=intermediate_size,
          intermediate_activation=activation,
          dropout_rate=dropout_rate,
          attention_dropout_rate=attention_dropout_rate,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
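The `embedding_projection` branch in this fragment is the factorized-embedding trick: word embeddings can be kept at a smaller `embedding_width` and are mapped up to `hidden_size` with a single einsum contraction. Here is a minimal sketch of what the `'...x,xy->...y'` equation computes, using random placeholder tensors and illustrative shapes that are not taken from the snippet:

import tensorflow as tf

batch_size, seq_length = 2, 16
embedding_width, hidden_size = 128, 768

embeddings = tf.random.normal([batch_size, seq_length, embedding_width])
kernel = tf.random.normal([embedding_width, hidden_size])
bias = tf.zeros([hidden_size])

# '...x,xy->...y': contract the last axis (x = embedding_width) against the
# kernel to produce y = hidden_size, leaving the leading axes untouched.
projected = tf.einsum('...x,xy->...y', embeddings, kernel) + bias
print(projected.shape)  # (2, 16, 768)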
Example #27
  def __init__(self,
               pooled_output_dim,
               pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
                   stddev=0.02),
               embedding_cls=None,
               embedding_cfg=None,
               embedding_data=None,
               num_hidden_instances=1,
               hidden_cls=layers.Transformer,
               hidden_cfg=None,
               layer_norm_before_pooling=False,
               return_all_layer_outputs=False,
               dict_outputs=False,
               **kwargs):

    if embedding_cls:
      if inspect.isclass(embedding_cls):
        embedding_network = embedding_cls(
            **embedding_cfg) if embedding_cfg else embedding_cls()
      else:
        embedding_network = embedding_cls
      inputs = embedding_network.inputs
      embeddings, attention_mask = embedding_network(inputs)
      embedding_layer = None
      position_embedding_layer = None
      type_embedding_layer = None
      embedding_norm_layer = None
    else:
      embedding_network = None
      seq_length = embedding_cfg.get('seq_length', None)
      word_ids = tf.keras.layers.Input(
          shape=(seq_length,), dtype=tf.int32, name='input_word_ids')
      mask = tf.keras.layers.Input(
          shape=(seq_length,), dtype=tf.int32, name='input_mask')
      type_ids = tf.keras.layers.Input(
          shape=(seq_length,), dtype=tf.int32, name='input_type_ids')
      inputs = [word_ids, mask, type_ids]

      embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],
          name='word_embeddings')

      word_embeddings = embedding_layer(word_ids)

      # Always uses dynamic slicing for simplicity.
      position_embedding_layer = keras_nlp.layers.PositionEmbedding(
          initializer=embedding_cfg['initializer'],
          max_length=embedding_cfg['max_seq_length'],
          name='position_embedding')
      position_embeddings = position_embedding_layer(word_embeddings)

      type_embedding_layer = keras_nlp.layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['type_vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],
          use_one_hot=True,
          name='type_embeddings')
      type_embeddings = type_embedding_layer(type_ids)

      embeddings = tf.keras.layers.Add()(
          [word_embeddings, position_embeddings, type_embeddings])

      embedding_norm_layer = tf.keras.layers.LayerNormalization(
          name='embeddings/layer_norm',
          axis=-1,
          epsilon=1e-12,
          dtype=tf.float32)
      embeddings = embedding_norm_layer(embeddings)

      embeddings = (
          tf.keras.layers.Dropout(
              rate=embedding_cfg['dropout_rate'])(embeddings))

      attention_mask = layers.SelfAttentionMask()([embeddings, mask])

    data = embeddings

    layer_output_data = []
    hidden_layers = []
    for _ in range(num_hidden_instances):
      if inspect.isclass(hidden_cls):
        layer = hidden_cls(**hidden_cfg) if hidden_cfg else hidden_cls()
      else:
        layer = hidden_cls
      data = layer([data, attention_mask])
      layer_output_data.append(data)
      hidden_layers.append(layer)

    if layer_norm_before_pooling:
      # Normalize the final output.
      output_layer_norm = tf.keras.layers.LayerNormalization(
          name='final_layer_norm',
          axis=-1,
          epsilon=1e-12)
      layer_output_data[-1] = output_layer_norm(layer_output_data[-1])

    last_layer_output = layer_output_data[-1]
    # Applying a tf.slice op (through subscript notation) to a Keras tensor
    # like this will create a SliceOpLambda layer. This is better than a Lambda
    # layer with Python code, because that is fundamentally less portable.
    first_token_tensor = last_layer_output[:, 0, :]
    pooler_layer = tf.keras.layers.Dense(
        units=pooled_output_dim,
        activation='tanh',
        kernel_initializer=pooler_layer_initializer,
        name='cls_transform')
    cls_output = pooler_layer(first_token_tensor)

    if dict_outputs:
      outputs = dict(
          sequence_output=layer_output_data[-1],
          pooled_output=cls_output,
          encoder_outputs=layer_output_data,
      )
    elif return_all_layer_outputs:
      outputs = [layer_output_data, cls_output]
    else:
      outputs = [layer_output_data[-1], cls_output]

    # b/164516224
    # Once we've created the network using the Functional API, we call
    # super().__init__ as though we were invoking the Functional API Model
    # constructor, resulting in this object having all the properties of a model
    # created using the Functional API. Once super().__init__ is called, we
    # can assign attributes to `self` - note that all `self` assignments are
    # below this line.
    super(EncoderScaffold, self).__init__(
        inputs=inputs, outputs=outputs, **kwargs)

    self._hidden_cls = hidden_cls
    self._hidden_cfg = hidden_cfg
    self._num_hidden_instances = num_hidden_instances
    self._pooled_output_dim = pooled_output_dim
    self._pooler_layer_initializer = pooler_layer_initializer
    self._embedding_cls = embedding_cls
    self._embedding_cfg = embedding_cfg
    self._embedding_data = embedding_data
    self._layer_norm_before_pooling = layer_norm_before_pooling
    self._return_all_layer_outputs = return_all_layer_outputs
    self._dict_outputs = dict_outputs
    self._kwargs = kwargs

    self._embedding_layer = embedding_layer
    self._embedding_network = embedding_network
    self._position_embedding_layer = position_embedding_layer
    self._type_embedding_layer = type_embedding_layer
    self._embedding_norm_layer = embedding_norm_layer
    self._hidden_layers = hidden_layers
    if self._layer_norm_before_pooling:
      self._output_layer_norm = output_layer_norm
    self._pooler_layer = pooler_layer

    logging.info('EncoderScaffold configs: %s', self.get_config())
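Because `EncoderScaffold` builds its graph from whatever `embedding_cfg`, `hidden_cls`, and `hidden_cfg` it is given, the constructor above is easiest to read next to a concrete configuration. The sketch below is a hypothetical setup: it assumes the class and `layers.Transformer` are importable from the TF Model Garden (`official.nlp.modeling`), and every numeric value is an illustrative choice rather than a canonical default.

import tensorflow as tf
# Assumed import paths; adjust to wherever these classes live in your setup.
from official.nlp.modeling import layers
from official.nlp.modeling import networks

initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)

# Keys mirror what __init__ reads from embedding_cfg when embedding_cls is None.
embedding_cfg = dict(
    vocab_size=30522,
    type_vocab_size=2,
    hidden_size=768,
    max_seq_length=512,
    seq_length=None,
    dropout_rate=0.1,
    initializer=initializer,
)
# Passed as **kwargs to each hidden_cls instance.
hidden_cfg = dict(
    num_attention_heads=12,
    intermediate_size=3072,
    intermediate_activation=tf.keras.activations.gelu,
    dropout_rate=0.1,
    attention_dropout_rate=0.1,
    kernel_initializer=initializer,
)

encoder = networks.EncoderScaffold(
    pooled_output_dim=768,
    num_hidden_instances=12,
    hidden_cls=layers.Transformer,
    hidden_cfg=hidden_cfg,
    embedding_cfg=embedding_cfg,
    dict_outputs=True,
)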
Example #28
  def __init__(
      self,
      vocab_size,
      hidden_size=768,
      num_layers=12,
      num_attention_heads=12,
      max_sequence_length=512,
      type_vocab_size=16,
      inner_dim=3072,
      inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      norm_first=False,
      dict_outputs=False,
      return_all_encoder_outputs=False,
      **kwargs):
    if 'sequence_length' in kwargs:
      kwargs.pop('sequence_length')
      logging.warning('`sequence_length` is a deprecated argument to '
                      '`BertEncoder` and has had no effect for a while. '
                      'Please remove the `sequence_length` argument.')

    # Handles backward compatible kwargs.
    if 'intermediate_size' in kwargs:
      inner_dim = kwargs.pop('intermediate_size')

    if 'activation' in kwargs:
      inner_activation = kwargs.pop('activation')

    if 'dropout_rate' in kwargs:
      output_dropout = kwargs.pop('dropout_rate')

    if 'attention_dropout_rate' in kwargs:
      attention_dropout = kwargs.pop('attention_dropout_rate')

    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')

    if embedding_width is None:
      embedding_width = hidden_size

    if embedding_layer is None:
      embedding_layer_inst = layers.OnDeviceEmbedding(
          vocab_size=vocab_size,
          embedding_width=embedding_width,
          initializer=initializer,
          name='word_embeddings')
    else:
      embedding_layer_inst = embedding_layer
    word_embeddings = embedding_layer_inst(word_ids)

    # Always uses dynamic slicing for simplicity.
    position_embedding_layer = layers.PositionEmbedding(
        initializer=initializer,
        max_length=max_sequence_length,
        name='position_embedding')
    position_embeddings = position_embedding_layer(word_embeddings)
    type_embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,
        use_one_hot=True,
        name='type_embeddings')
    type_embeddings = type_embedding_layer(type_ids)

    embeddings = tf.keras.layers.Add()(
        [word_embeddings, position_embeddings, type_embeddings])

    embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)

    embeddings = embedding_norm_layer(embeddings)
    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = embedding_projection(embeddings)
    else:
      embedding_projection = None

    transformer_layers = []
    data = embeddings
    attention_mask = layers.SelfAttentionMask()(data, mask)
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = layers.TransformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          norm_first=norm_first,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
      transformer_layers.append(layer)
      data = layer([data, attention_mask])
      encoder_outputs.append(data)

    last_encoder_output = encoder_outputs[-1]
    # Applying a tf.slice op (through subscript notation) to a Keras tensor
    # like this will create a SliceOpLambda layer. This is better than a Lambda
    # layer with Python code, because that is fundamentally less portable.
    first_token_tensor = last_encoder_output[:, 0, :]
    pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    cls_output = pooler_layer(first_token_tensor)

    outputs = dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=cls_output,
        encoder_outputs=encoder_outputs,
    )

    if dict_outputs:
      super().__init__(
          inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
    else:
      cls_output = outputs['pooled_output']
      if return_all_encoder_outputs:
        encoder_outputs = outputs['encoder_outputs']
        outputs = [encoder_outputs, cls_output]
      else:
        sequence_output = outputs['sequence_output']
        outputs = [sequence_output, cls_output]
      super().__init__(  # pylint: disable=bad-super-call
          inputs=[word_ids, mask, type_ids],
          outputs=outputs,
          **kwargs)

    self._pooler_layer = pooler_layer
    self._transformer_layers = transformer_layers
    self._embedding_norm_layer = embedding_norm_layer
    self._embedding_layer = embedding_layer_inst
    self._position_embedding_layer = position_embedding_layer
    self._type_embedding_layer = type_embedding_layer
    if embedding_projection is not None:
      self._embedding_projection = embedding_projection

    config_dict = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'type_vocab_size': type_vocab_size,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
        'dict_outputs': dict_outputs,
    }
    # pylint: disable=protected-access
    self._setattr_tracking = False
    self._config = config_dict
    self._setattr_tracking = True
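A hedged usage sketch for the constructor above: it assumes this `BertEncoder` class is the one exported by the TF Model Garden (`official.nlp.modeling.networks`), and the batch and sequence sizes are purely illustrative. The output keys follow the `dict_outputs=True` branch shown above.

import numpy as np
import tensorflow as tf
from official.nlp.modeling import networks  # assumed import path

encoder = networks.BertEncoder(
    vocab_size=30522,
    hidden_size=768,
    num_layers=2,
    num_attention_heads=12,
    type_vocab_size=2,
    dict_outputs=True,
)

batch_size, seq_length = 2, 16
word_ids = np.random.randint(0, 30522, size=(batch_size, seq_length), dtype=np.int32)
mask = np.ones((batch_size, seq_length), dtype=np.int32)
type_ids = np.zeros((batch_size, seq_length), dtype=np.int32)

outputs = encoder([word_ids, mask, type_ids])
print(outputs['sequence_output'].shape)  # (2, 16, 768)
print(outputs['pooled_output'].shape)    # (2, 768)
print(len(outputs['encoder_outputs']))   # 2, one tensor per transformer layer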
Example #29
    def __init__(
            self,
            use_spec_norm_att: bool = False,
            use_spec_norm_ffn: bool = False,
            use_spec_norm_plr: bool = False,
            use_layer_norm_att: bool = True,
            use_layer_norm_ffn: bool = True,
            # A dict of kwargs to pass to the Transformer class.
            hidden_cfg: Optional[Dict[str, Any]] = None,
            **kwargs: Mapping[str, Any]):
        """Initializer."""
        hidden_cls = SpectralNormalizedTransformer

        # Add layer normalization arguments to default transformer config.
        normalization_cfg = {
            'use_layer_norm_att': use_layer_norm_att,
            'use_layer_norm_ffn': use_layer_norm_ffn,
            'use_spec_norm_att': use_spec_norm_att,
            'use_spec_norm_ffn': use_spec_norm_ffn,
        }

        if hidden_cfg:
            hidden_cfg.update(normalization_cfg)
        else:
            hidden_cfg = normalization_cfg

        # Initialize default layers.
        super().__init__(hidden_cls=hidden_cls,
                         hidden_cfg=hidden_cfg,
                         **kwargs)

        # Rebuild BERT model graph using default layers.
        seq_length = self._embedding_cfg.get('seq_length', None)

        # Create inputs layers.
        word_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(seq_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')
        inputs = [word_ids, mask, type_ids]

        # Define Input Embeddings Layers.
        word_embeddings = self._embedding_layer(word_ids)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        type_embeddings = self._type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        # TODO(jereliu): Add option to disable embedding layer normalization.
        embeddings = self._embedding_norm_layer(embeddings)
        embeddings = (tf.keras.layers.Dropout(
            rate=self._embedding_cfg['dropout_rate'])(embeddings))

        # Define self-attention layers, naming them to match the BERT checkpoint.
        attention_mask = bert_layers.SelfAttentionMask()([embeddings, mask])
        data = embeddings

        layer_output_data = []
        self._hidden_layers = []
        for i in range(self._num_hidden_instances):
            # Name each layer to match the BERT checkpoint.
            layer = hidden_cls(**self._hidden_cfg,
                               name='transformer/layer_%d' % i)
            data = layer([data, attention_mask])
            layer_output_data.append(data)
            self._hidden_layers.append(layer)

        # Extract BERT encoder output (i.e., the CLS token).
        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                layer_output_data[-1]))

        # Define the pooler layer (i.e., the output layer), and optionally apply
        # spectral normalization.
        self._pooler_layer = tf.keras.layers.Dense(
            units=self._pooled_output_dim,
            activation='tanh',
            kernel_initializer=self._pooler_layer_initializer,
            name='pooler_transform')
        if use_spec_norm_plr:
            self._pooler_layer = ed.layers.SpectralNormalization(
                self._pooler_layer,
                inhere_layer_name=True,
                **hidden_cfg['spec_norm_kwargs'])

        cls_output = self._pooler_layer(first_token_tensor)

        if self._return_all_layer_outputs:
            outputs = [layer_output_data, cls_output]
        else:
            outputs = [layer_output_data[-1], cls_output]

        # Compile model with updated graph.
        super(bert_encoder.EncoderScaffold, self).__init__(inputs=inputs,
                                                           outputs=outputs,
                                                           **self._kwargs)
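The only structural change relative to the plain scaffold is that the attention, feed-forward, and pooler layers can be wrapped in `ed.layers.SpectralNormalization`. As a conceptual illustration only (this is not the `ed.layers` implementation), the power-iteration estimate that spectral normalization relies on looks roughly like this: approximate the kernel's largest singular value and divide it out so the layer's Lipschitz constant stays close to 1.

import tensorflow as tf

kernel = tf.random.normal([768, 768])  # stand-in for a Dense kernel
u = tf.math.l2_normalize(tf.random.normal([768, 1]))

# A few power-iteration steps to approximate the leading singular vectors.
for _ in range(10):
    v = tf.math.l2_normalize(tf.matmul(kernel, u, transpose_a=True))
    u = tf.math.l2_normalize(tf.matmul(kernel, v))

sigma = tf.matmul(tf.matmul(u, kernel, transpose_a=True), v)  # ~ largest singular value
normalized_kernel = kernel / sigma

print(float(sigma))
# Spectral norm of the normalized kernel should now be close to 1.
print(float(tf.reduce_max(tf.linalg.svd(normalized_kernel, compute_uv=False))))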
Example #30
              use_one_hot=True,
              name='type_embeddings')(type_ids))

      embeddings = tf.keras.layers.Add()(
          [word_embeddings, position_embeddings, type_embeddings])
      embeddings = (
          tf.keras.layers.LayerNormalization(
              name='embeddings/layer_norm',
              axis=-1,
              epsilon=1e-12,
              dtype=tf.float32)(embeddings))
      embeddings = (
          tf.keras.layers.Dropout(
              rate=embedding_cfg['dropout_rate'])(embeddings))

      attention_mask = layers.SelfAttentionMask()([embeddings, mask])

    data = embeddings

    layer_output_data = []
    self._hidden_layers = []
    for _ in range(num_hidden_instances):
      if inspect.isclass(hidden_cls):
        layer = hidden_cls(**hidden_cfg) if hidden_cfg else hidden_cls()
      else:
        layer = hidden_cls
      data = layer([data, attention_mask])
      layer_output_data.append(data)
      self._hidden_layers.append(layer)

    first_token_tensor = (