Example #1
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        embedding_width = 768
        dropout_rate = 0.1
        initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=30522,
            embedding_width=embedding_width,
            initializer=initializer,
            name="word_embeddings",
        )

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=512,
            name="position_embedding",
        )
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=2,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name="type_embeddings",
        )
        self._add = tf.keras.layers.Add()
        self._layer_norm = tf.keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)
        self._dropout = tf.keras.layers.Dropout(rate=dropout_rate)

        self._attention_mask = layers.SelfAttentionMask()
        self._transformer_layers = []
        for i in range(12):
            layer = layers.Transformer(
                num_attention_heads=12,
                intermediate_size=3072,
                intermediate_activation=activations.gelu,
                dropout_rate=dropout_rate,
                attention_dropout_rate=0.1,
                output_range=None,
                kernel_initializer=initializer,
                name="transformer/layer_%d" % i,
            )
            self._transformer_layers.append(layer)

        self._lambda = tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))
        self._pooler_layer = tf.keras.layers.Dense(
            units=embedding_width,
            activation="tanh",
            kernel_initializer=initializer,
            name="pooler_transform",
        )
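The constructor above only builds the layers. For reference, below is a minimal sketch of a call() that wires them together; the input unpacking and the list-style calling convention for SelfAttentionMask and Transformer are assumptions based on the other examples on this page, not the original class.

    def call(self, inputs):
        # Hypothetical forward pass; input unpacking is an assumption.
        word_ids, mask, type_ids = inputs
        word_embeddings = self._embedding_layer(word_ids)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        type_embeddings = self._type_embedding_layer(type_ids)
        embeddings = self._add(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = self._layer_norm(embeddings)
        data = self._dropout(embeddings)
        attention_mask = self._attention_mask([data, mask])
        for layer in self._transformer_layers:
            data = layer([data, attention_mask])
        first_token = self._lambda(data)          # [batch, embedding_width]
        pooled = self._pooler_layer(first_token)  # [batch, embedding_width]
        return data, pooled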
Example #2
  def __init__(self,
               word_vocab_size,
               word_embed_size,
               type_vocab_size,
               output_embed_size,
               max_sequence_length=512,
               normalization_type='no_norm',
               initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
               dropout_rate=0.1):
    """Class initialization.

    Arguments:
      word_vocab_size: Number of words in the vocabulary.
      word_embed_size: Word embedding size.
      type_vocab_size: Number of word types.
      output_embed_size: Embedding size for the final embedding output.
      max_sequence_length: Maximum length of input sequence.
      normalization_type: String. The type of normalization; only
        'no_norm' and 'layer_norm' are supported.
      initializer: The initializer to use for the embedding weights and
        linear projection weights.
      dropout_rate: Dropout rate.
    """
    super(MobileBertEmbedding, self).__init__()
    self.word_vocab_size = word_vocab_size
    self.word_embed_size = word_embed_size
    self.type_vocab_size = type_vocab_size
    self.output_embed_size = output_embed_size
    self.max_sequence_length = max_sequence_length
    self.dropout_rate = dropout_rate

    self.word_embedding = layers.OnDeviceEmbedding(
        self.word_vocab_size,
        self.word_embed_size,
        initializer=initializer,
        name='word_embedding')
    self.type_embedding = layers.OnDeviceEmbedding(
        self.type_vocab_size,
        self.output_embed_size,
        use_one_hot=True,
        initializer=initializer,
        name='type_embedding')
    self.pos_embedding = layers.PositionEmbedding(
        use_dynamic_slicing=True,
        max_sequence_length=max_sequence_length,
        initializer=initializer,
        name='position_embedding')
    self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense(
        'abc,cd->abd',
        output_shape=[None, self.output_embed_size],
        kernel_initializer=initializer,
        bias_axes='d',
        name='embedding_projection')
    self.layer_norm = _get_norm_layer(normalization_type, 'embedding_norm')
    self.dropout_layer = tf.keras.layers.Dropout(
        self.dropout_rate,
        name='embedding_dropout')
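For context, a short sketch of how these layers are typically combined in the forward pass: the word embedding is projected to output_embed_size, then position and type embeddings are added before normalization and dropout. The method signature below is an assumption, not the library's exact API.

  def call(self, word_ids, type_ids=None):
    # Hypothetical forward pass over the layers built above.
    word_emb = self.word_embedding(word_ids)
    word_emb = self.word_embedding_proj(word_emb)  # -> output_embed_size
    embeddings = word_emb + self.pos_embedding(word_emb)
    if type_ids is not None:
      embeddings += self.type_embedding(type_ids)
    embeddings = self.layer_norm(embeddings)
    return self.dropout_layer(embeddings)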
Example #3
 def build(self, unused_input_shapes):
     """Implements build() for the layer."""
     if self.embedding_lookup is None:
         self.embedding_lookup = layers.OnDeviceEmbedding(
             vocab_size=self.config.vocab_size,
             embedding_width=self.config.hidden_size,
             initializer=tf.keras.initializers.TruncatedNormal(
                 stddev=self.config.initializer_range),
             name="target_embeddings")
     self.embedding_postprocessor = EmbeddingPostprocessor(
         use_type_embeddings=False,
         use_position_embeddings=True,
         max_position_embeddings=self.config.max_position_embeddings,
         dropout_prob=self.config.hidden_dropout_prob,
         initializer=tf.keras.initializers.VarianceScaling(
             scale=self.config.initializer_gain,
             mode="fan_avg",
             distribution="uniform"),
         name="embedding_postprocessor")
     # Decoder can use a different intermediate size.
     self.multi_channel_cross_attention = self.config.get(
         "multi_channel_cross_attention", False)
     self.decoder = TransformerDecoder(
         num_hidden_layers=self.config.num_decoder_layers,
         hidden_size=self.config.hidden_size,
         num_attention_heads=self.config.num_decoder_attn_heads,
         intermediate_size=self.config.decoder_intermediate_size,
         intermediate_activation=self.config.hidden_act,
         hidden_dropout_prob=self.config.hidden_dropout_prob,
         attention_probs_dropout_prob=self.config.
         attention_probs_dropout_prob,
         initializer_range=self.config.initializer_range,
         multi_channel_cross_attention=self.multi_channel_cross_attention,
         name="decoder")
     super(Decoder, self).build(unused_input_shapes)
Example #4
    def test_multi_doc_decoder(self):
        self._config = utils.get_test_params(cls=configs.NHNetConfig)
        seq_length = 10
        num_docs = 5
        encoder_input_ids = tf.keras.layers.Input(shape=(num_docs, seq_length),
                                                  name="encoder_input_ids",
                                                  dtype=tf.int32)
        target_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                           name="target_ids",
                                           dtype=tf.int32)
        encoder_outputs = tf.keras.layers.Input(
            shape=(num_docs, seq_length, self._config.hidden_size),
            name="all_encoder_outputs",
            dtype=tf.float32)
        embedding_lookup = layers.OnDeviceEmbedding(
            vocab_size=self._config.vocab_size,
            embedding_width=self._config.hidden_size,
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self._config.initializer_range),
            name="word_embeddings")
        doc_attention_probs = tf.keras.layers.Input(
            shape=(self._config.num_decoder_attn_heads, seq_length, num_docs),
            name="doc_attention_probs",
            dtype=tf.float32)
        cross_attention_bias = decoder.AttentionBias(
            bias_type="multi_cross")(encoder_input_ids)
        self_attention_bias = decoder.AttentionBias(
            bias_type="decoder_self")(target_ids)

        inputs = dict(attention_bias=cross_attention_bias,
                      self_attention_bias=self_attention_bias,
                      target_ids=target_ids,
                      all_encoder_outputs=encoder_outputs,
                      doc_attention_probs=doc_attention_probs)

        decoder_layer = decoder.Decoder(self._config, embedding_lookup)
        outputs = decoder_layer(inputs)
        model_inputs = dict(encoder_input_ids=encoder_input_ids,
                            target_ids=target_ids,
                            all_encoder_outputs=encoder_outputs,
                            doc_attention_probs=doc_attention_probs)
        model = tf.keras.Model(inputs=model_inputs,
                               outputs=outputs,
                               name="test")
        self.assertLen(decoder_layer.trainable_weights, 30)
        # Forward path.
        fake_inputs = {
            "encoder_input_ids":
            np.zeros((2, num_docs, seq_length), dtype=np.int32),
            "target_ids":
            np.zeros((2, seq_length), dtype=np.int32),
            "all_encoder_outputs":
            np.zeros((2, num_docs, seq_length, 16), dtype=np.float32),
            "doc_attention_probs":
            np.zeros(
                (2, self._config.num_decoder_attn_heads, seq_length, num_docs),
                dtype=np.float32)
        }
        output_tensor = model(fake_inputs)
        self.assertEqual(output_tensor.shape, (2, seq_length, 16))
Example #5
    def __init__(self,
                 emb_dim=512,
                 num_layers=3,
                 rnn_cls=tf.keras.layers.GRU,
                 rnn_input_dropout=0.0,
                 rnn_recurrent_dropout=0.0,
                 causal=False,
                 aaemb_init=tf.initializers.TruncatedNormal(stddev=1.0),
                 kernel_init=tf.initializers.GlorotUniform(),
                 recurrent_init=tf.initializers.Orthogonal(),
                 aaemb_scale_factor=None,
                 **kwargs):
        super().__init__(**kwargs)

        self._aaemb_layer = nlp_layers.OnDeviceEmbedding(
            vocab_size=len(self._vocab),
            embedding_width=emb_dim,
            initializer=aaemb_init,
            scale_factor=aaemb_scale_factor,
            name='embeddings/aminoacid')

        self._rnn_layers = []
        for i in range(num_layers):
            layer = rnn_cls(units=emb_dim,
                            kernel_initializer=kernel_init,
                            recurrent_initializer=recurrent_init,
                            dropout=rnn_input_dropout,
                            recurrent_dropout=rnn_recurrent_dropout,
                            return_sequences=True,
                            name=f'RNN/layer_{i}')
            if not causal:
                layer = tf.keras.layers.Bidirectional(layer,
                                                      name=f'BiRNN/layer_{i}')
            self._rnn_layers.append(layer)
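A minimal forward-pass sketch for this constructor, assuming the class embeds amino-acid token ids and then runs the (Bi)RNN stack built above; the method name and arguments are assumptions.

    def call(self, sequence_ids, training=False):
        # Hypothetical forward pass: embed tokens, then run the RNN stack.
        features = self._aaemb_layer(sequence_ids)
        for rnn_layer in self._rnn_layers:
            features = rnn_layer(features, training=training)
        return features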
Example #6
 def __init__(self,
              emb_dim=768,
              num_layers=6,
              num_heads=12,
              mlp_dim=3072,
              mlp_act=activations.approximate_gelu,
              output_dropout=0.1,
              attention_dropout=0.1,
              mlp_dropout=0.1,
              norm_first=True,
              norm_input=False,
              norm_output=True,
              causal=False,
              trainable_posemb=False,
              posemb_init=initializers.HarmonicEmbeddings(scale_factor=1e-4,
                                                          max_freq=1.0),
              aaemb_init=tf.initializers.RandomNormal(stddev=1.0),
              kernel_init=tf.initializers.GlorotUniform(),
              aaemb_scale_factor=None,
              max_len=1024,
              **kwargs):
     super().__init__(**kwargs)
     self._causal = causal
     self.posemb_layer = nlp_layers.PositionEmbedding(
         max_length=max_len,
         initializer=posemb_init,
         trainable=trainable_posemb,
         name='embeddings/positional')
     self.aaemb_layer = nlp_layers.OnDeviceEmbedding(
         vocab_size=len(self._vocab),
         embedding_width=emb_dim,
         initializer=aaemb_init,
         scale_factor=aaemb_scale_factor,
         name='embeddings/aminoacid')
     layer_norm_cls = functools.partial(tf.keras.layers.LayerNormalization,
                                        axis=-1,
                                        epsilon=1e-12)
     self._input_norm_layer = (layer_norm_cls(
         name='embeddings/layer_norm') if norm_input else None)
     self._output_norm_layer = (layer_norm_cls(
         name='output/layer_norm') if norm_output else None)
     self._dropout_layer = tf.keras.layers.Dropout(
         rate=output_dropout, name='embeddings/dropout')
     self._attention_mask = nlp_layers.SelfAttentionMask()
     self._transformer_layers = []
     for i in range(num_layers):
         self._transformer_layers.append(
             nlp_layers.TransformerEncoderBlock(
                 num_attention_heads=num_heads,
                 inner_dim=mlp_dim,
                 inner_activation=mlp_act,
                 output_dropout=output_dropout,
                 attention_dropout=attention_dropout,
                 inner_dropout=mlp_dropout,
                 kernel_initializer=kernel_init,
                 norm_first=norm_first,
                 name=f'transformer/layer_{i}'))
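A sketch of how these pieces could be composed in call(); the mask handling is a simplifying assumption and causal masking is omitted.

 def call(self, sequence_ids, mask, training=False):
     # Hypothetical forward pass; causal masking is omitted for brevity.
     embeddings = self.aaemb_layer(sequence_ids)
     embeddings = embeddings + self.posemb_layer(embeddings)
     if self._input_norm_layer is not None:
         embeddings = self._input_norm_layer(embeddings)
     data = self._dropout_layer(embeddings, training=training)
     attention_mask = self._attention_mask(data, mask)
     for layer in self._transformer_layers:
         data = layer([data, attention_mask])
     if self._output_norm_layer is not None:
         data = self._output_norm_layer(data)
     return data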
Example #7
    def __init__(self,
                 vocab_size=33708,
                 embedding_width=512,
                 dropout_rate=0.0,
                 padded_decode=False,
                 decode_max_length=None,
                 extra_decode_length=0,
                 beam_size=4,
                 alpha=0.6,
                 encoder_layer=None,
                 decoder_layer=None,
                 eos_id=EOS_ID,
                 **kwargs):
        """Initialize layers to build Transformer model.

    Args:
      vocab_size: Size of vocabulary.
      embedding_width: Size of hidden layer for embedding.
      dropout_rate: Dropout probability.
      padded_decode: Whether max_sequence_length padding is used. If set to
        False, max_sequence_length padding is not used.
      decode_max_length: Maximum number of steps to decode a sequence.
      extra_decode_length: Beam search will run extra steps to decode.
      beam_size: Number of beams for beam search.
      alpha: The strength of length normalization for beam search.
      encoder_layer: An initialized encoder layer.
      decoder_layer: An initialized decoder layer.
      eos_id: Id of end of sentence token.
      **kwargs: other keyword arguments.
    """
        super().__init__(**kwargs)
        self._vocab_size = vocab_size
        self._embedding_width = embedding_width
        self._dropout_rate = dropout_rate
        self._padded_decode = padded_decode
        self._decode_max_length = decode_max_length
        self._extra_decode_length = extra_decode_length
        self._beam_size = beam_size
        self._alpha = alpha
        self._eos_id = eos_id
        self.embedding_lookup = layers.OnDeviceEmbedding(
            vocab_size=self._vocab_size,
            embedding_width=self._embedding_width,
            initializer=tf.random_normal_initializer(
                mean=0., stddev=self._embedding_width**-0.5),
            scale_factor=self._embedding_width**0.5)
        self.encoder_layer = encoder_layer
        self.decoder_layer = decoder_layer
        self.position_embedding = layers.RelativePositionEmbedding(
            hidden_size=self._embedding_width)
        self.encoder_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
        self.decoder_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
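One detail worth noting in this example: OnDeviceEmbedding is constructed with scale_factor=embedding_width**0.5, so looked-up embeddings are multiplied by sqrt(d_model), as in the original Transformer. A small illustrative check of that behaviour (the layers import path is assumed to be the usual model-garden one):

import tensorflow as tf
from official.nlp.modeling import layers  # assumed import path

embedding_width = 512
lookup = layers.OnDeviceEmbedding(
    vocab_size=33708,
    embedding_width=embedding_width,
    initializer=tf.random_normal_initializer(
        mean=0., stddev=embedding_width**-0.5),
    scale_factor=embedding_width**0.5)  # outputs scaled by sqrt(d_model)
ids = tf.constant([[1, 2, 3]])
print(lookup(ids).shape)  # (1, 3, 512)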
Example #8
 def __init__(self, vocab_size, hidden_size):
   super().__init__()
   self.inputs = [
       tf.keras.layers.Input(
           shape=(None,), dtype=tf.int32, name="input_word_ids"),
       tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="input_mask")
   ]
   self.attention_mask = layers.SelfAttentionMask()
   self.embedding_layer = layers.OnDeviceEmbedding(
       vocab_size=vocab_size,
       embedding_width=hidden_size,
       initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
       name="word_embeddings")
Example #9
    def __init__(self,
                 emb_dim=768,
                 dropout=0.0,
                 use_layer_norm=False,
                 use_positional_embedding=False,
                 position_embed_init=None,
                 train_position_embed=True,
                 aaemb_init=None,
                 aaemb_scale_factor=None,
                 max_len=1024,
                 **kwargs):
        super().__init__(**kwargs)
        if position_embed_init is None:
            position_embed_init = initializers.HarmonicEmbeddings(
                scale_factor=1e-4, max_freq=1.0)
        if aaemb_init is None:
            aaemb_init = tf.initializers.TruncatedNormal(stddev=1.0)

        self._use_layer_norm = use_layer_norm

        if use_positional_embedding:
            self._positional_embedding = nlp_layers.PositionEmbedding(
                max_length=max_len,
                initializer=position_embed_init,
                trainable=train_position_embed,
                name='embeddings/positional')
        else:
            self._positional_embedding = None

        self._aa_embed = nlp_layers.OnDeviceEmbedding(
            vocab_size=len(self._vocab),
            embedding_width=emb_dim,
            initializer=aaemb_init,
            scale_factor=aaemb_scale_factor,
            name='embeddings/aminoacid')

        if use_layer_norm:
            self._layer_norm = tf.keras.layers.LayerNormalization(
                axis=-1, epsilon=1e-12, name='embeddings/layer_norm')
        else:
            self._layer_norm = None

        self._dropout = tf.keras.layers.Dropout(rate=dropout,
                                                name='embeddings/dropout')
Example #10
 def test_bert_decoder(self):
     seq_length = 10
     encoder_input_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                               name="encoder_input_ids",
                                               dtype=tf.int32)
     target_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                        name="target_ids",
                                        dtype=tf.int32)
     encoder_outputs = tf.keras.layers.Input(
         shape=(seq_length, self._config.hidden_size),
         name="all_encoder_outputs",
         dtype=tf.float32)
     embedding_lookup = layers.OnDeviceEmbedding(
         vocab_size=self._config.vocab_size,
         embedding_width=self._config.hidden_size,
         initializer=tf.keras.initializers.TruncatedNormal(
             stddev=self._config.initializer_range),
         name="word_embeddings")
     cross_attention_bias = decoder.AttentionBias(
         bias_type="single_cross")(encoder_input_ids)
     self_attention_bias = decoder.AttentionBias(
         bias_type="decoder_self")(target_ids)
     inputs = dict(attention_bias=cross_attention_bias,
                   self_attention_bias=self_attention_bias,
                   target_ids=target_ids,
                   all_encoder_outputs=encoder_outputs)
     decoder_layer = decoder.Decoder(self._config, embedding_lookup)
     outputs = decoder_layer(inputs)
     model_inputs = dict(encoder_input_ids=encoder_input_ids,
                         target_ids=target_ids,
                         all_encoder_outputs=encoder_outputs)
     model = tf.keras.Model(inputs=model_inputs,
                            outputs=outputs,
                            name="test")
     self.assertLen(decoder_layer.trainable_weights, 30)
     # Forward path.
     fake_inputs = {
         "encoder_input_ids": np.zeros((2, 10), dtype=np.int32),
         "target_ids": np.zeros((2, 10), dtype=np.int32),
         "all_encoder_outputs": np.zeros((2, 10, 16), dtype=np.float32),
     }
     output_tensor = model(fake_inputs)
     self.assertEqual(output_tensor.shape, (2, 10, 16))
Example #11
    def __init__(
            self,
            vocab_size,
            embedding_width=128,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            dict_outputs=False,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))
        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embeddings = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')(embeddings)

        data = embeddings
        attention_mask = layers.SelfAttentionMask()(data, mask)
        shared_layer = layers.TransformerEncoderBlock(
            num_attention_heads=num_attention_heads,
            inner_dim=intermediate_size,
            inner_activation=activation,
            output_dropout=dropout_rate,
            attention_dropout=attention_dropout_rate,
            kernel_initializer=initializer,
            name='transformer')
        encoder_outputs = []
        for _ in range(num_layers):
            data = shared_layer([data, attention_mask])
            encoder_outputs.append(data)

        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = data[:, 0, :]
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)
        if dict_outputs:
            outputs = dict(
                sequence_output=data,
                encoder_outputs=encoder_outputs,
                pooled_output=cls_output,
            )
        else:
            outputs = [data, cls_output]

        # b/164516224
        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super(AlbertEncoder, self).__init__(inputs=[word_ids, mask, type_ids],
                                            outputs=outputs,
                                            **kwargs)
        config_dict = {
            'vocab_size': vocab_size,
            'embedding_width': embedding_width,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
        }

        # We are storing the config dict as a namedtuple here to ensure checkpoint
        # compatibility with an earlier version of this model which did not track
        # the config dict attribute. TF does not track immutable attrs which
        # do not contain Trackables, so by creating a config namedtuple instead of
        # a dict we avoid tracking it.
        config_cls = collections.namedtuple('Config', config_dict.keys())
        self._config = config_cls(**config_dict)
        self._embedding_layer = embedding_layer
        self._position_embedding_layer = position_embedding_layer
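For orientation, a hypothetical instantiation of the AlbertEncoder defined above, assuming the class is importable from the surrounding module; the vocabulary size, sequence length, and batch size are placeholders.

import numpy as np

encoder = AlbertEncoder(vocab_size=30000, embedding_width=128,
                        hidden_size=768, num_layers=12, dict_outputs=True)
word_ids = np.ones((2, 16), dtype=np.int32)
mask = np.ones((2, 16), dtype=np.int32)
type_ids = np.zeros((2, 16), dtype=np.int32)
outputs = encoder([word_ids, mask, type_ids])
print(outputs['sequence_output'].shape)  # (2, 16, 768)
print(outputs['pooled_output'].shape)    # (2, 768)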
Example #12
    def __init__(
            self,
            pooled_output_dim,
            pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            embedding_cls=None,
            embedding_cfg=None,
            embedding_data=None,
            num_hidden_instances=1,
            hidden_cls=layers.Transformer,
            hidden_cfg=None,
            mask_cls=layers.SelfAttentionMask,
            mask_cfg=None,
            layer_norm_before_pooling=False,
            return_all_layer_outputs=False,
            dict_outputs=False,
            layer_idx_as_attention_seed=False,
            feed_layer_idx=False,
            recursive=False,
            **kwargs):

        if embedding_cls:
            if inspect.isclass(embedding_cls):
                embedding_network = embedding_cls(
                    **embedding_cfg) if embedding_cfg else embedding_cls()
            else:
                embedding_network = embedding_cls
            inputs = embedding_network.inputs
            embeddings, attention_mask = embedding_network(inputs)
            embedding_layer = None
            position_embedding_layer = None
            type_embedding_layer = None
            embedding_norm_layer = None
        else:
            embedding_network = None
            seq_length = embedding_cfg.get('seq_length', None)
            word_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                             dtype=tf.int32,
                                             name='input_word_ids')
            mask = tf.keras.layers.Input(shape=(seq_length, ),
                                         dtype=tf.int32,
                                         name='input_mask')
            type_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                             dtype=tf.int32,
                                             name='input_type_ids')
            inputs = [word_ids, mask, type_ids]

            embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=embedding_cfg['vocab_size'],
                embedding_width=embedding_cfg['hidden_size'],
                initializer=tf_utils.clone_initializer(
                    embedding_cfg['initializer']),
                name='word_embeddings')

            word_embeddings = embedding_layer(word_ids)

            # Always uses dynamic slicing for simplicity.
            position_embedding_layer = layers.PositionEmbedding(
                initializer=tf_utils.clone_initializer(
                    embedding_cfg['initializer']),
                max_length=embedding_cfg['max_seq_length'],
                name='position_embedding')
            position_embeddings = position_embedding_layer(word_embeddings)

            type_embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=embedding_cfg['type_vocab_size'],
                embedding_width=embedding_cfg['hidden_size'],
                initializer=tf_utils.clone_initializer(
                    embedding_cfg['initializer']),
                use_one_hot=True,
                name='type_embeddings')
            type_embeddings = type_embedding_layer(type_ids)

            embeddings = tf.keras.layers.Add()(
                [word_embeddings, position_embeddings, type_embeddings])

            embedding_norm_layer = tf.keras.layers.LayerNormalization(
                name='embeddings/layer_norm',
                axis=-1,
                epsilon=1e-12,
                dtype=tf.float32)
            embeddings = embedding_norm_layer(embeddings)

            embeddings = (tf.keras.layers.Dropout(
                rate=embedding_cfg['dropout_rate'])(embeddings))

            mask_cfg = {} if mask_cfg is None else mask_cfg
            if inspect.isclass(mask_cls):
                mask_layer = mask_cls(**mask_cfg)
            else:
                mask_layer = mask_cls
            attention_mask = mask_layer(embeddings, mask)

        data = embeddings

        layer_output_data = []
        hidden_layers = []
        hidden_cfg = hidden_cfg if hidden_cfg else {}

        if isinstance(hidden_cls,
                      list) and len(hidden_cls) != num_hidden_instances:
            raise RuntimeError(
                'When input hidden_cls to EncoderScaffold %s is a list, it must '
                'contain classes or instances with size specified by '
                'num_hidden_instances, got %d vs %d.' %
                (self.name, len(hidden_cls), num_hidden_instances))
        # Consider supporting customized init states.
        recursive_states = None
        for i in range(num_hidden_instances):
            if isinstance(hidden_cls, list):
                cur_hidden_cls = hidden_cls[i]
            else:
                cur_hidden_cls = hidden_cls
            if inspect.isclass(cur_hidden_cls):
                if hidden_cfg and 'attention_cfg' in hidden_cfg and (
                        layer_idx_as_attention_seed):
                    hidden_cfg = copy.deepcopy(hidden_cfg)
                    hidden_cfg['attention_cfg']['seed'] = i
                if feed_layer_idx:
                    hidden_cfg['layer_idx'] = i
                layer = cur_hidden_cls(**hidden_cfg)
            else:
                layer = cur_hidden_cls
            if recursive:
                data, recursive_states = layer(
                    [data, attention_mask, recursive_states])
            else:
                data = layer([data, attention_mask])
            layer_output_data.append(data)
            hidden_layers.append(layer)

        if layer_norm_before_pooling:
            # Normalize the final output.
            output_layer_norm = tf.keras.layers.LayerNormalization(
                name='final_layer_norm', axis=-1, epsilon=1e-12)
            layer_output_data[-1] = output_layer_norm(layer_output_data[-1])

        last_layer_output = layer_output_data[-1]
        # Applying a tf.slice op (through subscript notation) to a Keras tensor
        # like this will create a SliceOpLambda layer. This is better than a Lambda
        # layer with Python code, because that is fundamentally less portable.
        first_token_tensor = last_layer_output[:, 0, :]
        pooler_layer_initializer = tf.keras.initializers.get(
            pooler_layer_initializer)
        pooler_layer = tf.keras.layers.Dense(
            units=pooled_output_dim,
            activation='tanh',
            kernel_initializer=pooler_layer_initializer,
            name='cls_transform')
        cls_output = pooler_layer(first_token_tensor)

        if dict_outputs:
            outputs = dict(
                sequence_output=layer_output_data[-1],
                pooled_output=cls_output,
                encoder_outputs=layer_output_data,
            )
        elif return_all_layer_outputs:
            outputs = [layer_output_data, cls_output]
        else:
            outputs = [layer_output_data[-1], cls_output]

        # b/164516224
        # Once we've created the network using the Functional API, we call
        # super().__init__ as though we were invoking the Functional API Model
        # constructor, resulting in this object having all the properties of a model
        # created using the Functional API. Once super().__init__ is called, we
        # can assign attributes to `self` - note that all `self` assignments are
        # below this line.
        super().__init__(inputs=inputs, outputs=outputs, **kwargs)

        self._hidden_cls = hidden_cls
        self._hidden_cfg = hidden_cfg
        self._mask_cls = mask_cls
        self._mask_cfg = mask_cfg
        self._num_hidden_instances = num_hidden_instances
        self._pooled_output_dim = pooled_output_dim
        self._pooler_layer_initializer = pooler_layer_initializer
        self._embedding_cls = embedding_cls
        self._embedding_cfg = embedding_cfg
        self._embedding_data = embedding_data
        self._layer_norm_before_pooling = layer_norm_before_pooling
        self._return_all_layer_outputs = return_all_layer_outputs
        self._dict_outputs = dict_outputs
        self._kwargs = kwargs

        self._embedding_layer = embedding_layer
        self._embedding_network = embedding_network
        self._position_embedding_layer = position_embedding_layer
        self._type_embedding_layer = type_embedding_layer
        self._embedding_norm_layer = embedding_norm_layer
        self._hidden_layers = hidden_layers
        if self._layer_norm_before_pooling:
            self._output_layer_norm = output_layer_norm
        self._pooler_layer = pooler_layer
        self._layer_idx_as_attention_seed = layer_idx_as_attention_seed

        logging.info('EncoderScaffold configs: %s', self.get_config())
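Because EncoderScaffold reads most of its settings from dictionaries, a minimal construction sketch may help. The keys below follow what the code above reads from embedding_cfg and hidden_cfg; the import path and the concrete values are assumptions.

import tensorflow as tf
from official.nlp.modeling import layers, networks  # assumed import path

init = tf.keras.initializers.TruncatedNormal(stddev=0.02)
encoder = networks.EncoderScaffold(
    pooled_output_dim=768,
    num_hidden_instances=12,
    hidden_cls=layers.Transformer,
    hidden_cfg=dict(
        num_attention_heads=12,
        intermediate_size=3072,
        intermediate_activation=tf.keras.activations.gelu,
        dropout_rate=0.1,
        attention_dropout_rate=0.1,
        kernel_initializer=init),
    embedding_cfg=dict(
        vocab_size=30522,
        type_vocab_size=2,
        hidden_size=768,
        max_seq_length=512,
        seq_length=None,
        initializer=init,
        dropout_rate=0.1))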
Example #13
    def __init__(
            self,
            vocab_size: int,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: _Activation = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            pool_type: str = _MAX,
            pool_stride: int = 2,
            unpool_length: int = 0,
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            transformer_cls: Union[
                str, tf.keras.layers.Layer] = layers.TransformerEncoderBlock,
            share_rezero: bool = True,
            **kwargs):
        super().__init__(**kwargs)
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=tf_utils.clone_initializer(initializer),
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=tf_utils.clone_initializer(initializer),
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=tf_utils.clone_initializer(initializer),
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=tf_utils.clone_initializer(initializer),
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        # Will raise an error if the string is not supported.
        if isinstance(transformer_cls, str):
            transformer_cls = _str2transformer_cls[transformer_cls]
        for i in range(num_layers):
            layer = transformer_cls(
                num_attention_heads=num_attention_heads,
                intermediate_size=inner_dim,
                inner_dim=inner_dim,
                intermediate_activation=inner_activation,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=tf_utils.clone_initializer(initializer),
                share_rezero=share_rezero,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=tf_utils.clone_initializer(initializer),
            name='pooler_transform')
        if isinstance(pool_stride, int):
            # TODO(b/197133196): Pooling layer can be shared.
            pool_strides = [pool_stride] * num_layers
        else:
            if len(pool_stride) != num_layers:
                raise ValueError(
                    'Lengths of pool_stride and num_layers are not equal.')
            pool_strides = pool_stride
        # TODO(crickwu): explore tf.keras.layers.serialize method.
        if pool_type == _MAX:
            pool_cls = tf.keras.layers.MaxPooling1D
        elif pool_type == _AVG:
            pool_cls = tf.keras.layers.AveragePooling1D
        elif pool_type == _TRUNCATED_AVG:
            # TODO(b/203665205): unpool_length should be implemented.
            if unpool_length != 0:
                raise ValueError(
                    'unpool_length is not supported by truncated_avg now.')
        else:
            raise ValueError('pool_type not supported.')

        if pool_type in (_MAX, _AVG):
            self._att_input_pool_layers = []
            for layer_pool_stride in pool_strides:
                att_input_pool_layer = pool_cls(pool_size=layer_pool_stride,
                                                strides=layer_pool_stride,
                                                padding='same',
                                                name='att_input_pool_layer')
                self._att_input_pool_layers.append(att_input_pool_layer)

        self._max_sequence_length = max_sequence_length
        self._pool_strides = pool_strides  # This is a list here.
        self._unpool_length = unpool_length
        self._pool_type = pool_type

        self._config = {
            'vocab_size':
            vocab_size,
            'hidden_size':
            hidden_size,
            'num_layers':
            num_layers,
            'num_attention_heads':
            num_attention_heads,
            'max_sequence_length':
            max_sequence_length,
            'type_vocab_size':
            type_vocab_size,
            'inner_dim':
            inner_dim,
            'inner_activation':
            tf.keras.activations.serialize(activation),
            'output_dropout':
            output_dropout,
            'attention_dropout':
            attention_dropout,
            'initializer':
            tf.keras.initializers.serialize(initializer),
            'output_range':
            output_range,
            'embedding_width':
            embedding_width,
            'embedding_layer':
            embedding_layer,
            'norm_first':
            norm_first,
            'pool_type':
            pool_type,
            'pool_stride':
            pool_stride,
            'unpool_length':
            unpool_length,
            'transformer_cls':
            _transformer_cls2str.get(transformer_cls, str(transformer_cls))
        }

        self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                           input_mask=tf.keras.Input(shape=(None, ),
                                                     dtype=tf.int32),
                           input_type_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32))
Example #14
  def __init__(self,
               vocab_size,
               num_layers,
               hidden_size,
               num_attention_heads,
               head_size,
               inner_size,
               dropout_rate,
               attention_dropout_rate,
               attention_type,
               bi_data,
               initializer,
               two_stream=False,
               tie_attention_biases=True,
               memory_length=None,
               clamp_length=-1,
               reuse_length=None,
               inner_activation="relu",
               use_cls_mask=False,
               embedding_width=None,
               **kwargs):
    super(XLNetBase, self).__init__(**kwargs)

    self._vocab_size = vocab_size
    self._initializer = initializer
    self._attention_type = attention_type
    self._num_layers = num_layers
    self._hidden_size = hidden_size
    self._num_attention_heads = num_attention_heads
    self._head_size = head_size
    self._inner_size = inner_size
    self._inner_activation = inner_activation
    self._dropout_rate = dropout_rate
    self._attention_dropout_rate = attention_dropout_rate
    self._tie_attention_biases = tie_attention_biases
    self._two_stream = two_stream

    self._memory_length = memory_length
    self._reuse_length = reuse_length
    self._bi_data = bi_data
    self._clamp_length = clamp_length
    self._use_cls_mask = use_cls_mask

    self._segment_embedding = None
    self._mask_embedding = None
    self._embedding_width = embedding_width

    if embedding_width is None:
      embedding_width = hidden_size

    self._embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=self._vocab_size,
        embedding_width=embedding_width,
        initializer=self._initializer,
        dtype=tf.float32,
        name="word_embedding")
    self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)

    self.embedding_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    self.position_encoding = RelativePositionEncoding(self._hidden_size)

    self._transformer_xl = transformer_xl.TransformerXL(
        vocab_size=vocab_size,
        num_layers=num_layers,
        hidden_size=hidden_size,
        num_attention_heads=num_attention_heads,
        head_size=head_size,
        inner_size=inner_size,
        dropout_rate=dropout_rate,
        attention_dropout_rate=attention_dropout_rate,
        initializer=initializer,
        two_stream=two_stream,
        tie_attention_biases=tie_attention_biases,
        memory_length=memory_length,
        reuse_length=reuse_length,
        inner_activation=inner_activation,
        name="transformer_xl")
Example #15
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_sizes=(16, ),
            num_float_features=0,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            bert_init_ckpt=None,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        num_type_features = len(type_vocab_sizes)
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_sizes': type_vocab_sizes,
            'num_type_features': num_type_features,
            'num_float_features': num_float_features,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        all_inputs = [word_ids, mask]
        if num_type_features:
            type_ids = tf.keras.layers.Input(shape=(sequence_length,
                                                    num_type_features),
                                             dtype=tf.int32,
                                             name='input_type_ids')
            all_inputs.append(type_ids)
        if num_float_features:
            float_features = tf.keras.layers.Input(shape=(sequence_length,
                                                          num_float_features),
                                                   dtype=tf.float32,
                                                   name='float_features')
            all_inputs.append(float_features)

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = modeling.layers.PositionEmbedding(
            initializer=initializer, max_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)
        all_embeddings = [word_embeddings, position_embeddings]

        if num_type_features:
            type_embeddings = [(layers.OnDeviceEmbedding(
                vocab_size=type_vocab_sizes[idx],
                embedding_width=hidden_size,
                initializer=initializer,
                use_one_hot=True,
                name='type_embeddings_{}'.format(idx))(type_ids[..., idx]))
                               for idx in range(num_type_features)]
            all_embeddings += type_embeddings

        if num_float_features:
            float_embeddings = [
                (
                    tf.keras.layers.Dense(
                        hidden_size, name='float_features_{}'.format(idx))(
                            # Expanding the last dim here is important.
                            float_features[..., idx, None]))
                for idx in range(num_float_features)
            ]
            all_embeddings += float_embeddings

        embeddings = tf.keras.layers.Add()(all_embeddings)
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        encoder_outputs = []
        for i in range(num_layers):
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                kernel_initializer=initializer,
                name='model/layer_with_weights-%d' % (i + 4))
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                encoder_outputs[-1]))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        if return_all_encoder_outputs:
            outputs = [encoder_outputs, cls_output]
        else:
            outputs = [encoder_outputs[-1], cls_output]
        super(TransformerEncoder, self).__init__(inputs=all_inputs,
                                                 outputs=outputs,
                                                 **kwargs)

        if bert_init_ckpt and learner_flags.INIT_CHECKPOINT.value is None:
            self.init_weights(bert_init_ckpt)
Example #16
    def __init__(
            self,
            vocab_size,
            embedding_width=128,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'embedding_width': embedding_width,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
        }

        word_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(None, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(None, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = keras_nlp.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))
        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            embeddings = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')(embeddings)

        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        shared_layer = keras_nlp.TransformerEncoderBlock(
            num_attention_heads=num_attention_heads,
            inner_dim=intermediate_size,
            inner_activation=activation,
            output_dropout=dropout_rate,
            attention_dropout=attention_dropout_rate,
            kernel_initializer=initializer,
            name='transformer')
        for _ in range(num_layers):
            data = shared_layer([data, attention_mask])

        first_token_tensor = (tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        super(AlbertTransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=[data, cls_output],
                             **kwargs)
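Note that, as in Example #11, a single shared_layer instance is applied num_layers times here, which is ALBERT's cross-layer parameter sharing; contrast this with Example #17 below, where a separate TransformerEncoderBlock is created for every layer. A small, generic illustration of the weight-sharing effect in plain Keras:

import tensorflow as tf

shared = tf.keras.layers.Dense(4)       # one layer instance
x = tf.random.uniform((2, 4))
for _ in range(3):
    x = shared(x)                       # reused three times; weights created once
print(len(shared.trainable_variables))  # 2: a single kernel and bias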
Example #17
  def __init__(
      self,
      vocab_size,
      hidden_size=768,
      num_layers=12,
      num_attention_heads=12,
      max_sequence_length=512,
      type_vocab_size=16,
      inner_dim=3072,
      inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True),
      output_dropout=0.1,
      attention_dropout=0.1,
      initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
      output_range=None,
      embedding_width=None,
      embedding_layer=None,
      norm_first=False,
      dict_outputs=False,
      return_all_encoder_outputs=False,
      **kwargs):
    if 'sequence_length' in kwargs:
      kwargs.pop('sequence_length')
      logging.warning('`sequence_length` is a deprecated argument to '
                      '`BertEncoder`, which has no effect for a while. Please '
                      'remove `sequence_length` argument.')

    # Handles backward compatible kwargs.
    if 'intermediate_size' in kwargs:
      inner_dim = kwargs.pop('intermediate_size')

    if 'activation' in kwargs:
      inner_activation = kwargs.pop('activation')

    if 'dropout_rate' in kwargs:
      output_dropout = kwargs.pop('dropout_rate')

    if 'attention_dropout_rate' in kwargs:
      attention_dropout = kwargs.pop('attention_dropout_rate')

    activation = tf.keras.activations.get(inner_activation)
    initializer = tf.keras.initializers.get(initializer)

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')

    if embedding_width is None:
      embedding_width = hidden_size

    if embedding_layer is None:
      embedding_layer_inst = layers.OnDeviceEmbedding(
          vocab_size=vocab_size,
          embedding_width=embedding_width,
          initializer=initializer,
          name='word_embeddings')
    else:
      embedding_layer_inst = embedding_layer
    word_embeddings = embedding_layer_inst(word_ids)

    # Always uses dynamic slicing for simplicity.
    position_embedding_layer = layers.PositionEmbedding(
        initializer=initializer,
        max_length=max_sequence_length,
        name='position_embedding')
    position_embeddings = position_embedding_layer(word_embeddings)
    type_embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=type_vocab_size,
        embedding_width=embedding_width,
        initializer=initializer,
        use_one_hot=True,
        name='type_embeddings')
    type_embeddings = type_embedding_layer(type_ids)

    embeddings = tf.keras.layers.Add()(
        [word_embeddings, position_embeddings, type_embeddings])

    embedding_norm_layer = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)

    embeddings = embedding_norm_layer(embeddings)
    embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings))

    # We project the 'embedding' output to 'hidden_size' if it is not already
    # 'hidden_size'.
    if embedding_width != hidden_size:
      embedding_projection = tf.keras.layers.experimental.EinsumDense(
          '...x,xy->...y',
          output_shape=hidden_size,
          bias_axes='y',
          kernel_initializer=initializer,
          name='embedding_projection')
      embeddings = embedding_projection(embeddings)
    else:
      embedding_projection = None

    transformer_layers = []
    data = embeddings
    attention_mask = layers.SelfAttentionMask()(data, mask)
    encoder_outputs = []
    for i in range(num_layers):
      if i == num_layers - 1 and output_range is not None:
        transformer_output_range = output_range
      else:
        transformer_output_range = None
      layer = layers.TransformerEncoderBlock(
          num_attention_heads=num_attention_heads,
          inner_dim=inner_dim,
          inner_activation=inner_activation,
          output_dropout=output_dropout,
          attention_dropout=attention_dropout,
          norm_first=norm_first,
          output_range=transformer_output_range,
          kernel_initializer=initializer,
          name='transformer/layer_%d' % i)
      transformer_layers.append(layer)
      data = layer([data, attention_mask])
      encoder_outputs.append(data)

    last_encoder_output = encoder_outputs[-1]
    # Applying a tf.slice op (through subscript notation) to a Keras tensor
    # like this will create a SliceOpLambda layer. This is better than a Lambda
    # layer with Python code, because that is fundamentally less portable.
    first_token_tensor = last_encoder_output[:, 0, :]
    pooler_layer = tf.keras.layers.Dense(
        units=hidden_size,
        activation='tanh',
        kernel_initializer=initializer,
        name='pooler_transform')
    cls_output = pooler_layer(first_token_tensor)

    outputs = dict(
        sequence_output=encoder_outputs[-1],
        pooled_output=cls_output,
        encoder_outputs=encoder_outputs,
    )

    if dict_outputs:
      super().__init__(
          inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs)
    else:
      cls_output = outputs['pooled_output']
      if return_all_encoder_outputs:
        encoder_outputs = outputs['encoder_outputs']
        outputs = [encoder_outputs, cls_output]
      else:
        sequence_output = outputs['sequence_output']
        outputs = [sequence_output, cls_output]
      super().__init__(  # pylint: disable=bad-super-call
          inputs=[word_ids, mask, type_ids],
          outputs=outputs,
          **kwargs)

    self._pooler_layer = pooler_layer
    self._transformer_layers = transformer_layers
    self._embedding_norm_layer = embedding_norm_layer
    self._embedding_layer = embedding_layer_inst
    self._position_embedding_layer = position_embedding_layer
    self._type_embedding_layer = type_embedding_layer
    if embedding_projection is not None:
      self._embedding_projection = embedding_projection

    config_dict = {
        'vocab_size': vocab_size,
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'num_attention_heads': num_attention_heads,
        'max_sequence_length': max_sequence_length,
        'type_vocab_size': type_vocab_size,
        'inner_dim': inner_dim,
        'inner_activation': tf.keras.activations.serialize(activation),
        'output_dropout': output_dropout,
        'attention_dropout': attention_dropout,
        'initializer': tf.keras.initializers.serialize(initializer),
        'output_range': output_range,
        'embedding_width': embedding_width,
        'embedding_layer': embedding_layer,
        'norm_first': norm_first,
        'dict_outputs': dict_outputs,
    }
    # pylint: disable=protected-access
    self._setattr_tracking = False
    self._config = config_dict
    self._setattr_tracking = True
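The comment about slicing in Example #17 is worth a tiny standalone check: subscripting a Keras tensor with `[:, 0, :]` and the `Lambda`-plus-`tf.squeeze` pattern used in several other examples produce the same first-token tensor (toy values below are illustrative only):

import tensorflow as tf

sequence_output = tf.random.uniform((3, 5, 16))  # [batch, seq_length, hidden]

# Subscript slicing, as in Example #17 (compiled into a slicing op layer in a
# Keras functional graph rather than an opaque Python Lambda).
first_token_a = sequence_output[:, 0, :]

# Lambda-style squeeze, as in the pooler of several other examples.
first_token_b = tf.squeeze(sequence_output[:, 0:1, :], axis=1)

tf.debugging.assert_equal(first_token_a, first_token_b)  # same [batch, hidden] tensor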
Example #18
    def __init__(
            self,
            pooled_output_dim,
            pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            embedding_cls=None,
            embedding_cfg=None,
            embedding_data=None,
            num_hidden_instances=1,
            hidden_cls=layers.Transformer,
            hidden_cfg=None,
            return_all_layer_outputs=False,
            dict_outputs=False,
            **kwargs):
        self._self_setattr_tracking = False
        self._hidden_cls = hidden_cls
        self._hidden_cfg = hidden_cfg
        self._num_hidden_instances = num_hidden_instances
        self._pooled_output_dim = pooled_output_dim
        self._pooler_layer_initializer = pooler_layer_initializer
        self._embedding_cls = embedding_cls
        self._embedding_cfg = embedding_cfg
        self._embedding_data = embedding_data
        self._return_all_layer_outputs = return_all_layer_outputs
        self._dict_outputs = dict_outputs
        self._kwargs = kwargs

        if embedding_cls:
            if inspect.isclass(embedding_cls):
                self._embedding_network = embedding_cls(
                    **embedding_cfg) if embedding_cfg else embedding_cls()
            else:
                self._embedding_network = embedding_cls
            inputs = self._embedding_network.inputs
            embeddings, attention_mask = self._embedding_network(inputs)
        else:
            self._embedding_network = None
            seq_length = embedding_cfg.get('seq_length', None)
            word_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                             dtype=tf.int32,
                                             name='input_word_ids')
            mask = tf.keras.layers.Input(shape=(seq_length, ),
                                         dtype=tf.int32,
                                         name='input_mask')
            type_ids = tf.keras.layers.Input(shape=(seq_length, ),
                                             dtype=tf.int32,
                                             name='input_type_ids')
            inputs = [word_ids, mask, type_ids]

            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=embedding_cfg['vocab_size'],
                embedding_width=embedding_cfg['hidden_size'],
                initializer=embedding_cfg['initializer'],
                name='word_embeddings')

            word_embeddings = self._embedding_layer(word_ids)

            # Always uses dynamic slicing for simplicity.
            self._position_embedding_layer = keras_nlp.PositionEmbedding(
                initializer=embedding_cfg['initializer'],
                max_length=embedding_cfg['max_seq_length'],
                name='position_embedding')
            position_embeddings = self._position_embedding_layer(
                word_embeddings)

            self._type_embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=embedding_cfg['type_vocab_size'],
                embedding_width=embedding_cfg['hidden_size'],
                initializer=embedding_cfg['initializer'],
                use_one_hot=True,
                name='type_embeddings')
            type_embeddings = self._type_embedding_layer(type_ids)

            embeddings = tf.keras.layers.Add()(
                [word_embeddings, position_embeddings, type_embeddings])

            self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
                name='embeddings/layer_norm',
                axis=-1,
                epsilon=1e-12,
                dtype=tf.float32)
            embeddings = self._embedding_norm_layer(embeddings)

            embeddings = (tf.keras.layers.Dropout(
                rate=embedding_cfg['dropout_rate'])(embeddings))

            attention_mask = layers.SelfAttentionMask()([embeddings, mask])

        data = embeddings

        layer_output_data = []
        self._hidden_layers = []
        for _ in range(num_hidden_instances):
            if inspect.isclass(hidden_cls):
                layer = hidden_cls(
                    **hidden_cfg) if hidden_cfg else hidden_cls()
            else:
                layer = hidden_cls
            data = layer([data, attention_mask])
            layer_output_data.append(data)
            self._hidden_layers.append(layer)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                layer_output_data[-1]))
        self._pooler_layer = tf.keras.layers.Dense(
            units=pooled_output_dim,
            activation='tanh',
            kernel_initializer=pooler_layer_initializer,
            name='cls_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        if dict_outputs:
            outputs = dict(
                sequence_output=layer_output_data[-1],
                pooled_output=cls_output,
                encoder_outputs=layer_output_data,
            )
        elif return_all_layer_outputs:
            outputs = [layer_output_data, cls_output]
        else:
            outputs = [layer_output_data[-1], cls_output]

        super(EncoderScaffold, self).__init__(inputs=inputs,
                                              outputs=outputs,
                                              **kwargs)

        logging.info('EncoderScaffold configs: %s', self.get_config())
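A minimal construction sketch for the `EncoderScaffold` variant above, using the default embedding path (no `embedding_cls`). The import path and the `hidden_cfg` keys are assumptions based on the tests in Examples #20 and #26, not something this snippet itself guarantees:

import tensorflow as tf
# Assumed TF Model Garden import; adjust to your checkout.
from official.nlp.modeling.networks import encoder_scaffold

initializer = tf.keras.initializers.TruncatedNormal(stddev=0.02)

embedding_cfg = {
    'vocab_size': 30522,
    'type_vocab_size': 2,
    'hidden_size': 128,
    'seq_length': None,        # dynamic sequence length
    'max_seq_length': 512,
    'initializer': initializer,
    'dropout_rate': 0.1,
}
# Config for the default hidden_cls (layers.Transformer).
hidden_cfg = {
    'num_attention_heads': 2,
    'intermediate_size': 512,
    'intermediate_activation': tf.keras.activations.gelu,
    'dropout_rate': 0.1,
    'attention_dropout_rate': 0.1,
    'kernel_initializer': initializer,
}

encoder = encoder_scaffold.EncoderScaffold(
    pooled_output_dim=128,
    num_hidden_instances=2,
    hidden_cfg=hidden_cfg,
    embedding_cfg=embedding_cfg,
    dict_outputs=True)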
Example #19
  def __init__(self,
               vocab_size=33708,
               hidden_size=512,
               dropout_rate=0.0,
               padded_decode=False,
               num_replicas=1,
               decode_batch_size=2048,
               decode_max_length=97,
               dtype=tf.float32,
               extra_decode_length=0,
               num_heads=8,
               num_layers=6,
               beam_size=4,
               alpha=0.6,
               encoder_layer=None,
               decoder_layer=None,
               name=None,
               **kwargs):
    """Initialize layers to build Transformer model.

    Arguments:
      vocab_size: Size of vocabulary.
      hidden_size: Size of hidden layer for embedding.
      dropout_rate: Dropout probability.
      padded_decode: Whether decoding uses fixed max_sequence_length padding.
        If False, no such padding is used.
      num_replicas: Number of replicas for the distribution strategy.
      decode_batch_size: Batch size for decoding.
      decode_max_length: Maximum number of steps to decode a sequence.
      dtype: Data type.
      extra_decode_length: Number of extra steps beam search runs beyond the
        input length.
      num_heads: Number of attention heads.
      num_layers: Number of identical layers in the Transformer architecture.
      beam_size: Number of beams for beam search.
      alpha: The strength of length normalization for beam search.
      encoder_layer: An initialized encoder layer.
      decoder_layer: An initialized decoder layer.
      name: name of the model.
      **kwargs: other keyword arguments.
    """
    super(Seq2SeqTransformer, self).__init__(**kwargs)
    self._vocab_size = vocab_size
    self._hidden_size = hidden_size
    self._dropout_rate = dropout_rate
    self._padded_decode = padded_decode
    self._num_replicas = num_replicas
    self._decode_batch_size = decode_batch_size
    self._decode_max_length = decode_max_length
    self._dtype = dtype
    self._extra_decode_length = extra_decode_length
    self._num_heads = num_heads
    self._num_layers = num_layers
    self._beam_size = beam_size
    self._alpha = alpha
    self.embedding_lookup = layers.OnDeviceEmbedding(
        vocab_size=self._vocab_size,
        embedding_width=self._hidden_size,
        initializer=tf.random_normal_initializer(
            mean=0., stddev=self._hidden_size**-0.5),
        use_scale=True)
    self.encoder_layer = encoder_layer
    self.decoder_layer = decoder_layer
    self.position_embedding = layers.RelativePositionEmbedding(
        hidden_size=self._hidden_size)
    self.encoder_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
    self.decoder_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate)
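The embedding table in Example #19 is initialized with stddev `hidden_size ** -0.5`, the scaled-embedding convention from the original Transformer; `use_scale=True` is what typically rescales the looked-up embeddings by `sqrt(hidden_size)` before they enter the encoder (treat that as an assumption about `OnDeviceEmbedding`, it is not shown in this snippet). A small numeric sketch of the initializer itself:

import tensorflow as tf

hidden_size = 512
stddev = hidden_size ** -0.5          # 1 / sqrt(512) ≈ 0.0442
initializer = tf.random_normal_initializer(mean=0.0, stddev=stddev)

sample = initializer(shape=(64, hidden_size))
print(float(tf.math.reduce_std(sample)))  # roughly 0.044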
Example #20
  def test_serialize_deserialize(self):
    hidden_size = 32
    sequence_length = 21
    vocab_size = 57

    # Build an embedding network to swap in for the default network. This one
    # will have 2 inputs (mask and word_ids) instead of 3, and won't use
    # positional embeddings.

    word_ids = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_word_ids")
    mask = tf.keras.layers.Input(
        shape=(sequence_length,), dtype=tf.int32, name="input_mask")
    embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=vocab_size,
        embedding_width=hidden_size,
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="word_embeddings")
    word_embeddings = embedding_layer(word_ids)
    attention_mask = layers.SelfAttentionMask()([word_embeddings, mask])
    network = tf.keras.Model([word_ids, mask],
                             [word_embeddings, attention_mask])

    hidden_cfg = {
        "num_attention_heads": 2,
        "intermediate_size": 3072,
        "intermediate_activation": activations.gelu,
        "dropout_rate": 0.1,
        "attention_dropout_rate": 0.1,
        "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
    }

    # Create a small EncoderScaffold for testing.
    test_network = encoder_scaffold.EncoderScaffold(
        num_hidden_instances=3,
        pooled_output_dim=hidden_size,
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=0.02),
        hidden_cfg=hidden_cfg,
        embedding_cls=network,
        embedding_data=embedding_layer.embeddings)

    # Create another network object from the first object's config.
    new_network = encoder_scaffold.EncoderScaffold.from_config(
        test_network.get_config())

    # Validate that the config can be forced to JSON.
    _ = new_network.to_json()

    # If the serialization was successful, the new config should match the old.
    self.assertAllEqual(test_network.get_config(), new_network.get_config())

    # Create a model based off of the old and new networks:
    word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)
    mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32)

    data, pooled = new_network([word_ids, mask])
    new_model = tf.keras.Model([word_ids, mask], [data, pooled])

    data, pooled = test_network([word_ids, mask])
    model = tf.keras.Model([word_ids, mask], [data, pooled])

    # Copy the weights between models.
    new_model.set_weights(model.get_weights())

    # Invoke the models.
    batch_size = 3
    word_id_data = np.random.randint(
        vocab_size, size=(batch_size, sequence_length))
    mask_data = np.random.randint(2, size=(batch_size, sequence_length))
    data, cls = model.predict([word_id_data, mask_data])
    new_data, new_cls = new_model.predict([word_id_data, mask_data])

    # The output should be equal.
    self.assertAllEqual(data, new_data)
    self.assertAllEqual(cls, new_cls)

    # We should not be able to get a reference to the embedding data.
    with self.assertRaisesRegex(RuntimeError, ".*does not have a reference.*"):
      new_network.get_embedding_table()
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            output_range=None,
            embedding_width=None,
            net2net_ratio=None,
            net2net_layers=None,
            lightatt_layers=None,
            input_pool_name=None,
            input_pool_size=None,
            **kwargs):
        """Bi-directional Transformer-based encoder network.

    This network implements a bi-directional Transformer-based encoder as
    described in "BERT: Pre-training of Deep Bidirectional Transformers for
    Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the
    embedding lookups and transformer layers, but not the masked language model
    or classification task networks.

    The default values for this object are taken from the BERT-Base
    implementation
    in "BERT: Pre-training of Deep Bidirectional Transformers for Language
    Understanding".

    Arguments:
      vocab_size: The size of the token vocabulary.
      hidden_size: The size of the transformer hidden layers.
      num_layers: The number of transformer layers.
      num_attention_heads: The number of attention heads for each transformer.
        The hidden size must be divisible by the number of attention heads.
      sequence_length: The sequence length that this encoder expects. If None,
        the sequence length is dynamic; if an integer, the encoder will require
        sequences padded to this length.
      max_sequence_length: The maximum sequence length that this encoder can
        consume. If None, max_sequence_length uses the value from sequence
        length. This determines the variable shape for positional embeddings.
      type_vocab_size: The number of types that the 'type_ids' input can take.
      intermediate_size: The intermediate size for the transformer layers.
      activation: The activation to use for the transformer layers.
      dropout_rate: The dropout rate to use for the transformer layers.
      attention_dropout_rate: The dropout rate to use for the attention layers
        within the transformer layers.
      initializer: The initializer to use for all weights in this encoder.
      return_all_encoder_outputs: Whether to output sequence embedding outputs
        of all encoder transformer layers.
      output_range: The sequence output range, [0, output_range), obtained by
        slicing the target sequence of the last transformer layer. `None` means
        the entire target sequence will attend to the source sequence, which
        yields the full output.
      embedding_width: The width of the word embeddings. If the embedding width
        is not equal to hidden size, embedding parameters will be factorized
        into two matrices in the shape of ['vocab_size', 'embedding_width'] and
        ['embedding_width', 'hidden_size'] ('embedding_width' is usually much
        smaller than 'hidden_size').
      net2net_ratio: The net2net ratio for the small fully connected matrices.
      net2net_layers: Number of layers with net2net treatment.
      lightatt_layers: Number of layers with light attention.
      input_pool_name: Name of the input pooling method.
      input_pool_size: Pooling size applied to the input sequence.
      **kwargs: Additional keyword arguments.
    """
        super(TransformerEncoder, self).__init__()

        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self.net2net_ratio = net2net_ratio
        self.net2net_layers = net2net_layers
        self.lightatt_layers = lightatt_layers
        self.input_pool_name = input_pool_name
        self.input_pool_size = input_pool_size

        if embedding_width is None:
            embedding_width = hidden_size
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
            'output_range': output_range,
            'embedding_width': embedding_width,
            'net2net_ratio': net2net_ratio,
            'net2net_layers': net2net_layers,
            'lightatt_layers': lightatt_layers,
            'input_pool_name': input_pool_name,
            'input_pool_size': input_pool_size,
        }

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = position_embedding.PositionEmbedding(
            embed_dim=hidden_size,
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length,
            name='position_embedding')
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._dropout_layer = tf.keras.layers.Dropout(rate=dropout_rate)

        self._embedding_projection_layer = tf.keras.layers.experimental.EinsumDense(
            '...x,xy->...y',
            output_shape=hidden_size,
            bias_axes='y',
            kernel_initializer=initializer,
            name='embedding_projection')

        self._self_attention_mask_layer = layers.SelfAttentionMask()

        self._transformer_layers = []
        logging.info('Building transformer layers.')
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None

            group_size = num_layers // net2net_layers if net2net_layers is not None else None
            layer_net2net_ratio = None if (
                net2net_layers is None
                or i % group_size != 0) else net2net_ratio

            group_size = num_layers // lightatt_layers if lightatt_layers is not None else None
            use_lightatt = False if (lightatt_layers is None
                                     or i % group_size !=
                                     (group_size - 1)) else True

            logging.info('layer %d: net2net_ratio=%s, use_lightatt=%s', i,
                         layer_net2net_ratio, use_lightatt)
            layer = transformer_layer.TransformerLayer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i,
                use_lightatt=use_lightatt,
                net2net_ratio=layer_net2net_ratio)
            self._transformer_layers.append(layer)
        logging.info('Finished building transformer layers.')

        self._squeeze_layer = tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')

        nocls = input_pool_name != 'concat'
        input_pool_size = 1 if input_pool_name is None else input_pool_size
        self._mask_resolution_layer = resolution_layer.MaskPoolLayer(
            input_pool_size, nocls=nocls, name='mask_resolution')
        self._embed_resolution_layer = resolution_layer.EmbedPoolLayer(
            hidden_size,
            input_pool_size,
            input_pool_name,
            name='embed_resolution')
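The per-layer `net2net` and light-attention choices in the encoder above are decided purely by index arithmetic. A standalone sketch of that selection logic with toy settings (12 layers, `net2net_layers=3`, `lightatt_layers=2`) so the pattern is easy to see:

num_layers = 12
net2net_layers = 3       # every (num_layers // net2net_layers)-th layer gets net2net
lightatt_layers = 2      # last layer of each (num_layers // lightatt_layers) group uses light attention
net2net_ratio = 0.25

for i in range(num_layers):
    group_size = num_layers // net2net_layers
    layer_net2net_ratio = net2net_ratio if i % group_size == 0 else None

    group_size = num_layers // lightatt_layers
    use_lightatt = i % group_size == group_size - 1

    print(i, layer_net2net_ratio, use_lightatt)
# net2net applies to layers 0, 4, and 8; light attention to layers 5 and 11.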
Example #22
    def __init__(
            self,
            vocab_size: int,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: _Activation = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            with_dense_inputs: bool = False,
            **kwargs):
        # Pops kwargs that are used in V1 implementation.
        if 'dict_outputs' in kwargs:
            kwargs.pop('dict_outputs')
        if 'return_all_encoder_outputs' in kwargs:
            kwargs.pop('return_all_encoder_outputs')
        if 'intermediate_size' in kwargs:
            inner_dim = kwargs.pop('intermediate_size')
        if 'activation' in kwargs:
            inner_activation = kwargs.pop('activation')
        if 'dropout_rate' in kwargs:
            output_dropout = kwargs.pop('dropout_rate')
        if 'attention_dropout_rate' in kwargs:
            attention_dropout = kwargs.pop('attention_dropout_rate')
        super().__init__(**kwargs)

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'with_dense_inputs': with_dense_inputs,
        }
        if with_dense_inputs:
            self.inputs = dict(
                input_word_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_inputs=tf.keras.Input(shape=(None, embedding_width),
                                            dtype=tf.float32),
                dense_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
            )
        else:
            self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                             dtype=tf.int32),
                               input_mask=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                               input_type_ids=tf.keras.Input(shape=(None, ),
                                                             dtype=tf.int32))
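`layers.SelfAttentionMask` in Example #22 above turns the per-token padding mask into a [batch, seq, seq] attention mask that every transformer block consumes. A conceptually equivalent sketch with plain TensorFlow ops (this is not the Model Garden layer itself, just the broadcasting it performs):

import tensorflow as tf

# One sequence of length 5 whose last two positions are padding.
padding_mask = tf.constant([[1, 1, 1, 0, 0]], dtype=tf.int32)   # [batch, to_seq]
seq_length = padding_mask.shape[1]

# Broadcast to [batch, from_seq, to_seq]: every query position is allowed to
# attend only to the non-padded key positions.
attention_mask = tf.ones((1, seq_length, 1), tf.float32) * tf.cast(
    padding_mask[:, tf.newaxis, :], tf.float32)

print(attention_mask.shape)  # (1, 5, 5)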
    def __init__(
            self,
            vocab_size: int,
            attention_window: Union[List[int], int] = 512,
            global_attention_size: int = 0,
            pad_token_id: int = 1,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: Callable[..., Any] = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            **kwargs):
        super().__init__(**kwargs)
        # Longformer args
        self._attention_window = attention_window
        self._global_attention_size = global_attention_size
        self._pad_token_id = pad_token_id

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        # `attention_window` may be a single int; expand it to a per-layer list
        # so `attention_window[i]` below is valid for every layer.
        if isinstance(attention_window, int):
            attention_window = [attention_window] * num_layers
        for i in range(num_layers):
            layer = LongformerEncoderBlock(
                global_attention_size=global_attention_size,
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                attention_window=attention_window[i],
                layer_id=i,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=initializer,
                name=f'transformer/layer_{i}')
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'attention_window': attention_window,
            'global_attention_size': global_attention_size,
            'pad_token_id': pad_token_id,
        }
        self.inputs = dict(input_word_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32),
                           input_mask=tf.keras.Input(shape=(None, ),
                                                     dtype=tf.int32),
                           input_type_ids=tf.keras.Input(shape=(None, ),
                                                         dtype=tf.int32))
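The loop in the Longformer-style constructor above indexes `attention_window[i]`, so a single `int` argument has to be expanded to a per-layer list first (see the normalization added before the loop). A tiny standalone sketch of that expansion, with a hypothetical helper name:

from typing import List, Union

def per_layer_windows(attention_window: Union[List[int], int],
                      num_layers: int) -> List[int]:
    """Expands a single window size to one entry per layer; lists pass through."""
    if isinstance(attention_window, int):
        return [attention_window] * num_layers
    assert len(attention_window) == num_layers
    return list(attention_window)

print(per_layer_windows(128, 4))                  # [128, 128, 128, 128]
print(per_layer_windows([64, 64, 128, 128], 4))   # unchanged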
Example #24
  def __init__(
      self,
      num_output_classes,
      classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=0.02),
      embedding_cls=None,
      embedding_cfg=None,
      embedding_data=None,
      num_hidden_instances=1,
      hidden_cls=layers.Transformer,
      hidden_cfg=None,
      **kwargs):
    print(embedding_cfg)
    self._self_setattr_tracking = False
    self._hidden_cls = hidden_cls
    self._hidden_cfg = hidden_cfg
    self._num_hidden_instances = num_hidden_instances
    self._num_output_classes = num_output_classes
    self._classification_layer_initializer = classification_layer_initializer
    self._embedding_cls = embedding_cls
    self._embedding_cfg = embedding_cfg
    self._embedding_data = embedding_data
    self._kwargs = kwargs

    if embedding_cls:
      if inspect.isclass(embedding_cls):
        self._embedding_network = embedding_cls(embedding_cfg)
      else:
        self._embedding_network = embedding_cls
      inputs = self._embedding_network.inputs
      embeddings, mask = self._embedding_network(inputs)
    else:
      self._embedding_network = None
      word_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_word_ids')
      mask = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_mask')
      type_ids = tf.keras.layers.Input(
          shape=(embedding_cfg['seq_length'],),
          dtype=tf.int32,
          name='input_type_ids')
      inputs = [word_ids, mask, type_ids]

      self._embedding_layer = layers.OnDeviceEmbedding(
          vocab_size=embedding_cfg['vocab_size'],
          embedding_width=embedding_cfg['hidden_size'],
          initializer=embedding_cfg['initializer'],
          name='word_embeddings')

      word_embeddings = self._embedding_layer(word_ids)

      # Always uses dynamic slicing for simplicity.
      self._position_embedding_layer = layers.PositionEmbedding(
          initializer=embedding_cfg['initializer'],
          use_dynamic_slicing=True,
          max_sequence_length=embedding_cfg['max_seq_length'])
      position_embeddings = self._position_embedding_layer(word_embeddings)

      type_embeddings = (
          layers.OnDeviceEmbedding(
              vocab_size=embedding_cfg['type_vocab_size'],
              embedding_width=embedding_cfg['hidden_size'],
              initializer=embedding_cfg['initializer'],
              use_one_hot=True,
              name='type_embeddings')(type_ids))

      embeddings = tf.keras.layers.Add()(
          [word_embeddings, position_embeddings, type_embeddings])
      embeddings = (
          tf.keras.layers.LayerNormalization(
              name='embeddings/layer_norm',
              axis=-1,
              epsilon=1e-12,
              dtype=tf.float32)(embeddings))
      embeddings = (
          tf.keras.layers.Dropout(
              rate=embedding_cfg['dropout_rate'])(embeddings))

    attention_mask = layers.SelfAttentionMask()([embeddings, mask])
    data = embeddings

    for _ in range(num_hidden_instances):
      if inspect.isclass(hidden_cls):
        layer = self._hidden_cls(**hidden_cfg)
      else:
        layer = self._hidden_cls
      data = layer([data, attention_mask])

    first_token_tensor = (
        tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data)
    )
    cls_output = tf.keras.layers.Dense(
        units=num_output_classes,
        activation='tanh',
        kernel_initializer=classification_layer_initializer,
        name='cls_transform')(
            first_token_tensor)

    super(EncoderScaffold, self).__init__(
        inputs=inputs, outputs=[data, cls_output], **kwargs)
Example #25
  def __init__(self,
               vocab_size,
               type_vocab_size,
               hidden_size,
               max_seq_length,
               initializer,
               dropout_rate,
               use_position_id=False,
               pack_multiple_sequences=False,
               **kwargs):
    initializer = tf.keras.initializers.get(initializer)
    config_dict = {
        'vocab_size': vocab_size,
        'type_vocab_size': type_vocab_size,
        'hidden_size': hidden_size,
        'max_seq_length': max_seq_length,
        'initializer': tf.keras.initializers.serialize(initializer),
        'dropout_rate': dropout_rate,
        'use_position_id': use_position_id,
        'pack_multiple_sequences': pack_multiple_sequences,
    }

    word_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_word_ids')
    mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_mask')
    type_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_type_ids')
    inputs = {
        'input_word_ids': word_ids,
        'input_mask': mask,
        'input_type_ids': type_ids,
    }
    if use_position_id:
      position_ids = tf.keras.layers.Input(
          shape=(None,), dtype=tf.int32, name='position_ids')
      inputs['position_ids'] = position_ids
    else:
      position_ids = None

    if pack_multiple_sequences:
      sub_seq_mask = PackedSequenceMask()(word_ids)
    else:
      sub_seq_mask = None

    embedding_layer = layers.OnDeviceEmbedding(
        vocab_size=vocab_size,
        embedding_width=hidden_size,
        initializer=initializer,
        name='word_embeddings')
    word_embeddings = embedding_layer(word_ids)

    # Always uses dynamic slicing for simplicity.
    position_embedding_layer = PositionEmbeddingWithSubSeqMask(
        initializer=initializer,
        use_dynamic_slicing=True,
        max_sequence_length=max_seq_length,
        name='position_embedding')
    position_embeddings = position_embedding_layer(
        word_embeddings, position_ids, sub_seq_mask)

    type_embeddings = (
        layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

    embeddings = tf.keras.layers.Add()(
        [word_embeddings, position_embeddings, type_embeddings])
    embeddings = tf.keras.layers.LayerNormalization(
        name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)(
            embeddings)
    embeddings = tf.keras.layers.Dropout(
        rate=dropout_rate, dtype=tf.float32)(
            embeddings)

    attention_mask = layers.SelfAttentionMask()([embeddings, mask])
    if sub_seq_mask is not None:
      attention_mask = tf.keras.layers.Lambda(
          lambda x: x[0] * tf.cast(x[1], x[0].dtype))(
              [attention_mask, sub_seq_mask])

    outputs = [embeddings, attention_mask]
    super(PackedSequenceEmbedding, self).__init__(
        inputs=inputs, outputs=outputs, **kwargs)
    # TF does not track immutable attrs which do not contain Trackables,
    # so by creating a config namedtuple instead of a dict we avoid tracking it.
    config_cls = collections.namedtuple('Config', config_dict.keys())
    self._config = config_cls(**config_dict)
    self._embedding_layer = embedding_layer
    self._position_embedding_layer = position_embedding_layer
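The comment at the end of Example #25 describes a general Keras/TF pattern: freezing the config into an immutable namedtuple keeps the model's attribute tracking from trying to track a plain dict attribute. A standalone sketch of the trick:

import collections

config_dict = {
    'vocab_size': 30522,
    'hidden_size': 768,
    'dropout_rate': 0.1,
}

# Freeze the dict into a namedtuple; immutable containers that hold no
# Trackable objects are left alone by Keras/TF attribute tracking.
Config = collections.namedtuple('Config', config_dict.keys())
config = Config(**config_dict)

print(config.hidden_size)                 # 768
print(config._asdict() == config_dict)    # True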
Example #26
    def test_network_invocation(self):
        hidden_size = 32
        sequence_length = 21
        vocab_size = 57

        # Build an embedding network to swap in for the default network. This one
        # will have 2 inputs (mask and word_ids) instead of 3, and won't use
        # positional embeddings.

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name="input_word_ids")
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name="input_mask")
        embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            name="word_embeddings")
        word_embeddings = embedding_layer(word_ids)
        network = tf.keras.Model([word_ids, mask], [word_embeddings, mask])

        hidden_cfg = {
            "num_attention_heads": 2,
            "intermediate_size": 3072,
            "intermediate_activation": activations.gelu,
            "dropout_rate": 0.1,
            "attention_dropout_rate": 0.1,
            "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02),
        }

        # Create a small EncoderScaffold for testing.
        test_network = encoder_scaffold.EncoderScaffold(
            num_hidden_instances=3,
            num_output_classes=hidden_size,
            classification_layer_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            hidden_cfg=hidden_cfg,
            embedding_cls=network,
            embedding_data=embedding_layer.embeddings)

        # Create the inputs (note that the first dimension is implicit).
        word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32)
        data, pooled = test_network([word_ids, mask])

        # Create a model based off of this network:
        model = tf.keras.Model([word_ids, mask], [data, pooled])

        # Invoke the model. We can't validate the output data here (the model is too
        # complex) but this will catch structural runtime errors.
        batch_size = 3
        word_id_data = np.random.randint(vocab_size,
                                         size=(batch_size, sequence_length))
        mask_data = np.random.randint(2, size=(batch_size, sequence_length))
        _ = model.predict([word_id_data, mask_data])

        # Test that we can get the embedding data that we passed to the object. This
        # is necessary to support standard language model training.
        self.assertIs(embedding_layer.embeddings,
                      test_network.get_embedding_table())
Example #27
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            return_all_encoder_outputs=False,
            output_range=None,
            embedding_width=None,
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'return_all_encoder_outputs': return_all_encoder_outputs,
            'output_range': output_range,
            'embedding_width': embedding_width,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        if embedding_width is None:
            embedding_width = hidden_size
        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length,
            name='position_embedding')
        position_embeddings = self._position_embedding_layer(word_embeddings)
        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')
        type_embeddings = self._type_embedding_layer(type_ids)

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])

        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings))

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')
            embeddings = self._embedding_projection(embeddings)

        self._transformer_layers = []
        data = embeddings
        attention_mask = layers.SelfAttentionMask()([data, mask])
        encoder_outputs = []
        for i in range(num_layers):
            if i == num_layers - 1 and output_range is not None:
                transformer_output_range = output_range
            else:
                transformer_output_range = None
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                output_range=transformer_output_range,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)
            data = layer([data, attention_mask])
            encoder_outputs.append(data)

        first_token_tensor = (
            tf.keras.layers.Lambda(lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(
                encoder_outputs[-1]))
        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        cls_output = self._pooler_layer(first_token_tensor)

        if return_all_encoder_outputs:
            outputs = [encoder_outputs, cls_output]
        else:
            outputs = [encoder_outputs[-1], cls_output]

        super(TransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=outputs,
                             **kwargs)
Example #28
    def __init__(
            self,
            vocab_size: int,
            hidden_size: int = 768,
            num_layers: int = 12,
            num_attention_heads: int = 12,
            max_sequence_length: int = 512,
            type_vocab_size: int = 16,
            inner_dim: int = 3072,
            inner_activation: _Activation = _approx_gelu,
            output_dropout: float = 0.1,
            attention_dropout: float = 0.1,
            token_loss_init_value: float = 10.0,
            token_loss_beta: float = 0.995,
            token_keep_k: int = 256,
            token_allow_list: Tuple[int, ...] = (100, 101, 102, 103),
            token_deny_list: Tuple[int, ...] = (0, ),
            initializer: _Initializer = tf.keras.initializers.TruncatedNormal(
                stddev=0.02),
            output_range: Optional[int] = None,
            embedding_width: Optional[int] = None,
            embedding_layer: Optional[tf.keras.layers.Layer] = None,
            norm_first: bool = False,
            with_dense_inputs: bool = False,
            **kwargs):
        # Pops kwargs that are used in V1 implementation.
        if 'dict_outputs' in kwargs:
            kwargs.pop('dict_outputs')
        if 'return_all_encoder_outputs' in kwargs:
            kwargs.pop('return_all_encoder_outputs')
        if 'intermediate_size' in kwargs:
            inner_dim = kwargs.pop('intermediate_size')
        if 'activation' in kwargs:
            inner_activation = kwargs.pop('activation')
        if 'dropout_rate' in kwargs:
            output_dropout = kwargs.pop('dropout_rate')
        if 'attention_dropout_rate' in kwargs:
            attention_dropout = kwargs.pop('attention_dropout_rate')
        super().__init__(**kwargs)

        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=tf_utils.clone_initializer(initializer),
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=tf_utils.clone_initializer(initializer),
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=tf_utils.clone_initializer(initializer),
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=tf_utils.clone_initializer(initializer),
                name='embedding_projection')
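            # e.g. with embedding_width=128 and hidden_size=768 this maps
            # (batch, seq, 128) -> (batch, seq, 768) via '...x,xy->...y',
            # i.e. an ALBERT-style factorized embedding parameterization.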

        # In BERT's vocabulary the first 999 ids are special or unused tokens
        # (e.g. [PAD], [CLS], [SEP]). We always want to mask (drop) [PAD] and
        # never mask [CLS] or [SEP].
        init_importance = tf.constant(token_loss_init_value,
                                      shape=(vocab_size,))
        if token_allow_list:
            init_importance = tf.tensor_scatter_nd_update(
                tensor=init_importance,
                indices=[[x] for x in token_allow_list],
                updates=[1.0e4 for x in token_allow_list])
        if token_deny_list:
            init_importance = tf.tensor_scatter_nd_update(
                tensor=init_importance,
                indices=[[x] for x in token_deny_list],
                updates=[-1.0e4 for x in token_deny_list])
        self._token_importance_embed = layers.TokenImportanceWithMovingAvg(
            vocab_size=vocab_size,
            init_importance=init_importance,
            moving_average_beta=token_loss_beta)

        self._token_separator = layers.SelectTopK(top_k=token_keep_k)
        self._transformer_layers = []
        self._num_layers = num_layers
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=tf_utils.clone_initializer(initializer),
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=tf_utils.clone_initializer(initializer),
            name='pooler_transform')

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'token_loss_init_value': token_loss_init_value,
            'token_loss_beta': token_loss_beta,
            'token_keep_k': token_keep_k,
            'token_allow_list': token_allow_list,
            'token_deny_list': token_deny_list,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'with_dense_inputs': with_dense_inputs,
        }
        if with_dense_inputs:
            self.inputs = dict(
                input_word_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                input_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_inputs=tf.keras.Input(shape=(None, embedding_width),
                                            dtype=tf.float32),
                dense_mask=tf.keras.Input(shape=(None, ), dtype=tf.int32),
                dense_type_ids=tf.keras.Input(shape=(None, ), dtype=tf.int32),
            )
        else:
            self.inputs = dict(
                input_word_ids=tf.keras.Input(shape=(None,), dtype=tf.int32),
                input_mask=tf.keras.Input(shape=(None,), dtype=tf.int32),
                input_type_ids=tf.keras.Input(shape=(None,), dtype=tf.int32))
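A minimal sketch of how this encoder might be instantiated. The enclosing class name is not visible in the snippet, so `TokenDropBertEncoder` below is a placeholder; the keyword arguments follow the __init__ signature above, and the output structure depends on the (unshown) call().

import tensorflow as tf

encoder = TokenDropBertEncoder(            # placeholder class name
    vocab_size=30522,
    hidden_size=768,
    num_layers=12,
    token_keep_k=256,                      # keep the 256 highest-importance tokens
    token_allow_list=(100, 101, 102, 103),  # [UNK], [CLS], [SEP], [MASK] in the standard BERT vocab
    token_deny_list=(0,))                  # [PAD]

inputs = dict(
    input_word_ids=tf.ones((2, 128), dtype=tf.int32),
    input_mask=tf.ones((2, 128), dtype=tf.int32),
    input_type_ids=tf.zeros((2, 128), dtype=tf.int32))
outputs = encoder(inputs)                  # structure defined by the class's call()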
Example #29
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            sequence_length=512,
            max_sequence_length=None,
            type_vocab_size=16,
            intermediate_size=3072,
            activation=activations.gelu,
            dropout_rate=0.1,
            attention_dropout_rate=0.1,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            float_dtype='float32',
            **kwargs):
        activation = tf.keras.activations.get(activation)
        initializer = tf.keras.initializers.get(initializer)

        if not max_sequence_length:
            max_sequence_length = sequence_length
        self._self_setattr_tracking = False
        self._config_dict = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'sequence_length': sequence_length,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'intermediate_size': intermediate_size,
            'activation': tf.keras.activations.serialize(activation),
            'dropout_rate': dropout_rate,
            'attention_dropout_rate': attention_dropout_rate,
            'initializer': tf.keras.initializers.serialize(initializer),
            'float_dtype': float_dtype,
        }

        word_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_word_ids')
        mask = tf.keras.layers.Input(shape=(sequence_length, ),
                                     dtype=tf.int32,
                                     name='input_mask')
        type_ids = tf.keras.layers.Input(shape=(sequence_length, ),
                                         dtype=tf.int32,
                                         name='input_type_ids')

        self._embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            name='word_embeddings')
        word_embeddings = self._embedding_layer(word_ids)

        # Always uses dynamic slicing for simplicity.
        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            use_dynamic_slicing=True,
            max_sequence_length=max_sequence_length)
        position_embeddings = self._position_embedding_layer(word_embeddings)

        type_embeddings = (layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=hidden_size,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')(type_ids))

        embeddings = tf.keras.layers.Add()(
            [word_embeddings, position_embeddings, type_embeddings])
        embeddings = (tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)(embeddings))
        embeddings = (tf.keras.layers.Dropout(rate=dropout_rate,
                                              dtype=tf.float32)(embeddings))

        if float_dtype == 'float16':
            embeddings = tf.cast(embeddings, tf.float16)

        data = embeddings
        attention_mask = MakeAttentionMaskLayer()([data, mask])
        for i in range(num_layers):
            layer = layers.Transformer(
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                intermediate_activation=activation,
                dropout_rate=dropout_rate,
                attention_dropout_rate=attention_dropout_rate,
                kernel_initializer=initializer,
                dtype=float_dtype,
                name='transformer/layer_%d' % i)
            data = layer([data, attention_mask])

        first_token_tensor = (tf.keras.layers.Lambda(
            lambda x: tf.squeeze(x[:, 0:1, :], axis=1))(data))
        cls_output = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')(first_token_tensor)

        super(TransformerEncoder,
              self).__init__(inputs=[word_ids, mask, type_ids],
                             outputs=[data, cls_output],
                             **kwargs)
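A usage sketch for this older, fixed-length variant. It assumes the class is TransformerEncoder (per the super() call) and that `layers`, `activations`, and MakeAttentionMaskLayer are importable from the surrounding codebase; unlike Example #28, the sequence length is baked into the Input layers at construction time.

import tensorflow as tf

encoder = TransformerEncoder(
    vocab_size=30522,
    hidden_size=768,
    num_layers=12,
    sequence_length=128,          # inputs are built with this fixed length
    float_dtype='float32')

word_ids = tf.ones((4, 128), dtype=tf.int32)
mask = tf.ones((4, 128), dtype=tf.int32)
type_ids = tf.zeros((4, 128), dtype=tf.int32)

# Outputs are [data, cls_output] per the super().__init__ call above.
sequence_output, cls_output = encoder([word_ids, mask, type_ids])
# sequence_output: (4, 128, 768); cls_output: (4, 768)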
Example #30
    def __init__(
            self,
            vocab_size,
            hidden_size=768,
            num_layers=12,
            num_attention_heads=12,
            max_sequence_length=512,
            type_vocab_size=16,
            inner_dim=3072,
            inner_activation=lambda x: tf.keras.activations.gelu(
                x, approximate=True),
            output_dropout=0.1,
            attention_dropout=0.1,
            pool_type='max',
            pool_stride=2,
            unpool_length=0,
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
            output_range=None,
            embedding_width=None,
            embedding_layer=None,
            norm_first=False,
            **kwargs):
        super().__init__(**kwargs)
        activation = tf.keras.activations.get(inner_activation)
        initializer = tf.keras.initializers.get(initializer)

        if embedding_width is None:
            embedding_width = hidden_size

        if embedding_layer is None:
            self._embedding_layer = layers.OnDeviceEmbedding(
                vocab_size=vocab_size,
                embedding_width=embedding_width,
                initializer=initializer,
                name='word_embeddings')
        else:
            self._embedding_layer = embedding_layer

        self._position_embedding_layer = layers.PositionEmbedding(
            initializer=initializer,
            max_length=max_sequence_length,
            name='position_embedding')

        self._type_embedding_layer = layers.OnDeviceEmbedding(
            vocab_size=type_vocab_size,
            embedding_width=embedding_width,
            initializer=initializer,
            use_one_hot=True,
            name='type_embeddings')

        self._embedding_norm_layer = tf.keras.layers.LayerNormalization(
            name='embeddings/layer_norm',
            axis=-1,
            epsilon=1e-12,
            dtype=tf.float32)

        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=output_dropout, name='embedding_dropout')

        # We project the 'embedding' output to 'hidden_size' if it is not already
        # 'hidden_size'.
        self._embedding_projection = None
        if embedding_width != hidden_size:
            self._embedding_projection = tf.keras.layers.experimental.EinsumDense(
                '...x,xy->...y',
                output_shape=hidden_size,
                bias_axes='y',
                kernel_initializer=initializer,
                name='embedding_projection')

        self._transformer_layers = []
        self._attention_mask_layer = layers.SelfAttentionMask(
            name='self_attention_mask')
        for i in range(num_layers):
            layer = layers.TransformerEncoderBlock(
                num_attention_heads=num_attention_heads,
                inner_dim=inner_dim,
                inner_activation=inner_activation,
                output_dropout=output_dropout,
                attention_dropout=attention_dropout,
                norm_first=norm_first,
                output_range=output_range if i == num_layers - 1 else None,
                kernel_initializer=initializer,
                name='transformer/layer_%d' % i)
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=hidden_size,
            activation='tanh',
            kernel_initializer=initializer,
            name='pooler_transform')
        if isinstance(pool_stride, int):
            # TODO(b/197133196): Pooling layer can be shared.
            pool_strides = [pool_stride] * num_layers
        else:
            if len(pool_stride) != num_layers:
                raise ValueError(
                    'len(pool_stride) must equal num_layers.')
            pool_strides = pool_stride
        # TODO(crickwu): explore tf.keras.layers.serialize method.
        if pool_type == 'max':
            pool_cls = tf.keras.layers.MaxPooling1D
        elif pool_type == 'avg':
            pool_cls = tf.keras.layers.AveragePooling1D
        else:
            raise ValueError('pool_type not supported.')
        self._att_input_pool_layers = []
        for layer_pool_stride in pool_strides:
            att_input_pool_layer = pool_cls(pool_size=layer_pool_stride,
                                            strides=layer_pool_stride,
                                            padding='same',
                                            name='att_input_pool_layer')
            self._att_input_pool_layers.append(att_input_pool_layer)

        self._pool_strides = pool_strides  # This is a list here.
        self._unpool_length = unpool_length

        self._config = {
            'vocab_size': vocab_size,
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'num_attention_heads': num_attention_heads,
            'max_sequence_length': max_sequence_length,
            'type_vocab_size': type_vocab_size,
            'inner_dim': inner_dim,
            'inner_activation': tf.keras.activations.serialize(activation),
            'output_dropout': output_dropout,
            'attention_dropout': attention_dropout,
            'initializer': tf.keras.initializers.serialize(initializer),
            'output_range': output_range,
            'embedding_width': embedding_width,
            'embedding_layer': embedding_layer,
            'norm_first': norm_first,
            'pool_type': pool_type,
            'pool_stride': pool_stride,
            'unpool_length': unpool_length,
        }
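A short sketch of the sequence-length schedule implied by `pool_stride` in this funnel-style encoder. This is an assumption about the (unshown) call(): it presumes 'same'-padded pooling with pool_size == stride and that the first `unpool_length` tokens are left unpooled.

import math

def pooled_lengths(seq_len, pool_strides, unpool_length=0):
    """Returns the assumed attention-input length before each layer."""
    lengths = []
    current = seq_len
    for stride in pool_strides:
        # 'same' padding: pooled length is ceil(length / stride).
        pooled = unpool_length + math.ceil((current - unpool_length) / stride)
        lengths.append(pooled)
        current = pooled
    return lengths

print(pooled_lengths(128, [2] * 4))  # [64, 32, 16, 8]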