def __init__(
    self,
    bert_layer,
    **kwargs,
):
    super().__init__(**kwargs)
    self.bert_layer = bert_layer
    self.bert_config = self.bert_layer.params
    assert not self.is_hf, "TODO: Support RoBERTa MLM models."

    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    ns = f"{self.bert_layer.name}/cls/predictions/transform"
    self.dense = tf.keras.layers.Dense(
        units=self.hidden_size,
        # TODO: Add the initializer.
        # kernel_initializer=self.create_initializer(),
        name=f"{ns}/dense",
    )
    self.layer_norm = pf.LayerNormalization(name=f"{ns}/LayerNorm")

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    self.output_bias = self.add_weight(
        f"{self.name}/{self.bert_layer.name}/cls/predictions/output_bias",
        shape=[self.vocab_size],
        initializer=tf.zeros_initializer(),
    )
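# --- Standalone sketch (not part of the class above) -------------------------
# Illustrates, under assumed shapes and names, what the MLM head configured in
# __init__ computes at call time: the extra dense + LayerNorm transform, then
# vocabulary logits from a matmul against the shared input embedding matrix
# (weight tying) plus the output-only bias. `transform_dense`,
# `embedding_matrix`, and the sizes below are hypothetical, for illustration
# only; they are not the library's API.
import tensorflow as tf

batch, seq_len, hidden_size, vocab_size = 2, 8, 768, 30522

hidden_states = tf.random.normal([batch, seq_len, hidden_size])   # encoder output
embedding_matrix = tf.random.normal([vocab_size, hidden_size])    # tied word embeddings

transform_dense = tf.keras.layers.Dense(hidden_size, activation=tf.nn.gelu)
layer_norm = tf.keras.layers.LayerNormalization()
output_bias = tf.zeros([vocab_size])

x = layer_norm(transform_dense(hidden_states))                    # [batch, seq, hidden]
logits = tf.matmul(x, embedding_matrix, transpose_b=True) + output_bias  # [batch, seq, vocab]
# ------------------------------------------------------------------------------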
def build(self, input_shape):
    if isinstance(input_shape, list):
        assert len(input_shape) == 2
        input_ids_shape, token_type_ids_shape = input_shape
        self.input_spec = [keras.layers.InputSpec(shape=input_ids_shape),
                           keras.layers.InputSpec(shape=token_type_ids_shape)]
    else:
        input_ids_shape = input_shape
        self.input_spec = keras.layers.InputSpec(shape=input_ids_shape)

    # use either hidden_size for BERT or embedding_size for ALBERT
    embedding_size = self.params.hidden_size if self.params.embedding_size is None else self.params.embedding_size

    self.word_embeddings_layer = keras.layers.Embedding(
        input_dim=self.params.vocab_size,
        output_dim=embedding_size,
        mask_zero=True,
        name="word_embeddings"
    )
    if self.params.extra_tokens_vocab_size is not None:
        self.extra_word_embeddings_layer = keras.layers.Embedding(
            input_dim=self.params.extra_tokens_vocab_size + 1,  # +1 is for a <pad>/0 vector
            output_dim=embedding_size,
            mask_zero=True,
            name="extra_word_embeddings"
        )

    if self.params.embedding_size is not None:
        # ALBERT word embeddings projection
        self.word_embeddings_2_layer = self.add_weight(
            name="word_embeddings_2/embeddings",
            shape=[self.params.embedding_size, self.params.hidden_size],
            dtype=K.floatx())
        if self.params.project_embeddings_with_bias:
            self.word_embeddings_2_layer_bias = self.add_weight(
                name="word_embeddings_2/bias",
                shape=[self.params.hidden_size],
                dtype=K.floatx())

    position_embedding_size = embedding_size if self.params.project_position_embeddings else self.params.hidden_size

    if self.params.use_token_type:
        self.token_type_embeddings_layer = keras.layers.Embedding(
            input_dim=self.params.token_type_vocab_size,
            output_dim=position_embedding_size,
            mask_zero=False,
            name="token_type_embeddings"
        )
    if self.params.use_position_embeddings:
        self.position_embeddings_layer = PositionEmbeddingLayer.from_params(
            self.params,
            name="position_embeddings",
            hidden_size=position_embedding_size
        )

    self.layer_norm_layer = pf.LayerNormalization(name="LayerNorm")
    self.dropout_layer = keras.layers.Dropout(rate=self.params.hidden_dropout)

    super(BertEmbeddingsLayer, self).build(input_shape)
def build(self, input_shape):
    if isinstance(input_shape, list):
        assert len(input_shape) == 2
        input_ids_shape, token_type_ids_shape = input_shape
        self.input_spec = [tf.keras.layers.InputSpec(shape=input_ids_shape),
                           tf.keras.layers.InputSpec(shape=token_type_ids_shape)]
    else:
        input_ids_shape = input_shape
        self.input_spec = tf.keras.layers.InputSpec(shape=input_ids_shape)

    # use either hidden_size for BERT or embedding_size for ALBERT
    embedding_size = self.params.hidden_size if self.params.embedding_size is None else self.params.embedding_size

    self.word_embeddings_layer = tf.keras.layers.Embedding(
        input_dim=self.params.vocab_size,
        output_dim=embedding_size,
        mask_zero=self.params.mask_zero,
        name="word_embeddings"
    )
    if self.params.extra_tokens_vocab_size is not None:
        self.extra_word_embeddings_layer = tf.keras.layers.Embedding(
            input_dim=self.params.extra_tokens_vocab_size + 1,  # +1 is for a <pad>/0 vector
            output_dim=embedding_size,
            mask_zero=self.params.mask_zero,
            embeddings_initializer=self.create_initializer(),
            name="extra_word_embeddings"
        )

    # ALBERT word embeddings projection
    if self.params.embedding_size is not None:
        self.word_embeddings_projector_layer = EmbeddingsProjector.from_params(
            self.params, name="word_embeddings_projector")

    position_embedding_size = embedding_size if self.params.project_position_embeddings else self.params.hidden_size

    if self.params.use_token_type:
        self.token_type_embeddings_layer = tf.keras.layers.Embedding(
            input_dim=self.params.token_type_vocab_size,
            output_dim=position_embedding_size,
            mask_zero=False,
            name="token_type_embeddings"
        )
    if self.params.use_position_embeddings:
        self.position_embeddings_layer = PositionEmbeddingLayer.from_params(
            self.params,
            name="position_embeddings",
            hidden_size=position_embedding_size
        )

    self.layer_norm_layer = pf.LayerNormalization(name="LayerNorm")
    self.dropout_layer = tf.keras.layers.Dropout(rate=self.params.hidden_dropout)

    super(BertEmbeddingsLayer, self).build(input_shape)
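# --- Standalone sketch (not part of the class above) -------------------------
# Shows, with assumed parameter values, the BERT vs. ALBERT branch handled in
# build(): BERT (embedding_size is None) embeds tokens directly at hidden_size,
# while ALBERT embeds into a smaller embedding_size and projects the result up
# to hidden_size (the role of word_embeddings_2 / EmbeddingsProjector above).
# All names and sizes below are illustrative, not the library's API.
import tensorflow as tf

vocab_size, hidden_size, embedding_size = 30000, 768, 128
input_ids = tf.constant([[5, 17, 42, 0]])                          # [batch=1, seq=4]

# BERT-style: word embeddings already have hidden_size columns.
bert_embeddings = tf.keras.layers.Embedding(vocab_size, hidden_size, mask_zero=True)
bert_output = bert_embeddings(input_ids)                           # [1, 4, 768]

# ALBERT-style factorized embeddings: embed small, then project to hidden_size.
albert_embeddings = tf.keras.layers.Embedding(vocab_size, embedding_size, mask_zero=True)
projection = tf.Variable(tf.random.normal([embedding_size, hidden_size]))
albert_output = tf.matmul(albert_embeddings(input_ids), projection)  # [1, 4, 768]
# ------------------------------------------------------------------------------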