def __init__(self,
             num_hidden_layers=12,
             hidden_size=768,
             num_attention_heads=12,
             intermediate_size=3072,
             intermediate_activation="gelu",
             hidden_dropout_prob=0.0,
             attention_probs_dropout_prob=0.0,
             initializer_range=0.02,
             backward_compatible=False,
             float_type=tf.float32,
             shared_type="all",
             **kwargs):
  super(Transformer, self).__init__(**kwargs)
  self.num_hidden_layers = num_hidden_layers
  self.hidden_size = hidden_size
  self.num_attention_heads = num_attention_heads
  self.intermediate_size = intermediate_size
  self.intermediate_activation = tf_utils.get_activation(
      intermediate_activation)
  self.hidden_dropout_prob = hidden_dropout_prob
  self.attention_probs_dropout_prob = attention_probs_dropout_prob
  self.initializer_range = initializer_range
  self.backward_compatible = backward_compatible
  self.float_type = float_type
  self.shared_type = shared_type
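
# Hypothetical usage sketch (not from the source): with shared_type="all",
# the stack reuses one set of layer weights across all hidden layers,
# ALBERT-style. The argument values below simply echo the constructor
# defaults and are for illustration only.
transformer = Transformer(
    num_hidden_layers=12,
    hidden_size=768,
    num_attention_heads=12,
    intermediate_size=3072,
    shared_type="all",
    name="transformer")
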
def __init__(self,
             hidden_size=768,
             num_attention_heads=12,
             intermediate_size=3072,
             intermediate_activation="gelu",
             hidden_dropout_prob=0.0,
             attention_probs_dropout_prob=0.0,
             initializer_range=0.02,
             backward_compatible=False,
             float_type=tf.float32,
             **kwargs):
  super(TransformerBlock, self).__init__(**kwargs)
  self.hidden_size = hidden_size
  self.num_attention_heads = num_attention_heads
  self.intermediate_size = intermediate_size
  self.intermediate_activation = tf_utils.get_activation(
      intermediate_activation)
  self.hidden_dropout_prob = hidden_dropout_prob
  self.attention_probs_dropout_prob = attention_probs_dropout_prob
  self.initializer_range = initializer_range
  self.backward_compatible = backward_compatible
  self.float_type = float_type

  if self.hidden_size % self.num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (self.hidden_size, self.num_attention_heads))
  self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
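
# Sketch of the divisibility check above, with illustrative values not taken
# from the source: hidden_size=768 split over 12 heads gives a per-head size
# of 768 // 12 = 64, while e.g. 10 heads would trigger the ValueError because
# 768 % 10 != 0.
block = TransformerBlock(hidden_size=768, num_attention_heads=12)
assert block.attention_head_size == 64
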
def _get_transformer_encoder(bert_config,
                             sequence_length,
                             float_dtype=tf.float32):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' object.
    sequence_length: Maximum sequence length of the training data.
    float_dtype: tf.dtype, tf.float32 or tf.float16.

  Returns:
    A networks.TransformerEncoder object.
  """
  return networks.TransformerEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation('gelu'),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      float_dtype=float_dtype.name)
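
# Hypothetical usage sketch: building a half-precision encoder from a config
# file. The 'modeling.BertConfig.from_json_file' helper, the file path, and
# the sequence length are assumptions for illustration, not from the source.
bert_config = modeling.BertConfig.from_json_file("bert_config.json")
encoder = _get_transformer_encoder(
    bert_config, sequence_length=128, float_dtype=tf.float16)
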
def build(self, unused_input_shapes):
  """Implements build() for the layer."""
  self.output_bias = self.add_weight(
      shape=[self.config.vocab_size],
      name='predictions/output_bias',
      initializer=tf.keras.initializers.Zeros())
  self.lm_dense = tf.keras.layers.Dense(
      self.config.embedding_size,
      activation=tf_utils.get_activation(self.config.hidden_act),
      kernel_initializer=self.initializer,
      name='predictions/transform/dense')
  self.lm_layer_norm = tf.keras.layers.LayerNormalization(
      axis=-1, epsilon=1e-12, name='predictions/transform/LayerNorm')

  # Next sentence binary classification dense layer including bias to match
  # TF1.x BERT variable shapes.
  with tf.name_scope('seq_relationship'):
    self.next_seq_weights = self.add_weight(
        shape=[self.num_next_sentence_label, self.config.hidden_size],
        name='output_weights',
        initializer=self.initializer)
    self.next_seq_bias = self.add_weight(
        shape=[self.num_next_sentence_label],
        name='output_bias',
        initializer=tf.keras.initializers.Zeros())
  super(TinyBertPretrainLayer, self).build(unused_input_shapes)
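
# Hypothetical sketch of how the weights built above are typically consumed
# in call(); the input tensor names and 'embedding_table' are assumptions,
# not from the source:
#   lm_output = self.lm_layer_norm(self.lm_dense(masked_lm_tensor))
#   lm_logits = tf.matmul(lm_output, embedding_table, transpose_b=True)
#   lm_logits = tf.nn.bias_add(lm_logits, self.output_bias)
#   next_seq_logits = tf.matmul(pooled_output, self.next_seq_weights,
#                               transpose_b=True)
#   next_seq_logits = tf.nn.bias_add(next_seq_logits, self.next_seq_bias)
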
def get_transformer_encoder(bert_config, sequence_length):
  """Gets a transformer encoder model.

  Args:
    bert_config: A 'BertConfig' or 'AlbertConfig' object.
    sequence_length: Maximum sequence length of the training data.

  Returns:
    A 'layers.TransformerEncoder' or 'layers.AlbertTransformerEncoder' object.
  """
  kwargs = dict(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_hidden_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      name="transformer_encoder")
  if isinstance(bert_config, AlbertConfig):
    kwargs['embedding_width'] = bert_config.embedding_size
    kwargs['num_hidden_groups'] = bert_config.num_hidden_groups
    return layers.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, BertConfig)
    return layers.TransformerEncoder(**kwargs)
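
# Hypothetical usage sketch: the same call dispatches on the config type.
# The config construction below is an assumption for illustration; only the
# dispatch behavior comes from the function above.
albert_config = AlbertConfig(vocab_size=30000)
encoder = get_transformer_encoder(albert_config, sequence_length=128)
# -> layers.AlbertTransformerEncoder, with embedding_width and
#    num_hidden_groups taken from the ALBERT config.
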