def test_serialize_deserialize(self): """Validate that the ELECTRA trainer can be serialized and deserialized.""" # Build a transformer network to use within the ELECTRA trainer. (Here, we use # a short sequence_length for convenience.) test_generator_network = networks.TransformerEncoder( vocab_size=100, num_layers=4, sequence_length=3) test_discriminator_network = networks.TransformerEncoder( vocab_size=100, num_layers=4, sequence_length=3) # Create an ELECTRA trainer with the created network. (Note that all the args # are different, so we can catch any serialization mismatches.) electra_trainer_model = electra_pretrainer.ElectraPretrainer( generator_network=test_generator_network, discriminator_network=test_discriminator_network, vocab_size=100, num_classes=2, sequence_length=3, num_token_predictions=2) # Create another ELECTRA trainer via serialization and deserialization. config = electra_trainer_model.get_config() new_electra_trainer_model = electra_pretrainer.ElectraPretrainer.from_config( config) # Validate that the config can be forced to JSON. _ = new_electra_trainer_model.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(electra_trainer_model.get_config(), new_electra_trainer_model.get_config())
def test_electra_pretrainer(self): """Validate that the Keras object can be created.""" # Build a transformer network to use within the ELECTRA trainer. vocab_size = 100 sequence_length = 512 test_generator_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, max_sequence_length=sequence_length) test_discriminator_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, max_sequence_length=sequence_length) # Create an ELECTRA trainer with the created network. num_classes = 3 num_token_predictions = 2 electra_trainer_model = electra_pretrainer.ElectraPretrainer( generator_network=test_generator_network, discriminator_network=test_discriminator_network, vocab_size=vocab_size, num_classes=num_classes, num_token_predictions=num_token_predictions, disallow_correct=True) # Create a set of 2-dimensional inputs (the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) lm_positions = tf.keras.Input(shape=(num_token_predictions, ), dtype=tf.int32) lm_ids = tf.keras.Input(shape=(num_token_predictions, ), dtype=tf.int32) inputs = { 'input_word_ids': word_ids, 'input_mask': mask, 'input_type_ids': type_ids, 'masked_lm_positions': lm_positions, 'masked_lm_ids': lm_ids } # Invoke the trainer model on the inputs. This causes the layer to be built. outputs = electra_trainer_model(inputs) lm_outs = outputs['lm_outputs'] cls_outs = outputs['sentence_outputs'] disc_logits = outputs['disc_logits'] disc_label = outputs['disc_label'] # Validate that the outputs are of the expected shape. expected_lm_shape = [None, num_token_predictions, vocab_size] expected_classification_shape = [None, num_classes] expected_disc_logits_shape = [None, sequence_length] expected_disc_label_shape = [None, sequence_length] self.assertAllEqual(expected_lm_shape, lm_outs.shape.as_list()) self.assertAllEqual(expected_classification_shape, cls_outs.shape.as_list()) self.assertAllEqual(expected_disc_logits_shape, disc_logits.shape.as_list()) self.assertAllEqual(expected_disc_label_shape, disc_label.shape.as_list())
def test_bert_trainer(self): """Validate that the Keras object can be created.""" # Build a transformer network to use within the BERT trainer. vocab_size = 100 sequence_length = 512 test_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length) # Create a BERT trainer with the created network. num_classes = 3 bert_trainer_model = bert_token_classifier.BertTokenClassifier( test_network, num_classes=num_classes) # Create a set of 2-dimensional inputs (the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) # Invoke the trainer model on the inputs. This causes the layer to be built. sequence_outs = bert_trainer_model([word_ids, mask, type_ids]) # Validate that the outputs are of the expected shape. expected_classification_shape = [None, sequence_length, num_classes] self.assertAllEqual(expected_classification_shape, sequence_outs.shape.as_list())
def create_lm_model(self, vocab_size, sequence_length, hidden_size, num_predictions, output="predictions"): # First, create a transformer stack that we can use to get the LM's # vocabulary weight. xformer_stack = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=1, sequence_length=sequence_length, hidden_size=hidden_size, num_attention_heads=4, ) word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) _ = xformer_stack([word_ids, mask, type_ids]) # Create a maskedLM from the transformer stack. test_layer = layers.MaskedLM( embedding_table=xformer_stack.get_embedding_table(), output=output) # Create a model from the masked LM layer. lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) masked_lm_positions = tf.keras.Input(shape=(num_predictions, ), dtype=tf.int32) output = test_layer(lm_input_tensor, masked_positions=masked_lm_positions) return tf.keras.Model([lm_input_tensor, masked_lm_positions], output)
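# --- Illustrative sketch (not part of the original tests) ---
# A minimal companion test for create_lm_model above, assuming it lives in the
# same tf.test.TestCase subclass. The dummy shapes and data below are made up
# purely for illustration.
def test_create_lm_model_output_sketch(self):
  import numpy as np
  vocab_size, sequence_length, hidden_size, num_predictions = 100, 8, 16, 2
  lm_model = self.create_lm_model(
      vocab_size=vocab_size,
      sequence_length=sequence_length,
      hidden_size=hidden_size,
      num_predictions=num_predictions)
  # Fake encoder output and masked positions, just to exercise the model.
  lm_input = np.random.rand(2, sequence_length, hidden_size).astype(np.float32)
  positions = np.array([[0, 1], [2, 3]], dtype=np.int32)
  outputs = lm_model.predict([lm_input, positions])
  # The last dimension of the LM output covers the vocabulary.
  self.assertEqual(vocab_size, outputs.shape[-1])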
def test_bert_trainer(self): """Validate that the Keras object can be created.""" # Build a transformer network to use within the BERT trainer. vocab_size = 100 sequence_length = 512 test_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length) # Create a BERT trainer with the created network. bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network) # Create a set of 2-dimensional inputs (the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) # Invoke the trainer model on the inputs. This causes the layer to be built. cls_outs = bert_trainer_model([word_ids, mask, type_ids]) # Validate that the two outputs are of the expected shape. self.assertEqual(2, len(cls_outs)) expected_shape = [None, sequence_length] for out in cls_outs: self.assertAllEqual(expected_shape, out.shape.as_list())
def _create_bert_model(cfg): """Creates a BERT keras core model from BERT configuration. Args: cfg: A `BertConfig` to create the core model. Returns: A TransformerEncoder network. """ bert_encoder = networks.TransformerEncoder( vocab_size=cfg.vocab_size, hidden_size=cfg.hidden_size, num_layers=cfg.num_hidden_layers, num_attention_heads=cfg.num_attention_heads, intermediate_size=cfg.intermediate_size, activation=activations.gelu, dropout_rate=cfg.hidden_dropout_prob, attention_dropout_rate=cfg.attention_probs_dropout_prob, max_sequence_length=cfg.max_position_embeddings, type_vocab_size=cfg.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=cfg.initializer_range), embedding_width=cfg.embedding_size) return bert_encoder
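# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of calling _create_bert_model above with a tiny stand-in
# config. types.SimpleNamespace is used only so the example stays
# self-contained; real callers pass a BertConfig whose attributes match the
# ones read inside the function (including embedding_size).
import types
_small_cfg = types.SimpleNamespace(
    vocab_size=100, hidden_size=16, num_hidden_layers=2, num_attention_heads=2,
    intermediate_size=32, hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1, max_position_embeddings=64,
    type_vocab_size=2, initializer_range=0.02, embedding_size=None)
_small_encoder = _create_bert_model(_small_cfg)
_small_encoder.summary()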
def test_serialize_deserialize(self): """Validate that the BERT trainer can be serialized and deserialized.""" # Build a transformer network to use within the BERT trainer. (Here, we use # a short sequence_length for convenience.) test_network = networks.TransformerEncoder(vocab_size=100, num_layers=2, sequence_length=5) # Create a BERT trainer with the created network. (Note that all the args # are different, so we can catch any serialization mismatches.) bert_trainer_model = bert_token_classifier.BertTokenClassifier( test_network, num_classes=4, initializer='zeros', output='predictions') # Create another BERT trainer via serialization and deserialization. config = bert_trainer_model.get_config() new_bert_trainer_model = ( bert_token_classifier.BertTokenClassifier.from_config(config)) # Validate that the config can be forced to JSON. _ = new_bert_trainer_model.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(bert_trainer_model.get_config(), new_bert_trainer_model.get_config())
def get_transformer_encoder(bert_config, sequence_length): """Gets a 'TransformerEncoder' object. Args: bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object. sequence_length: Maximum sequence length of the training data. Returns: A networks.TransformerEncoder object. """ kwargs = dict( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=tf_utils.get_activation(bert_config.hidden_act), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, sequence_length=sequence_length, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range)) if isinstance(bert_config, albert_configs.AlbertConfig): kwargs['embedding_width'] = bert_config.embedding_size return networks.AlbertTransformerEncoder(**kwargs) else: assert isinstance(bert_config, configs.BertConfig) return networks.TransformerEncoder(**kwargs)
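# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of the BERT branch of get_transformer_encoder above. The
# tiny hyperparameters are made up for illustration, and the keyword arguments
# assume the standard Model Garden configs.BertConfig constructor.
_bert_config = configs.BertConfig(
    vocab_size=100,
    hidden_size=16,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=32)
_encoder = get_transformer_encoder(_bert_config, sequence_length=64)
# The returned encoder maps [word_ids, mask, type_ids] to
# (sequence_output, pooled_output).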
def instantiate_from_cfg(config: BertPretrainerConfig, encoder_network: Optional[tf.keras.Model] = None): """Instantiates a BertPretrainer from the config.""" encoder_cfg = config.encoder if encoder_network is None: encoder_network = networks.TransformerEncoder( vocab_size=encoder_cfg.vocab_size, hidden_size=encoder_cfg.hidden_size, num_layers=encoder_cfg.num_layers, num_attention_heads=encoder_cfg.num_attention_heads, intermediate_size=encoder_cfg.intermediate_size, activation=tf_utils.get_activation(encoder_cfg.hidden_activation), dropout_rate=encoder_cfg.dropout_rate, attention_dropout_rate=encoder_cfg.attention_dropout_rate, max_sequence_length=encoder_cfg.max_position_embeddings, type_vocab_size=encoder_cfg.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=encoder_cfg.initializer_range)) if config.cls_heads: classification_heads = [ layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads ] else: classification_heads = [] return bert_pretrainer.BertPretrainerV2( config.num_masked_tokens, mlm_initializer=tf.keras.initializers.TruncatedNormal( stddev=encoder_cfg.initializer_range), encoder_network=encoder_network, classification_heads=classification_heads)
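# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of calling instantiate_from_cfg above. It assumes
# BertPretrainerConfig and the companion TransformerEncoderConfig are
# dataclass-style configs whose unspecified fields have defaults; only field
# names read inside the function are set here.
_pretrainer_cfg = BertPretrainerConfig(
    num_masked_tokens=20,
    encoder=TransformerEncoderConfig(vocab_size=100, num_layers=2))
_pretrainer = instantiate_from_cfg(_pretrainer_cfg)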
def test_dual_encoder_tensor_call(self, hidden_size, output): """Validate that the Keras object can be invoked.""" # Build a transformer network to use within the dual encoder model. (Here, we use # a short sequence_length for convenience.) sequence_length = 2 test_network = networks.TransformerEncoder( vocab_size=100, num_layers=2, sequence_length=sequence_length) # Create a dual encoder model with the created network. dual_encoder_model = dual_encoder.DualEncoder( test_network, max_seq_length=sequence_length, output=output) # Create a set of 2-dimensional data tensors to feed into the model. word_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32) mask = tf.constant([[1, 1], [1, 0]], dtype=tf.int32) type_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32) # Invoke the model on the tensors. In Eager mode, this does the # actual calculation. (We can't validate the outputs, since the network is # too complex: this simply ensures we're not hitting runtime errors.) if output == 'logits': _ = dual_encoder_model( [word_ids, mask, type_ids, word_ids, mask, type_ids]) elif output == 'predictions': _ = dual_encoder_model([word_ids, mask, type_ids])
def _get_transformer_encoder(bert_config, sequence_length, float_dtype=tf.float32): """Gets a 'TransformerEncoder' object. Args: bert_config: A 'modeling.BertConfig' object. sequence_length: Maximum sequence length of the training data. float_dtype: tf.dtype, tf.float32 or tf.float16. Returns: A networks.TransformerEncoder object. """ return networks.TransformerEncoder( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=tf_utils.get_activation(bert_config.hidden_act), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, sequence_length=sequence_length, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range), float_dtype=float_dtype.name)
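# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of requesting a float16 encoder via _get_transformer_encoder
# above. A types.SimpleNamespace stands in for the BertConfig so the example is
# self-contained; only attributes read inside the helper are provided.
import types
_cfg = types.SimpleNamespace(
    vocab_size=100, hidden_size=16, num_hidden_layers=2, num_attention_heads=2,
    intermediate_size=32, hidden_act='gelu', hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1, max_position_embeddings=64,
    type_vocab_size=2, initializer_range=0.02)
_encoder_fp16 = _get_transformer_encoder(
    _cfg, sequence_length=64, float_dtype=tf.float16)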
def test_bert_pretrainerv2(self): """Validate that the Keras object can be created.""" # Build a transformer network to use within the BERT trainer. vocab_size = 100 sequence_length = 512 test_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length) # Create a BERT trainer with the created network. bert_trainer_model = bert_pretrainer.BertPretrainerV2( encoder_network=test_network) num_token_predictions = 20 # Create a set of 2-dimensional inputs (the first dimension is implicit). word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) lm_mask = tf.keras.Input(shape=(num_token_predictions, ), dtype=tf.int32) # Invoke the trainer model on the inputs. This causes the layer to be built. outputs = bert_trainer_model([word_ids, mask, type_ids, lm_mask]) # Validate that the outputs are of the expected shape. expected_lm_shape = [None, num_token_predictions, vocab_size] self.assertAllEqual(expected_lm_shape, outputs['lm_output'].shape.as_list())
def get_transformer_encoder(bert_config, sequence_length, transformer_encoder_cls=None): """Gets a 'TransformerEncoder' object. Args: bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object. sequence_length: Maximum sequence length of the training data. transformer_encoder_cls: An EncoderScaffold class. If it is None, uses the default BERT encoder implementation. Returns: A networks.TransformerEncoder object. """ if transformer_encoder_cls is not None: # TODO(hongkuny): evaluate if it is better to put cfg definition in gin. embedding_cfg = dict( vocab_size=bert_config.vocab_size, type_vocab_size=bert_config.type_vocab_size, hidden_size=bert_config.hidden_size, seq_length=sequence_length, max_seq_length=bert_config.max_position_embeddings, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range), dropout_rate=bert_config.hidden_dropout_prob, ) hidden_cfg = dict( num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, intermediate_activation=tf_utils.get_activation(bert_config.hidden_act), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, ) kwargs = dict(embedding_cfg=embedding_cfg, hidden_cfg=hidden_cfg, num_hidden_instances=bert_config.num_hidden_layers,) # Relies on gin configuration to define the Transformer encoder arguments. return transformer_encoder_cls(**kwargs) kwargs = dict( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=tf_utils.get_activation(bert_config.hidden_act), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, sequence_length=sequence_length, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range)) if isinstance(bert_config, albert_configs.AlbertConfig): kwargs['embedding_width'] = bert_config.embedding_size return networks.AlbertTransformerEncoder(**kwargs) else: assert isinstance(bert_config, configs.BertConfig) return networks.TransformerEncoder(**kwargs)
def get_nhnet_layers(params: configs.NHNetConfig): """Creates a multi-doc encoder/decoder. Args: params: ParamsDict. Returns: two keras Layers, bert_model_layer and decoder_layer """ input_ids = tf.keras.layers.Input( shape=(None,), name="input_ids", dtype=tf.int32) input_mask = tf.keras.layers.Input( shape=(None,), name="input_mask", dtype=tf.int32) segment_ids = tf.keras.layers.Input( shape=(None,), name="segment_ids", dtype=tf.int32) bert_config = utils.get_bert_config_from_params(params) bert_model_layer = networks.TransformerEncoder( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=tf_utils.get_activation(bert_config.hidden_act), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, sequence_length=None, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range), return_all_encoder_outputs=True, name="bert_encoder") bert_model_layer([input_ids, input_mask, segment_ids]) input_ids = tf.keras.layers.Input( shape=(None, None), name="input_ids", dtype=tf.int32) all_encoder_outputs = tf.keras.layers.Input((None, None, params.hidden_size), dtype=tf.float32) target_ids = tf.keras.layers.Input( shape=(None,), name="target_ids", dtype=tf.int32) doc_attention_probs = tf.keras.layers.Input( (params.num_decoder_attn_heads, None, None), dtype=tf.float32) # pylint: disable=protected-access decoder_layer = decoder.Decoder(params, bert_model_layer._embedding_layer) # pylint: enable=protected-access cross_attention_bias = decoder.AttentionBias(bias_type="multi_cross")( input_ids) self_attention_bias = decoder.AttentionBias(bias_type="decoder_self")( target_ids) decoder_inputs = dict( attention_bias=cross_attention_bias, self_attention_bias=self_attention_bias, target_ids=target_ids, all_encoder_outputs=all_encoder_outputs, doc_attention_probs=doc_attention_probs) _ = decoder_layer(decoder_inputs) return bert_model_layer, decoder_layer
def get_bert2bert_layers(params: configs.BERT2BERTConfig): """Creates a Bert2Bert stem model and returns Bert encoder/decoder. We use functional-style to create stem model because we need to make all layers built to restore variables in a customized way. The layers are called with placeholder inputs to make them fully built. Args: params: ParamsDict. Returns: two keras Layers, bert_model_layer and decoder_layer """ input_ids = tf.keras.layers.Input( shape=(None,), name="input_ids", dtype=tf.int32) input_mask = tf.keras.layers.Input( shape=(None,), name="input_mask", dtype=tf.int32) segment_ids = tf.keras.layers.Input( shape=(None,), name="segment_ids", dtype=tf.int32) target_ids = tf.keras.layers.Input( shape=(None,), name="target_ids", dtype=tf.int32) bert_config = utils.get_bert_config_from_params(params) bert_model_layer = networks.TransformerEncoder( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=tf_utils.get_activation(bert_config.hidden_act), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range), return_all_encoder_outputs=True, name="bert_encoder") all_encoder_outputs, _ = bert_model_layer( [input_ids, input_mask, segment_ids]) # pylint: disable=protected-access decoder_layer = decoder.Decoder(params, bert_model_layer._embedding_layer) # pylint: enable=protected-access cross_attention_bias = decoder.AttentionBias(bias_type="single_cross")( input_ids) self_attention_bias = decoder.AttentionBias(bias_type="decoder_self")( target_ids) decoder_inputs = dict( attention_bias=cross_attention_bias, self_attention_bias=self_attention_bias, target_ids=target_ids, all_encoder_outputs=all_encoder_outputs) _ = decoder_layer(decoder_inputs) return bert_model_layer, decoder_layer
def __init__(self, context_feature_columns, example_feature_columns, bert_config_file, bert_max_seq_length, bert_output_dropout, name="tfrbert", **kwargs): """Initializes an instance of TFRBertRankingNetwork. Args: context_feature_columns: A dict containing all the context feature columns used by the network. Keys are feature names, and values are instances of classes derived from `_FeatureColumn`. example_feature_columns: A dict containing all the example feature columns used by the network. Keys are feature names, and values are instances of classes derived from `_FeatureColumn`. bert_config_file: (string) path to Bert configuration file. bert_max_seq_length: (int) maximum input sequence length (#words) after WordPiece tokenization. Sequences longer than this will be truncated, and shorter than this will be padded. bert_output_dropout: When not `None`, the probability will be used as the dropout probability for BERT output. name: name of Keras network. **kwargs: keyword arguments. """ super(TFRBertRankingNetwork, self).__init__(context_feature_columns=context_feature_columns, example_feature_columns=example_feature_columns, name=name, **kwargs) self._bert_config_file = bert_config_file self._bert_max_seq_length = bert_max_seq_length self._bert_output_dropout = bert_output_dropout bert_config = configs.BertConfig.from_json_file(self._bert_config_file) self._bert_encoder = tfmodel_networks.TransformerEncoder( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=activations.gelu, dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, sequence_length=self._bert_max_seq_length, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range)) self._dropout_layer = tf.keras.layers.Dropout( rate=self._bert_output_dropout) self._score_layer = tf.keras.layers.Dense(units=1, name="score")
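# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of constructing TFRBertRankingNetwork above. The feature
# columns and the config-file path are placeholders, and the assumption that
# the example features carry the three standard BERT inputs comes from typical
# TFR-BERT setups rather than from this snippet.
_example_columns = {
    name: tf.feature_column.numeric_column(name, shape=(128,), dtype=tf.int64)
    for name in ('input_word_ids', 'input_mask', 'input_type_ids')
}
_tfrbert_network = TFRBertRankingNetwork(
    context_feature_columns={},
    example_feature_columns=_example_columns,
    bert_config_file='/path/to/bert_config.json',  # placeholder path
    bert_max_seq_length=128,
    bert_output_dropout=0.1)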
def test_electra_pretrainer(self): """Validate that the Keras object can be created.""" # Build a transformer network to use within the ELECTRA trainer. vocab_size = 100 sequence_length = 512 test_generator_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length) test_discriminator_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, sequence_length=sequence_length)
def test_electra_trainer_tensor_call(self): """Validate that the Keras object can be invoked.""" # Build a transformer network to use within the ELECTRA trainer. (Here, we # use a short sequence_length for convenience.) test_generator_network = networks.TransformerEncoder(vocab_size=100, num_layers=4, sequence_length=3) test_discriminator_network = networks.TransformerEncoder( vocab_size=100, num_layers=4, sequence_length=3) # Create an ELECTRA trainer with the created network. electra_trainer_model = electra_pretrainer.ElectraPretrainer( generator_network=test_generator_network, discriminator_network=test_discriminator_network, vocab_size=100, num_classes=2, sequence_length=3, last_hidden_dim=768, num_token_predictions=2) # Create a set of 2-dimensional data tensors to feed into the model. word_ids = tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.int32) mask = tf.constant([[1, 1, 1], [1, 0, 0]], dtype=tf.int32) type_ids = tf.constant([[1, 1, 1], [2, 2, 2]], dtype=tf.int32) lm_positions = tf.constant([[0, 1], [0, 2]], dtype=tf.int32) lm_ids = tf.constant([[10, 20], [20, 30]], dtype=tf.int32) inputs = { 'input_word_ids': word_ids, 'input_mask': mask, 'input_type_ids': type_ids, 'masked_lm_positions': lm_positions, 'masked_lm_ids': lm_ids } # Invoke the trainer model on the tensors. In Eager mode, this does the # actual calculation. (We can't validate the outputs, since the network is # too complex: this simply ensures we're not hitting runtime errors.) _, _, _, _ = electra_trainer_model(inputs)
def instantiate_encoder_from_cfg( config: TransformerEncoderConfig) -> networks.TransformerEncoder: """Instantiate a Transformer encoder network from TransformerEncoderConfig.""" encoder_network = networks.TransformerEncoder( vocab_size=config.vocab_size, hidden_size=config.hidden_size, num_layers=config.num_layers, num_attention_heads=config.num_attention_heads, intermediate_size=config.intermediate_size, activation=tf_utils.get_activation(config.hidden_activation), dropout_rate=config.dropout_rate, attention_dropout_rate=config.attention_dropout_rate, max_sequence_length=config.max_position_embeddings, type_vocab_size=config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=config.initializer_range)) return encoder_network
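# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of building a small encoder from the config class above.
# Only a few fields are overridden; the remaining TransformerEncoderConfig
# fields are assumed to carry sensible defaults.
_small_encoder_cfg = TransformerEncoderConfig(
    vocab_size=100,
    hidden_size=16,
    num_layers=2,
    num_attention_heads=2,
    intermediate_size=32)
_small_encoder = instantiate_encoder_from_cfg(_small_encoder_cfg)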
def test_bert_trainer_named_compilation(self): """Validate compilation using explicit output names.""" # Build a transformer network to use within the BERT trainer. vocab_size = 100 test_network = networks.TransformerEncoder(vocab_size=vocab_size, num_layers=2) # Create a BERT trainer with the created network. bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network) # Attempt to compile the model using a string-keyed dict of output names to # loss functions. This will validate that the outputs are named as we # expect. bert_trainer_model.compile(optimizer='sgd', loss={ 'start_positions': 'mse', 'end_positions': 'mse' })
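# --- Illustrative continuation (not part of the original test) ---
# A minimal sketch of fitting a BertSpanLabeler compiled as above on dummy
# data, keyed by the 'start_positions'/'end_positions' output names the test
# exercises. Shapes and the all-zero targets are made up for illustration and
# simply need to match the [batch, seq_length] logits under the 'mse' losses.
def _fit_span_labeler_sketch(span_labeler, seq_length=4, batch_size=2):
  import numpy as np
  dummy_inputs = [
      np.ones((batch_size, seq_length), dtype=np.int32),   # word ids
      np.ones((batch_size, seq_length), dtype=np.int32),   # input mask
      np.zeros((batch_size, seq_length), dtype=np.int32),  # type ids
  ]
  dummy_labels = {
      'start_positions': np.zeros((batch_size, seq_length), dtype=np.float32),
      'end_positions': np.zeros((batch_size, seq_length), dtype=np.float32),
  }
  span_labeler.fit(dummy_inputs, dummy_labels, epochs=1, verbose=0)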
def test_dual_encoder(self, hidden_size, output): """Validate that the Keras object can be created.""" # Build a transformer network to use within the dual encoder model. vocab_size = 100 sequence_length = 512 test_network = networks.TransformerEncoder( vocab_size=vocab_size, num_layers=2, hidden_size=hidden_size, sequence_length=sequence_length) # Create a dual encoder model with the created network. dual_encoder_model = dual_encoder.DualEncoder( test_network, max_seq_length=sequence_length, output=output) # Create a set of 2-dimensional inputs (the first dimension is implicit). left_word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) left_mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) left_type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) right_word_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) right_mask = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) right_type_ids = tf.keras.Input(shape=(sequence_length, ), dtype=tf.int32) if output == 'logits': outputs = dual_encoder_model([ left_word_ids, left_mask, left_type_ids, right_word_ids, right_mask, right_type_ids ]) left_encoded, _ = outputs elif output == 'predictions': left_encoded = dual_encoder_model( [left_word_ids, left_mask, left_type_ids]) # Validate that the outputs are of the expected shape. expected_encoding_shape = [None, 768] self.assertAllEqual(expected_encoding_shape, left_encoded.shape.as_list())
def test_bert_trainer_tensor_call(self): """Validate that the Keras object can be invoked.""" # Build a transformer network to use within the BERT trainer. (Here, we use # a short sequence_length for convenience.) test_network = networks.TransformerEncoder(vocab_size=100, num_layers=2) # Create a BERT trainer with the created network. bert_trainer_model = bert_span_labeler.BertSpanLabeler(test_network) # Create a set of 2-dimensional data tensors to feed into the model. word_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32) mask = tf.constant([[1, 1], [1, 0]], dtype=tf.int32) type_ids = tf.constant([[1, 1], [2, 2]], dtype=tf.int32) # Invoke the trainer model on the tensors. In Eager mode, this does the # actual calculation. (We can't validate the outputs, since the network is # too complex: this simply ensures we're not hitting runtime errors.) _ = bert_trainer_model([word_ids, mask, type_ids])
def test_serialize_deserialize(self): """Validate that the dual encoder model can be serialized / deserialized.""" # Build a transformer network to use within the dual encoder model. (Here, # we use a short sequence_length for convenience.) sequence_length = 32 test_network = networks.TransformerEncoder( vocab_size=100, num_layers=2, sequence_length=sequence_length) # Create a dual encoder model with the created network. (Note that all the # args are different, so we can catch any serialization mismatches.) dual_encoder_model = dual_encoder.DualEncoder( test_network, max_seq_length=sequence_length, output='predictions') # Create another dual encoder model via serialization and deserialization. config = dual_encoder_model.get_config() new_dual_encoder = dual_encoder.DualEncoder.from_config(config) # Validate that the config can be forced to JSON. _ = new_dual_encoder.to_json() # If the serialization was successful, the new config should match the old. self.assertAllEqual(dual_encoder_model.get_config(), new_dual_encoder.get_config())
def _create_bert_model(cfg): """Creates a BERT keras core model from BERT configuration. Args: cfg: A `BertConfig` to create the core model. Returns: A keras model. """ bert_encoder = networks.TransformerEncoder( vocab_size=cfg.vocab_size, hidden_size=cfg.hidden_size, num_layers=cfg.num_hidden_layers, num_attention_heads=cfg.num_attention_heads, intermediate_size=cfg.intermediate_size, activation=activations.gelu, dropout_rate=cfg.hidden_dropout_prob, attention_dropout_rate=cfg.attention_probs_dropout_prob, sequence_length=cfg.max_position_embeddings, type_vocab_size=cfg.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=cfg.initializer_range)) return bert_encoder
attention_dropout_rate=bert_config.attention_probs_dropout_prob, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, embedding_width=bert_config.embedding_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range)) if isinstance(bert_config, albert_configs.AlbertConfig): return networks.AlbertTransformerEncoder(**kwargs) else: assert isinstance(bert_config, configs.BertConfig) kwargs['output_range'] = output_range return networks.TransformerEncoder(**kwargs) def pretrain_model(bert_config, seq_length, max_predictions_per_seq, initializer=None, use_next_sentence_label=True, return_core_pretrainer_model=False): """Returns model to be used for pre-training. Args: bert_config: Configuration that defines the core BERT model. seq_length: Maximum sequence length of the training data. max_predictions_per_seq: Maximum number of tokens in sequence to mask out and use for pretraining.
def classifier_model(bert_config, float_type, num_labels, max_seq_length, final_layer_initializer=None, hub_module_url=None): """BERT classifier model in functional API style. Construct a Keras model for predicting `num_labels` outputs from an input with maximum sequence length `max_seq_length`. Args: bert_config: BertConfig, the config defines the core BERT model. float_type: dtype, tf.float32 or tf.bfloat16. num_labels: integer, the number of classes. max_seq_length: integer, the maximum input sequence length. final_layer_initializer: Initializer for final dense layer. Defaults to a TruncatedNormal initializer. hub_module_url: TF-Hub path/url to Bert module. Returns: Combined prediction model (words, mask, type) -> (one-hot labels) BERT sub-model (words, mask, type) -> (bert_outputs) """ if final_layer_initializer is not None: initializer = final_layer_initializer else: initializer = tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range) if not hub_module_url: bert_encoder = networks.TransformerEncoder( vocab_size=bert_config.vocab_size, hidden_size=bert_config.hidden_size, num_layers=bert_config.num_hidden_layers, num_attention_heads=bert_config.num_attention_heads, intermediate_size=bert_config.intermediate_size, activation=tf_utils.get_activation('gelu'), dropout_rate=bert_config.hidden_dropout_prob, attention_dropout_rate=bert_config.attention_probs_dropout_prob, sequence_length=max_seq_length, max_sequence_length=bert_config.max_position_embeddings, type_vocab_size=bert_config.type_vocab_size, initializer=tf.keras.initializers.TruncatedNormal( stddev=bert_config.initializer_range)) return bert_classifier.BertClassifier( bert_encoder, num_classes=num_labels, dropout_rate=bert_config.hidden_dropout_prob, initializer=initializer), bert_encoder input_word_ids = tf.keras.layers.Input(shape=(max_seq_length, ), dtype=tf.int32, name='input_word_ids') input_mask = tf.keras.layers.Input(shape=(max_seq_length, ), dtype=tf.int32, name='input_mask') input_type_ids = tf.keras.layers.Input(shape=(max_seq_length, ), dtype=tf.int32, name='input_type_ids') bert_model = hub.KerasLayer(hub_module_url, trainable=True) pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids]) output = tf.keras.layers.Dropout( rate=bert_config.hidden_dropout_prob)(pooled_output) output = tf.keras.layers.Dense(num_labels, kernel_initializer=initializer, name='output', dtype=float_type)(output) return tf.keras.Model(inputs={ 'input_word_ids': input_word_ids, 'input_mask': input_mask, 'input_type_ids': input_type_ids }, outputs=output), bert_model
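# --- Illustrative usage (not part of the original code) ---
# A minimal sketch of the non-hub path of classifier_model above. A tiny
# stand-in class exposes just the attributes the function reads, so no
# particular BertConfig constructor is assumed; real callers pass a BertConfig
# loaded from JSON.
class _TinyBertConfig:
  vocab_size = 100
  hidden_size = 16
  num_hidden_layers = 2
  num_attention_heads = 2
  intermediate_size = 32
  hidden_dropout_prob = 0.1
  attention_probs_dropout_prob = 0.1
  max_position_embeddings = 64
  type_vocab_size = 2
  initializer_range = 0.02

_tiny_classifier, _tiny_encoder = classifier_model(
    _TinyBertConfig(), float_type=tf.float32, num_labels=3, max_seq_length=64)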