def setUp(self):
  super(FelixModelsTest, self).setUp()
  self._bert_test_config = bert_configs.BertConfig(
      attention_probs_dropout_prob=0.0,
      hidden_act='gelu',
      hidden_dropout_prob=0.0,
      hidden_size=16,
      initializer_range=0.02,
      intermediate_size=32,
      max_position_embeddings=128,
      num_attention_heads=2,
      num_hidden_layers=2,
      type_vocab_size=2,
      vocab_size=30522)
  self._bert_test_config.num_classes = 20
  self._bert_test_config.query_size = 23
  self._bert_test_config.query_transformer = True
def setUp(self):
  super().setUp()
  self.random_seed = 42
  self.num_classes = 10
  self.batch_size = 4
  self.seq_length = 4
  self.hidden_dim = 8
  self.num_heads = 2
  self.key_dim = self.hidden_dim // self.num_heads

  self.bert_test_config = bert_configs.BertConfig(
      attention_probs_dropout_prob=0.12,
      hidden_dropout_prob=0.34,
      hidden_act='gelu',
      hidden_size=self.hidden_dim,
      initializer_range=0.02,
      intermediate_size=self.hidden_dim,
      max_position_embeddings=self.seq_length,
      num_attention_heads=self.num_heads,
      num_hidden_layers=2,
      type_vocab_size=2,
      vocab_size=128)

  self.input_shape_3d = tf.TensorShape(
      (self.batch_size, self.seq_length, self.hidden_dim))
  self.input_shape_4d = tf.TensorShape(
      (self.batch_size, self.seq_length, self.num_heads, self.key_dim))

  # Layer arguments.
  self.sn_norm_multiplier = 0.05
  self.spec_norm_kwargs = dict(
      iteration=1000, norm_multiplier=self.sn_norm_multiplier)
  self.attention_kwargs = dict(num_heads=self.num_heads, key_dim=self.key_dim)
  self.feedforward_kwargs = dict(
      intermediate_size=128,
      intermediate_activation='gelu',
      dropout=0.1,
      use_layer_norm=True)
  self.gp_layer_kwargs = dict(
      num_inducing=32, gp_cov_momentum=0.999, gp_cov_ridge_penalty=1e-6)
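
# A standalone sketch, not part of the fixture above: the attention kwargs
# defined in setUp match the signature of the stock
# tf.keras.layers.MultiHeadAttention layer, so a plain baseline attention
# layer can be built from them directly (the spectral-norm/GP wrappers these
# tests appear to target are not shown here).
import tensorflow as tf

attention_kwargs = dict(num_heads=2, key_dim=4)  # mirrors self.attention_kwargs
attention_layer = tf.keras.layers.MultiHeadAttention(**attention_kwargs)
x = tf.random.normal((4, 4, 8))  # (batch_size, seq_length, hidden_dim)
y = attention_layer(x, x)  # self-attention preserves the query shape
assert y.shape == (4, 4, 8)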
def _export_bert_tfhub(self):
  bert_config = configs.BertConfig(
      vocab_size=30522,
      hidden_size=16,
      intermediate_size=32,
      max_position_embeddings=128,
      num_attention_heads=2,
      num_hidden_layers=1)
  _, encoder = export_tfhub.create_bert_model(bert_config)
  model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
  checkpoint = tf.train.Checkpoint(model=encoder)
  checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
  model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

  vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
  with tf.io.gfile.GFile(vocab_file, "w") as f:
    f.write("dummy content")

  hub_destination = os.path.join(self.get_temp_dir(), "hub")
  export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
                                 hub_destination, vocab_file)
  return hub_destination
def __init__(
    self,
    uri='https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1',
    model_dir=None,
    seq_len=128,
    dropout_rate=0.1,
    initializer_range=0.02,
    learning_rate=3e-5,
    distribution_strategy='mirrored',
    num_gpus=-1,
    tpu='',
    trainable=True,
    do_lower_case=True,
    is_tf2=True,
    convert_from_saved_model_tf2=False):
  """Initializes an instance with model parameters.

  Args:
    uri: TF-Hub path/url to the BERT module.
    model_dir: The location of the model checkpoint files.
    seq_len: Length of the sequence to feed into the model.
    dropout_rate: The rate for dropout.
    initializer_range: The stdev of the truncated_normal_initializer for
      initializing all weight matrices.
    learning_rate: The initial learning rate for Adam.
    distribution_strategy: A string specifying which distribution strategy to
      use. Accepted values are 'off', 'one_device', 'mirrored',
      'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case
      insensitive. 'off' means not to use Distribution Strategy; 'tpu' means
      to use TPUStrategy using `tpu_address`.
    num_gpus: How many GPUs to use at each worker with the
      DistributionStrategies API. The default is -1, which means utilize all
      available GPUs.
    tpu: TPU address to connect to.
    trainable: boolean, whether the pretrained layer is trainable.
    do_lower_case: boolean, whether to lower case the input text. Should be
      True for uncased models and False for cased models.
    is_tf2: boolean, whether the hub module is in TensorFlow 2.x format.
    convert_from_saved_model_tf2: Convert to TFLite from saved_model in
      TF 2.x.
  """
  if compat.get_tf_behavior() not in self.compat_tf_versions:
    raise ValueError('Incompatible versions. Expect {}, but got {}.'.format(
        self.compat_tf_versions, compat.get_tf_behavior()))
  self.seq_len = seq_len
  self.dropout_rate = dropout_rate
  self.initializer_range = initializer_range
  self.learning_rate = learning_rate
  self.trainable = trainable

  self.model_dir = model_dir
  if self.model_dir is None:
    self.model_dir = tempfile.mkdtemp()

  num_gpus = get_num_gpus(num_gpus)
  self.strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=distribution_strategy,
      num_gpus=num_gpus,
      tpu_address=tpu)
  self.tpu = tpu
  self.uri = uri
  self.do_lower_case = do_lower_case
  self.is_tf2 = is_tf2
  self.bert_config = bert_configs.BertConfig(
      0,
      initializer_range=self.initializer_range,
      hidden_dropout_prob=self.dropout_rate)
  self.convert_from_saved_model_tf2 = convert_from_saved_model_tf2
  self.is_built = False
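
# A hypothetical usage sketch for the constructor above. The class name
# `BertClassifierSpec` is assumed for illustration only; the real class name
# may differ, and the arguments shown simply echo the documented defaults.
#
#   spec = BertClassifierSpec(
#       seq_len=128,
#       distribution_strategy='off',  # run without a distribution strategy
#       num_gpus=0)
#   assert spec.model_dir is not None  # a temp dir is created when None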
def test_export_tfhub(self):
  # Exports a SavedModel for TF-Hub.
  hidden_size = 16
  bert_config = configs.BertConfig(
      vocab_size=100,
      hidden_size=hidden_size,
      intermediate_size=32,
      max_position_embeddings=128,
      num_attention_heads=2,
      num_hidden_layers=1)
  bert_model, encoder = export_tfhub.create_bert_model(bert_config)
  model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
  checkpoint = tf.train.Checkpoint(model=encoder)
  checkpoint.save(os.path.join(model_checkpoint_dir, "test"))
  model_checkpoint_path = tf.train.latest_checkpoint(model_checkpoint_dir)

  vocab_file = os.path.join(self.get_temp_dir(), "uncased_vocab.txt")
  with tf.io.gfile.GFile(vocab_file, "w") as f:
    f.write("dummy content")

  hub_destination = os.path.join(self.get_temp_dir(), "hub")
  export_tfhub.export_bert_tfhub(bert_config, model_checkpoint_path,
                                 hub_destination, vocab_file)

  # Restores a hub KerasLayer.
  hub_layer = hub.KerasLayer(hub_destination, trainable=True)

  if hasattr(hub_layer, "resolved_object"):
    # Checks meta attributes.
    self.assertTrue(hub_layer.resolved_object.do_lower_case.numpy())
    with tf.io.gfile.GFile(
        hub_layer.resolved_object.vocab_file.asset_path.numpy()) as f:
      self.assertEqual("dummy content", f.read())

  # Checks the hub KerasLayer.
  for source_weight, hub_weight in zip(bert_model.trainable_weights,
                                       hub_layer.trainable_weights):
    self.assertAllClose(source_weight.numpy(), hub_weight.numpy())

  seq_length = 10
  dummy_ids = np.zeros((2, seq_length), dtype=np.int32)
  hub_outputs = hub_layer([dummy_ids, dummy_ids, dummy_ids])
  source_outputs = bert_model([dummy_ids, dummy_ids, dummy_ids])

  # The outputs of the hub module are "pooled_output" and "sequence_output",
  # while the outputs of the encoder are in reversed order, i.e.,
  # "sequence_output" and "pooled_output".
  encoder_outputs = reversed(encoder([dummy_ids, dummy_ids, dummy_ids]))
  self.assertEqual(hub_outputs[0].shape, (2, hidden_size))
  self.assertEqual(hub_outputs[1].shape, (2, seq_length, hidden_size))
  for source_output, hub_output, encoder_output in zip(
      source_outputs, hub_outputs, encoder_outputs):
    self.assertAllClose(source_output.numpy(), hub_output.numpy())
    self.assertAllClose(source_output.numpy(), encoder_output.numpy())

  # Test that training=True makes a difference (activates dropout).
  def _dropout_mean_stddev(training, num_runs=20):
    input_ids = np.array([[14, 12, 42, 95, 99]], np.int32)
    inputs = [input_ids, np.ones_like(input_ids), np.zeros_like(input_ids)]
    outputs = np.concatenate(
        [hub_layer(inputs, training=training)[0] for _ in range(num_runs)])
    return np.mean(np.std(outputs, axis=0))

  self.assertLess(_dropout_mean_stddev(training=False), 1e-6)
  self.assertGreater(_dropout_mean_stddev(training=True), 1e-3)

  # Test propagation of seq_length in shape inference.
  input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
  input_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
  input_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32)
  pooled_output, sequence_output = hub_layer(
      [input_word_ids, input_mask, input_type_ids])
  self.assertEqual(pooled_output.shape.as_list(), [None, hidden_size])
  self.assertEqual(sequence_output.shape.as_list(),
                   [None, seq_length, hidden_size])
def create_config(config_dir: str) -> configs.BertConfig:
  """Loads a BERT config object from a JSON config file."""
  with tf.io.gfile.GFile(config_dir) as config_file:
    bert_config = json.load(config_file)
  return configs.BertConfig(**bert_config)
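
# A minimal usage sketch for create_config, assuming a serialized BertConfig
# JSON file exists at the given path; the path is a placeholder, not a file
# shipped with this code.
#
#   bert_config = create_config('/tmp/bert_model/bert_config.json')
#   print(bert_config.hidden_size)  # fields mirror the JSON keys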
def run_classifier(self, train_input_fn, validation_input_fn, epochs,
                   steps_per_epoch, validation_steps, num_classes):
  """Creates classifier and runs the classifier training."""
  if epochs is None:
    epochs = self.default_training_epochs

  bert_config = bert_configs.BertConfig(
      0,
      initializer_range=self.initializer_range,
      hidden_dropout_prob=self.dropout_rate)
  warmup_steps = int(epochs * steps_per_epoch * 0.1)
  initial_lr = self.learning_rate

  def _get_classifier_model():
    """Gets a classifier model."""
    classifier_model, core_model = (
        bert_models.classifier_model(
            bert_config, num_classes, self.seq_len, hub_module_url=self.uri))
    classifier_model.optimizer = optimization.create_optimizer(
        initial_lr, steps_per_epoch * epochs, warmup_steps)
    return classifier_model, core_model

  # During distributed training, the loss used for gradient computation is
  # summed over all replicas. When the Keras compile/fit() API is used, fit()
  # internally normalizes the loss by dividing it by the number of replicas
  # used for computation. However, when a custom training loop is used, this
  # is not done automatically and should be done manually by the end user.
  loss_multiplier = 1.0
  if self.scale_loss:
    loss_multiplier = 1.0 / self.strategy.num_replicas_in_sync

  loss_fn = self.get_classification_loss_fn(
      num_classes, loss_factor=loss_multiplier)

  # Defines the evaluation metrics function, which will create metrics in the
  # correct device and strategy scope.
  def metric_fn():
    return tf.keras.metrics.SparseCategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)

  # Use a user-defined loop to start training.
  tf.compat.v1.logging.info('Training using customized training loop TF 2.0 '
                            'with distribution strategy.')
  bert_model = model_training_utils.run_customized_training_loop(
      strategy=self.strategy,
      model_fn=_get_classifier_model,
      loss_fn=loss_fn,
      model_dir=self.model_dir,
      steps_per_epoch=steps_per_epoch,
      steps_per_loop=self.steps_per_loop,
      epochs=epochs,
      train_input_fn=train_input_fn,
      eval_input_fn=validation_input_fn,
      eval_steps=validation_steps,
      init_checkpoint=None,
      metric_fn=metric_fn,
      custom_callbacks=None,
      run_eagerly=False)

  # Used in evaluation.
  with self.strategy.scope():
    bert_model, _ = _get_classifier_model()
    checkpoint_path = tf.train.latest_checkpoint(self.model_dir)
    checkpoint = tf.train.Checkpoint(model=bert_model)
    checkpoint.restore(checkpoint_path).expect_partial()
    bert_model.compile(loss=loss_fn, metrics=[metric_fn()])
  return bert_model
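
# A standalone arithmetic sketch of the loss-scaling note in run_classifier:
# with N replicas, scaling each replica's loss by 1/N makes the cross-replica
# sum match a single-replica run. The values below are illustrative only.
num_replicas = 8
per_replica_losses = [2.0] * num_replicas  # identical loss on each replica
summed_scaled_loss = sum(loss * (1.0 / num_replicas)
                         for loss in per_replica_losses)
assert summed_scaled_loss == 2.0  # equals the unscaled single-replica loss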
def test_forward_pass(self,
                      use_pointing=False,
                      query_transformer=False,
                      is_training=True):
  """Randomly generates and runs different configurations for Felix Tagger."""
  # Setup.
  sequence_length = 7
  vocab_size = 11
  bert_hidden_size = 13
  bert_num_hidden_layers = 1
  bert_num_attention_heads = 1
  bert_intermediate_size = 4
  bert_type_vocab_size = 2
  bert_max_position_embeddings = sequence_length
  bert_encoder = networks.BertEncoder(
      vocab_size=vocab_size,
      hidden_size=bert_hidden_size,
      num_layers=bert_num_hidden_layers,
      num_attention_heads=bert_num_attention_heads,
      intermediate_size=bert_intermediate_size,
      sequence_length=sequence_length,
      max_sequence_length=bert_max_position_embeddings,
      type_vocab_size=bert_type_vocab_size)
  bert_config = configs.BertConfig(
      vocab_size,
      hidden_size=bert_hidden_size,
      num_hidden_layers=bert_num_hidden_layers,
      num_attention_heads=bert_num_attention_heads,
      intermediate_size=bert_intermediate_size,
      type_vocab_size=bert_type_vocab_size,
      max_position_embeddings=bert_max_position_embeddings)
  batch_size = 17
  edit_tags_size = 19
  bert_config.num_classes = edit_tags_size
  bert_config.query_size = 23
  bert_config.query_transformer = query_transformer

  tagger = felix_tagger.FelixTagger(
      bert_encoder,
      bert_config=bert_config,
      seq_length=sequence_length,
      use_pointing=use_pointing,
      is_training=is_training)

  # Create inputs. Fixing the seed ensures reproducibility.
  np.random.seed(42)
  input_word_ids = np.random.randint(
      vocab_size - 1, size=(batch_size, sequence_length))
  input_mask = np.random.randint(1, size=(batch_size, sequence_length))
  input_type_ids = np.ones((batch_size, sequence_length))
  edit_tags = np.random.randint(
      edit_tags_size - 2, size=(batch_size, sequence_length))

  # Run the model.
  if is_training:
    output = tagger([input_word_ids, input_type_ids, input_mask, edit_tags])
  else:
    output = tagger([input_word_ids, input_type_ids, input_mask])

  # Check output shapes.
  if use_pointing:
    tag_logits, pointing_logits = output
    self.assertEqual(pointing_logits.shape,
                     (batch_size, sequence_length, sequence_length))
  else:
    tag_logits = output[0]
  self.assertEqual(tag_logits.shape,
                   (batch_size, sequence_length, edit_tags_size))
def __init__(self, config: model_config.VanillaLinearVAECellConfig):
  self._gumbel_softmax_label_adjustment_multiplier = (
      config.gumbel_softmax_label_adjustment_multiplier)

  vocab_embeddings_initializer = None
  if config.shared_bert_embedding:
    shared_embedding_layer = _BERT(
        config.max_seq_length,
        bert_config=configs.BertConfig(**config.shared_bert_embedding_config),
        trainable=config.trainable_embedding)
  else:
    # If word_embedding_path is specified, use the embedding size of the
    # pre-trained embeddings.
    if config.word_embedding_path:
      with tf.io.gfile.GFile(config.word_embedding_path,
                             'rb') as embedding_file:
        word_embedding = np.load(embedding_file)
      embedding_vocab_size, embed_size = word_embedding.shape
      if config.vocab_size != embedding_vocab_size:
        raise ValueError(
            'Expected consistent vocab size between vocab.txt and the '
            'embedding, found {} and {}.'.format(embedding_vocab_size,
                                                 config.vocab_size))
      config.embed_size = embed_size
      vocab_embeddings_initializer = (
          tf.keras.initializers.Constant(word_embedding))
    if config.shared_embedding:
      shared_embedding_layer = _Embedding(
          INPUT_ID_NAME,
          config.vocab_size,
          config.embed_size,
          embeddings_initializer=vocab_embeddings_initializer,
          input_length=config.max_seq_length,
          trainable=config.trainable_embedding)
    else:
      shared_embedding_layer = None

  encoder = DualRNNEncoder(
      vocab_size=config.vocab_size,
      embed_size=config.embed_size,
      max_seq_length=config.max_seq_length,
      hidden_size=config.encoder_hidden_size,
      num_layers=config.num_ecnoder_rnn_layers,
      dropout=config.dropout,
      cell_type=config.encoder_cell_type,
      embeddings_initializer=vocab_embeddings_initializer,
      shared_embedding_layer=shared_embedding_layer,
      trainable_embedding=config.trainable_embedding)

  sampler = utils.GumbelSoftmaxSampler(config.temperature, hard=False)

  decoder = DualRNNDecoder(
      vocab_size=config.vocab_size,
      embed_size=config.embed_size,
      max_seq_length=config.max_seq_length - 1,
      hidden_size=config.decoder_hidden_size,
      # Hardcoded to be 1 layer to align with the pytorch version. Otherwise,
      # we need to define the initial state for each layer in
      # _prepare_decoder_initial_state and change _post_process_decoder_state.
      num_layers=1,
      dropout=config.dropout,
      cell_type=config.decoder_cell_type,
      embeddings_initializer=vocab_embeddings_initializer,
      shared_embedding_layer=shared_embedding_layer,
      trainable_embedding=config.trainable_embedding,
      return_state=True)

  state_updater = _VanillaStateUpdater(config.state_updater_cell_type,
                                       config.num_states, config.dropout)

  self.encoder_output_projector = _VanillaEncoderOutputProjector(
      hidden_sizes=list(config.encoder_projection_sizes),
      output_size=config.num_states,
      dropout=config.dropout)
  self.sample_post_processor = utils.MLP(
      config.sampler_post_processor_output_sizes, dropout=config.dropout)
  self.shared_embedding_layer = shared_embedding_layer

  super(VanillaLinearVAECell, self).__init__(
      encoder=encoder,
      sampler=sampler,
      decoder=decoder,
      state_updater=state_updater)