def instantiate_from_cfg(config: BertPretrainerConfig,
                         encoder_network: Optional[tf.keras.Model] = None):
  """Instantiates a BertPretrainer from the config."""
  encoder_cfg = config.encoder
  if encoder_network is None:
    encoder_network = networks.TransformerEncoder(
        vocab_size=encoder_cfg.vocab_size,
        hidden_size=encoder_cfg.hidden_size,
        num_layers=encoder_cfg.num_layers,
        num_attention_heads=encoder_cfg.num_attention_heads,
        intermediate_size=encoder_cfg.intermediate_size,
        activation=tf_utils.get_activation(encoder_cfg.hidden_activation),
        dropout_rate=encoder_cfg.dropout_rate,
        attention_dropout_rate=encoder_cfg.attention_dropout_rate,
        max_sequence_length=encoder_cfg.max_position_embeddings,
        type_vocab_size=encoder_cfg.type_vocab_size,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=encoder_cfg.initializer_range))
  if config.cls_heads:
    classification_heads = [
        layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads
    ]
  else:
    classification_heads = []
  return bert_pretrainer.BertPretrainerV2(
      config.num_masked_tokens,
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg.initializer_range),
      encoder_network=encoder_network,
      classification_heads=classification_heads)

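# Hypothetical usage sketch (not part of the original source). It assumes the
# remaining BertPretrainerConfig fields (e.g. `encoder`, `cls_heads`) carry
# usable defaults; only the attributes read by instantiate_from_cfg above are
# relied on. A pre-built encoder can also be passed via `encoder_network` to
# reuse it across models.
example_pretrainer_config = BertPretrainerConfig(num_masked_tokens=76)
example_pretrainer = instantiate_from_cfg(example_pretrainer_config)
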
def _build_pretrainer(
    config: electra.ElectraPretrainerConfig) -> models.ElectraPretrainer:
  """Instantiates ElectraPretrainer from the config."""
  generator_encoder_cfg = config.generator_encoder
  discriminator_encoder_cfg = config.discriminator_encoder
  # Copy discriminator's embeddings to generator for easier model serialization.
  discriminator_network = encoders.build_encoder(discriminator_encoder_cfg)
  if config.tie_embeddings:
    embedding_layer = discriminator_network.get_embedding_layer()
    generator_network = encoders.build_encoder(
        generator_encoder_cfg, embedding_layer=embedding_layer)
  else:
    generator_network = encoders.build_encoder(generator_encoder_cfg)

  generator_encoder_cfg = generator_encoder_cfg.get()
  return models.ElectraPretrainer(
      generator_network=generator_network,
      discriminator_network=discriminator_network,
      vocab_size=generator_encoder_cfg.vocab_size,
      num_classes=config.num_classes,
      sequence_length=config.sequence_length,
      num_token_predictions=config.num_masked_tokens,
      mlm_activation=tf_utils.get_activation(
          generator_encoder_cfg.hidden_activation),
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=generator_encoder_cfg.initializer_range),
      classification_heads=[
          layers.ClassificationHead(**cfg.as_dict())
          for cfg in config.cls_heads
      ],
      disallow_correct=config.disallow_correct)

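# Hypothetical usage sketch (not part of the original source). Assumes
# ElectraPretrainerConfig supplies defaults for the generator and
# discriminator encoder configs consumed by _build_pretrainer above.
example_electra_pretrainer = _build_pretrainer(electra.ElectraPretrainerConfig())
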
def __init__(
    self,
    network: Union[tf.keras.layers.Layer, tf.keras.Model],
    num_classes: int,
    initializer: tf.keras.initializers.Initializer = 'random_normal',
    summary_type: str = 'last',
    dropout_rate: float = 0.1,
    **kwargs):
  super().__init__(**kwargs)
  self._network = network
  self._initializer = initializer
  self._summary_type = summary_type
  self._num_classes = num_classes
  self._config = {
      'network': network,
      'initializer': initializer,
      'num_classes': num_classes,
      'summary_type': summary_type,
      'dropout_rate': dropout_rate,
  }
  if summary_type == 'last':
    cls_token_idx = -1
  elif summary_type == 'first':
    cls_token_idx = 0
  else:
    raise ValueError('Invalid summary type provided: %s.' % summary_type)

  self.classifier = layers.ClassificationHead(
      inner_dim=network.get_config()['inner_size'],
      num_classes=num_classes,
      initializer=initializer,
      dropout_rate=dropout_rate,
      cls_token_idx=cls_token_idx,
      name='sentence_prediction')

def build_small_model(self, model_cfg):
  encoder_cfg = model_cfg['encoder']['bert']
  dataconf = self.task_config.train_data
  encoder_network = small_encoder_lib.TransformerEncoder(
      vocab_size=encoder_cfg['vocab_size'],
      hidden_size=encoder_cfg['hidden_size'],
      num_layers=encoder_cfg['num_layers'],
      num_attention_heads=encoder_cfg['num_attention_heads'],
      intermediate_size=encoder_cfg['intermediate_size'],
      activation=tf_utils.get_activation(encoder_cfg['hidden_activation']),
      dropout_rate=encoder_cfg['dropout_rate'],
      attention_dropout_rate=encoder_cfg['attention_dropout_rate'],
      max_sequence_length=encoder_cfg['max_position_embeddings'],
      type_vocab_size=encoder_cfg['type_vocab_size'],
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg['initializer_range']),
      net2net_ratio=encoder_cfg['net2net_ratio'],
      net2net_layers=encoder_cfg['net2net_layers'],
      lightatt_layers=encoder_cfg['lightatt_layers'],
      input_pool_name=encoder_cfg['input_pool_name'],
      input_pool_size=encoder_cfg['input_pool_size'])

  sequence_length = dataconf.seq_length
  predict_length = dataconf.max_predictions_per_seq
  dummy_inputs = dict(
      input_mask=tf.zeros((1, sequence_length), dtype=tf.int32),
      input_positions=tf.zeros((1, sequence_length), dtype=tf.int32),
      input_type_ids=tf.zeros((1, sequence_length), dtype=tf.int32),
      input_word_ids=tf.zeros((1, sequence_length), dtype=tf.int32),
      masked_lm_positions=tf.zeros((1, predict_length), dtype=tf.int32),
      masked_input_ids=tf.zeros((1, predict_length), dtype=tf.int32),
      masked_segment_ids=tf.zeros((1, predict_length), dtype=tf.int32),
      masked_lm_weights=tf.zeros((1, predict_length), dtype=tf.float32))
  _ = encoder_network(dummy_inputs)

  if 'cls_heads' in model_cfg:
    classification_heads = [
        layers.ClassificationHead(**cfg) for cfg in model_cfg['cls_heads']
    ]
  else:
    classification_heads = []
  model = small_pretrainer.BertPretrainModel(
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=encoder_cfg['initializer_range']),
      mlm_activation=tf_utils.get_activation(encoder_cfg['hidden_activation']),
      encoder_network=encoder_network,
      classification_heads=classification_heads)
  _ = model(dummy_inputs)
  return model

def instantiate_classification_heads_from_cfgs(
    cls_head_configs: List[bert.ClsHeadConfig]
) -> List[layers.ClassificationHead]:
  if cls_head_configs:
    return [
        layers.ClassificationHead(**cfg.as_dict())
        for cfg in cls_head_configs
    ]
  else:
    return []

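# Hypothetical usage sketch (not part of the original source); the
# ClsHeadConfig keyword arguments mirror those used elsewhere in this section.
# An empty (or None) config list simply yields [].
example_cls_heads = instantiate_classification_heads_from_cfgs([
    bert.ClsHeadConfig(
        inner_dim=256, num_classes=2, dropout_rate=0.1, name='next_sentence')
])
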
def __init__(self,
             generator_network,
             discriminator_network,
             vocab_size,
             num_classes,
             sequence_length,
             num_token_predictions,
             mlm_activation=None,
             mlm_initializer='glorot_uniform',
             output_type='logits',
             disallow_correct=False,
             **kwargs):
  super(ElectraPretrainer, self).__init__()
  self._config = {
      'generator_network': generator_network,
      'discriminator_network': discriminator_network,
      'vocab_size': vocab_size,
      'num_classes': num_classes,
      'sequence_length': sequence_length,
      'num_token_predictions': num_token_predictions,
      'mlm_activation': mlm_activation,
      'mlm_initializer': mlm_initializer,
      'output_type': output_type,
      'disallow_correct': disallow_correct,
  }
  for k, v in kwargs.items():
    self._config[k] = v

  self.generator_network = generator_network
  self.discriminator_network = discriminator_network
  self.vocab_size = vocab_size
  self.num_classes = num_classes
  self.sequence_length = sequence_length
  self.num_token_predictions = num_token_predictions
  self.mlm_activation = mlm_activation
  self.mlm_initializer = mlm_initializer
  self.output_type = output_type
  self.disallow_correct = disallow_correct

  self.masked_lm = layers.MaskedLM(
      embedding_table=generator_network.get_embedding_table(),
      activation=mlm_activation,
      initializer=mlm_initializer,
      output=output_type,
      name='generator_masked_lm')
  self.classification = layers.ClassificationHead(
      inner_dim=generator_network._config_dict['hidden_size'],
      num_classes=num_classes,
      initializer=mlm_initializer,
      name='generator_classification_head')
  self.discriminator_projection = tf.keras.layers.Dense(
      units=discriminator_network._config_dict['hidden_size'],
      activation=mlm_activation,
      kernel_initializer=mlm_initializer,
      name='discriminator_projection_head')
  self.discriminator_head = tf.keras.layers.Dense(
      units=1, kernel_initializer=mlm_initializer)

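# Hypothetical construction sketch (not part of the original source). The two
# networks are assumed to be BERT-style encoders exposing get_embedding_table()
# and _config_dict, as required by __init__ above:
#   generator_net = encoders.build_encoder(encoders.EncoderConfig(
#       type='bert', bert=encoders.BertEncoderConfig(
#           hidden_size=24, intermediate_size=48, num_layers=2)))
#   discriminator_net = encoders.build_encoder(encoders.EncoderConfig(
#       type='bert', bert=encoders.BertEncoderConfig(
#           hidden_size=24, intermediate_size=48, num_layers=2)))
#   electra_model = ElectraPretrainer(
#       generator_network=generator_net,
#       discriminator_network=discriminator_net,
#       vocab_size=30522, num_classes=2, sequence_length=128,
#       num_token_predictions=20)
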
def __init__(self,
             network,
             num_classes,
             initializer='glorot_uniform',
             dropout_rate=0.1,
             use_encoder_pooler=True,
             **kwargs):
  self._self_setattr_tracking = False
  self._network = network
  self._config = {
      'network': network,
      'num_classes': num_classes,
      'initializer': initializer,
      'use_encoder_pooler': use_encoder_pooler,
  }

  # We want to use the inputs of the passed network as the inputs to this
  # Model. To do this, we need to keep a handle to the network inputs for use
  # when we construct the Model object at the end of init.
  inputs = network.inputs

  if use_encoder_pooler:
    # Because we have a copy of inputs to create this Model object, we can
    # invoke the Network object with its own input tensors to start the Model.
    outputs = network(inputs)
    if isinstance(outputs, list):
      cls_output = outputs[1]
    else:
      cls_output = outputs['pooled_output']
    cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)

    self.classifier = networks.Classification(
        input_width=cls_output.shape[-1],
        num_classes=num_classes,
        initializer=initializer,
        output='logits',
        name='sentence_prediction')
    predictions = self.classifier(cls_output)
  else:
    outputs = network(inputs)
    if isinstance(outputs, list):
      sequence_output = outputs[0]
    else:
      sequence_output = outputs['sequence_output']
    self.classifier = layers.ClassificationHead(
        inner_dim=sequence_output.shape[-1],
        num_classes=num_classes,
        initializer=initializer,
        dropout_rate=dropout_rate,
        name='sentence_prediction')
    predictions = self.classifier(sequence_output)

  super(BertClassifier, self).__init__(
      inputs=inputs, outputs=predictions, **kwargs)

def test_copy_pooler_dense_to_encoder(self):
  encoder_config = encoders.EncoderConfig(
      type="bert",
      bert=encoders.BertEncoderConfig(
          hidden_size=24, intermediate_size=48, num_layers=2))
  cls_heads = [
      layers.ClassificationHead(
          inner_dim=24, num_classes=2, name="next_sentence")
  ]
  encoder = encoders.build_encoder(encoder_config)
  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder,
      classification_heads=cls_heads,
      mlm_activation=tf_utils.get_activation(
          encoder_config.get().hidden_activation))
  # Makes sure the pretrainer variables are created.
  _ = pretrainer(pretrainer.inputs)
  checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
  model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
  checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

  vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
      self.get_temp_dir(), use_sp_model=True)
  export_path = os.path.join(self.get_temp_dir(), "hub")
  export_tfhub_lib.export_model(
      export_path=export_path,
      encoder_config=encoder_config,
      model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
      with_mlm=True,
      copy_pooler_dense_to_encoder=True,
      vocab_file=vocab_file,
      sp_model_file=sp_model_file,
      do_lower_case=True)

  # Restores a hub KerasLayer.
  hub_layer = hub.KerasLayer(export_path, trainable=True)
  dummy_ids = np.zeros((2, 10), dtype=np.int32)
  input_dict = dict(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  hub_pooled_output = hub_layer(input_dict)["pooled_output"]
  encoder_outputs = encoder(input_dict)
  # Verify that hub_layer's pooled_output is the same as the output of next
  # sentence prediction's dense layer.
  pretrained_pooled_output = cls_heads[0].dense(
      encoder_outputs["sequence_output"][:, 0, :])
  self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
  # But the pooled_output between encoder and hub_layer are not the same.
  encoder_pooled_output = encoder_outputs["pooled_output"]
  self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)

def build_model(self, params=None):
  config = params or self.task_config.model
  encoder_cfg = config.encoder
  encoder_network = self._build_encoder(encoder_cfg)
  cls_heads = [
      layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads
  ] if config.cls_heads else []
  return models.BertPretrainerV2(
      mlm_activation=tf_utils.get_activation(config.mlm_activation),
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=config.mlm_initializer_range),
      encoder_network=encoder_network,
      classification_heads=cls_heads)

def _build_pretrainer(self, pretrainer_cfg: bert.PretrainerConfig, name: str):
  """Builds pretrainer from config and encoder."""
  encoder = encoders.build_encoder(pretrainer_cfg.encoder)
  if pretrainer_cfg.cls_heads:
    cls_heads = [
        layers.ClassificationHead(**cfg.as_dict())
        for cfg in pretrainer_cfg.cls_heads
    ]
  else:
    cls_heads = []

  masked_lm = layers.MobileBertMaskedLM(
      embedding_table=encoder.get_embedding_table(),
      activation=tf_utils.get_activation(pretrainer_cfg.mlm_activation),
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=pretrainer_cfg.mlm_initializer_range),
      name='cls/predictions')

  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder,
      classification_heads=cls_heads,
      customized_masked_lm=masked_lm,
      name=name)
  return pretrainer

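# Hypothetical usage sketch (not part of the original source); the config
# mirrors the MobileBERT pretrainer configs used elsewhere in this section:
#   teacher = self._build_pretrainer(
#       bert.PretrainerConfig(
#           encoder=encoders.EncoderConfig(
#               type='mobilebert',
#               mobilebert=encoders.MobileBertEncoderConfig(num_blocks=4)),
#           cls_heads=[bert.ClsHeadConfig(
#               inner_dim=256, num_classes=2, name='next_sentence')],
#           mlm_activation='gelu'),
#       name='teacher')
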
def __init__(self,
             network,
             num_classes,
             initializer='glorot_uniform',
             dropout_rate=0.1,
             use_encoder_pooler=True,
             cls_head=None,
             **kwargs):
  self.num_classes = num_classes
  self.initializer = initializer
  self.use_encoder_pooler = use_encoder_pooler
  self.cls_head = cls_head

  # We want to use the inputs of the passed network as the inputs to this
  # Model. To do this, we need to keep a handle to the network inputs for use
  # when we construct the Model object at the end of init.
  inputs = network.inputs

  if use_encoder_pooler:
    # Because we have a copy of inputs to create this Model object, we can
    # invoke the Network object with its own input tensors to start the Model.
    outputs = network(inputs)
    if isinstance(outputs, list):
      cls_inputs = outputs[1]
    else:
      cls_inputs = outputs['pooled_output']
    cls_inputs = tf.keras.layers.Dropout(rate=dropout_rate)(cls_inputs)
  else:
    outputs = network(inputs)
    if isinstance(outputs, list):
      cls_inputs = outputs[0]
    else:
      cls_inputs = outputs['sequence_output']

  if cls_head:
    classifier = cls_head
  else:
    classifier = layers.ClassificationHead(
        inner_dim=0 if use_encoder_pooler else cls_inputs.shape[-1],
        num_classes=num_classes,
        initializer=initializer,
        dropout_rate=dropout_rate,
        name='sentence_prediction')

  predictions = classifier(cls_inputs)

  # b/164516224
  # Once we've created the network using the Functional API, we call
  # super().__init__ as though we were invoking the Functional API Model
  # constructor, resulting in this object having all the properties of a model
  # created using the Functional API. Once super().__init__ is called, we
  # can assign attributes to `self` - note that all `self` assignments are
  # below this line.
  super(BertClassifier, self).__init__(
      inputs=inputs, outputs=predictions, **kwargs)
  self._network = network

  config_dict = self._make_config_dict()
  # We are storing the config dict as a namedtuple here to ensure checkpoint
  # compatibility with an earlier version of this model which did not track
  # the config dict attribute. TF does not track immutable attrs which
  # do not contain Trackables, so by creating a config namedtuple instead of
  # a dict we avoid tracking it.
  config_cls = collections.namedtuple('Config', config_dict.keys())
  self._config = config_cls(**config_dict)
  self.classifier = classifier

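# Hypothetical usage sketch (not part of the original source). Assumes a
# BERT-style encoder built via encoders.build_encoder, as done elsewhere in
# this section; dummy int32 inputs exercise the resulting functional model:
#   encoder = encoders.build_encoder(encoders.EncoderConfig(
#       type='bert', bert=encoders.BertEncoderConfig(
#           hidden_size=24, intermediate_size=48, num_layers=2)))
#   classifier = BertClassifier(network=encoder, num_classes=3)
#   logits = classifier(dict(
#       input_word_ids=tf.zeros((2, 10), dtype=tf.int32),
#       input_mask=tf.ones((2, 10), dtype=tf.int32),
#       input_type_ids=tf.zeros((2, 10), dtype=tf.int32)))
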
def prepare_config(self, teacher_block_num, student_block_num,
                   transfer_teacher_layers):
  # using small model for testing
  task_config = distillation.BertDistillationTaskConfig(
      teacher_model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              type='mobilebert',
              mobilebert=encoders.MobileBertEncoderConfig(
                  num_blocks=teacher_block_num)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=256,
                  num_classes=2,
                  dropout_rate=0.1,
                  name='next_sentence')
          ],
          mlm_activation='gelu'),
      student_model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              type='mobilebert',
              mobilebert=encoders.MobileBertEncoderConfig(
                  num_blocks=student_block_num)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=256,
                  num_classes=2,
                  dropout_rate=0.1,
                  name='next_sentence')
          ],
          mlm_activation='relu'),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path='dummy',
          max_predictions_per_seq=76,
          seq_length=512,
          global_batch_size=10),
      validation_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path='dummy',
          max_predictions_per_seq=76,
          seq_length=512,
          global_batch_size=10))

  # set only 1 step for each stage
  progressive_config = distillation.BertDistillationProgressiveConfig()
  progressive_config.layer_wise_distill_config.transfer_teacher_layers = (
      transfer_teacher_layers)
  progressive_config.layer_wise_distill_config.num_steps = 1
  progressive_config.pretrain_distill_config.num_steps = 1

  optimization_config = optimization.OptimizationConfig(
      optimizer=optimization.OptimizerConfig(
          type='lamb',
          lamb=optimization.LAMBConfig(
              weight_decay_rate=0.0001,
              exclude_from_weight_decay=[
                  'LayerNorm', 'layer_norm', 'bias', 'no_norm'
              ])),
      learning_rate=optimization.LrConfig(
          type='polynomial',
          polynomial=optimization.PolynomialLrConfig(
              initial_learning_rate=1.5e-3,
              decay_steps=10000,
              end_learning_rate=1.5e-3)),
      warmup=optimization.WarmupConfig(
          type='linear',
          linear=optimization.LinearWarmupConfig(warmup_learning_rate=0)))

  exp_config = cfg.ExperimentConfig(
      task=task_config,
      trainer=prog_trainer_lib.ProgressiveTrainerConfig(
          progressive=progressive_config,
          optimizer_config=optimization_config))

  # Create a teacher model checkpoint.
  teacher_encoder = encoders.build_encoder(task_config.teacher_model.encoder)
  pretrainer_config = task_config.teacher_model
  if pretrainer_config.cls_heads:
    teacher_cls_heads = [
        layers.ClassificationHead(**cfg.as_dict())
        for cfg in pretrainer_config.cls_heads
    ]
  else:
    teacher_cls_heads = []

  masked_lm = layers.MobileBertMaskedLM(
      embedding_table=teacher_encoder.get_embedding_table(),
      activation=tf_utils.get_activation(pretrainer_config.mlm_activation),
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=pretrainer_config.mlm_initializer_range),
      name='cls/predictions')
  teacher_pretrainer = models.BertPretrainerV2(
      encoder_network=teacher_encoder,
      classification_heads=teacher_cls_heads,
      customized_masked_lm=masked_lm)

  # The model variables will be created after the forward call.
  _ = teacher_pretrainer(teacher_pretrainer.inputs)
  teacher_pretrainer_ckpt = tf.train.Checkpoint(
      **teacher_pretrainer.checkpoint_items)
  teacher_ckpt_path = os.path.join(self.get_temp_dir(), 'teacher_model.ckpt')
  teacher_pretrainer_ckpt.save(teacher_ckpt_path)
  exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir()

  return exp_config

def call(self, inputs):
  """ELECTRA forward pass.

  Args:
    inputs: A dict of all inputs, same as the standard BERT model.