def __init__(self, params):
  """Constructor for BertModel.

  Args:
    params: `BigBirdConfig` dictionary.
  """
  self.params = copy.deepcopy(params)
  self.scope = params["scope"]
  with tf.compat.v1.variable_scope(self.scope,
                                   reuse=tf.compat.v1.AUTO_REUSE) as vs:
    self.embeder = utils.EmbeddingLayer(
        vocab_size=self.params["vocab_size"],
        emb_dim=self.params["hidden_size"],
        initializer=utils.create_initializer(
            self.params["initializer_range"]),
        scale_emb=self.params["rescale_embedding"],
        use_token_type=True,
        num_token_types=self.params["type_vocab_size"],
        use_position_embeddings=True,
        max_position_embeddings=self.params["max_position_embeddings"],
        dropout_prob=self.params["hidden_dropout_prob"])
    self.encoder = encoder.EncoderStack(self.params)
    self.pooler = tf.compat.v1.layers.Dense(
        units=self.params["hidden_size"],
        activation=tf.tanh,
        kernel_initializer=utils.create_initializer(
            self.params["initializer_range"]),
        name="pooler/dense")
  super(BertModel, self).__init__(name=self.scope, _scope=vs)
def __init__(self,
             attention_type,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             num_rand_blocks=3,
             block_size=64,
             use_bias=True,
             seed=None,
             name=None):
  """Constructor of an encoder layer of a transformer in Pegasus style.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for the intermediate
      layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the
      hidden (output) layers.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    block_size: (optional) int. Size of a block in the sequence.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    name: The name scope of this layer.
  """
  super(PrenormEncoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layer
  attention_head_size = hidden_size // num_attention_heads
  self.attn_layer = attention.MultiHeadedAttentionLayer(
      attention_type, num_attention_heads, num_rand_blocks,
      attention_head_size, initializer_range, block_size, block_size,
      attention_probs_dropout_prob, use_bias, seed, name="self")

  # Dense layers
  self.projection_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
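# --- Illustration (not part of the repo): pre-norm vs. post-norm ordering. ---
# The Pegasus-style layer above normalizes *before* each sub-layer and adds
# the residual afterward; the BERT-style layer further below normalizes
# *after* the residual add. A minimal NumPy sketch of the two orderings,
# with a hypothetical stand-in for the attention/FFN sub-layer:
import numpy as np

def _layer_norm(x, eps=1e-6):
  # Normalize over the last axis, as a LayerNorm such as utils.NormLayer does.
  mu = x.mean(-1, keepdims=True)
  var = x.var(-1, keepdims=True)
  return (x - mu) / np.sqrt(var + eps)

def prenorm_block(x, sublayer):
  # Pegasus style: x + sublayer(LN(x))
  return x + sublayer(_layer_norm(x))

def postnorm_block(x, sublayer):
  # BERT style: LN(x + sublayer(x))
  return _layer_norm(x + sublayer(x))

_x = np.random.randn(2, 8, 16)        # [batch, seq, hidden]
_sub = lambda h: np.maximum(h, 0.0)   # hypothetical stand-in sub-layer
assert prenorm_block(_x, _sub).shape == postnorm_block(_x, _sub).shape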
def __init__(self, params):
  """Constructor for BertModel.

  Args:
    params: `BigBirdConfig` dictionary.
  """
  self.params = copy.deepcopy(params)
  self.scope = params["scope"]
  super(BertModel, self).__init__(name=self.scope)

  # validate params
  self.pad = lambda x: x
  if params["max_encoder_length"] <= 512:
    logging.info("Switching to full attention for short sequences")
    self.params["attention_type"] = "original_full"
  if self.params["attention_type"] == "simulated_sparse" or self.params[
      "attention_type"] == "block_sparse":
    if params["max_encoder_length"] % params["block_size"]:
      logging.info("Expand max_encoder_length to next multiple of block_size")
      self.params["max_encoder_length"] = (
          params["max_encoder_length"] // params["block_size"] + 1
      ) * params["block_size"]
      pad_size = self.params["max_encoder_length"] - params[
          "max_encoder_length"]
      paddings = [[0, 0], [0, pad_size]]
      self.pad = lambda x: tf.pad(x, paddings)

  with tf.compat.v1.variable_scope(self.scope,
                                   reuse=tf.compat.v1.AUTO_REUSE):
    self.embeder = utils.EmbeddingLayer(
        vocab_size=self.params["vocab_size"],
        emb_dim=self.params["hidden_size"],
        initializer=utils.create_initializer(
            self.params["initializer_range"]),
        scale_emb=self.params["rescale_embedding"],
        use_token_type=True,
        num_token_types=self.params["type_vocab_size"],
        use_position_embeddings=True,
        max_position_embeddings=self.params["max_position_embeddings"],
        dropout_prob=self.params["hidden_dropout_prob"])
    self.encoder = encoder.EncoderStack(self.params)
    self.pooler = utils.SimpleDenseLayer(
        input_size=self.params["hidden_size"],
        output_size=self.params["hidden_size"],
        initializer=utils.create_initializer(
            self.params["initializer_range"]),
        activation=tf.tanh,
        name="pooler/dense")
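# --- Illustration (not part of the repo): the block-size rounding above. ---
# For the sparse attention types, max_encoder_length is padded up to the next
# multiple of block_size; a minimal sketch of that arithmetic:
def _pad_to_block_multiple(max_encoder_length, block_size):
  # Same arithmetic as the constructor above: round up, pad the difference.
  if max_encoder_length % block_size:
    padded = (max_encoder_length // block_size + 1) * block_size
  else:
    padded = max_encoder_length
  return padded, padded - max_encoder_length

# e.g. a 1000-token encoder with 64-token blocks is padded to 1024 (pad 24).
assert _pad_to_block_multiple(1000, 64) == (1024, 24)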
def __init__(self, params, input_ids, target_ids=None, training=None):
  """Constructor for TransformerModel.

  Args:
    params: `BigBirdConfig` dictionary.
    input_ids: int32 Tensor of shape [batch_size, seq_length].
    target_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
    training: Boolean indicating whether the call is training or inference.
  """
  self.params = copy.deepcopy(params)
  self.scope = params["scope"]
  with tf.compat.v1.variable_scope(
      self.scope, reuse=tf.compat.v1.AUTO_REUSE) as vs:
    self.embeder = utils.EmbeddingLayer(
        vocab_size=self.params["vocab_size"],
        emb_dim=self.params["hidden_size"],
        initializer=utils.create_initializer(
            self.params["initializer_range"]),
        scale_emb=self.params["rescale_embedding"],
        use_token_type=False,
        num_token_types=None,
        use_position_embeddings=True,
        max_position_embeddings=self.params["max_position_embeddings"],
        dropout_prob=self.params["hidden_dropout_prob"])

    # encoder
    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    self.encoder = encoder.EncoderStack(self.params)
    self.encoder_output, encoder_mask = self._encode(input_ids, training)

    # decoder
    self.decoder = decoder.DecoderStack(self.params)
    self.predictions = self._decode_and_predict(target_ids,
                                                self.encoder_output,
                                                encoder_mask, training)
  super(TransformerModel, self).__init__(name=self.scope, _scope=vs)
def __init__(self, params):
  """Constructor for TransformerModel.

  Args:
    params: `BigBirdConfig` dictionary.
  """
  self.params = copy.deepcopy(params)
  self.scope = params["scope"]
  with tf.compat.v1.variable_scope(self.scope,
                                   reuse=tf.compat.v1.AUTO_REUSE) as vs:
    self.embeder = utils.EmbeddingLayer(
        vocab_size=self.params["vocab_size"],
        emb_dim=self.params["hidden_size"],
        initializer=utils.create_initializer(
            self.params["initializer_range"]),
        scale_emb=self.params["rescale_embedding"],
        use_token_type=False,
        num_token_types=None,
        use_position_embeddings=True,
        max_position_embeddings=self.params["max_position_embeddings"],
        dropout_prob=self.params["hidden_dropout_prob"])
    self.encoder = encoder.EncoderStack(self.params)
    self.decoder = decoder.DecoderStack(self.params)
  super(TransformerModel, self).__init__(name=self.scope, _scope=vs)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.BertModel(bert_config)
  masked_lm = MaskedLMLayer(
      bert_config["hidden_size"], bert_config["vocab_size"], model.embeder,
      initializer=utils.create_initializer(bert_config["initializer_range"]),
      activation_fn=utils.get_activation(bert_config["hidden_act"]))
  next_sentence = NSPLayer(
      bert_config["hidden_size"],
      initializer=utils.create_initializer(bert_config["initializer_range"]))

  sequence_output, pooled_output = model(
      features["input_ids"],
      training=is_training,
      token_type_ids=features.get("segment_ids"))

  masked_lm_loss, masked_lm_log_probs = masked_lm(
      sequence_output,
      label_ids=features.get("masked_lm_ids"),
      label_weights=features.get("masked_lm_weights"),
      masked_lm_positions=features.get("masked_lm_positions"))

  next_sentence_loss, next_sentence_log_probs = next_sentence(
      pooled_output, features.get("next_sentence_labels"))

  total_loss = masked_lm_loss
  if bert_config["use_nsp"]:
    total_loss += next_sentence_loss

  tvars = tf.compat.v1.trainable_variables()
  utils.log_variables(tvars, bert_config["ckpt_var_list"])

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    learning_rate = optimization.get_linear_warmup_linear_decay_lr(
        init_lr=bert_config["learning_rate"],
        num_train_steps=bert_config["num_train_steps"],
        num_warmup_steps=bert_config["num_warmup_steps"])

    optimizer = optimization.get_optimizer(bert_config, learning_rate)

    global_step = tf.compat.v1.train.get_global_step()

    gradients = optimizer.compute_gradients(total_loss, tvars)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=utils.add_scalars_to_summary(
            bert_config["output_dir"], {"learning_rate": learning_rate}))

  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(masked_lm_loss_value, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights, next_sentence_loss_value,
                  next_sentence_log_probs, next_sentence_labels):
      """Computes the loss and accuracy of the model."""
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.compat.v1.metrics.mean(
          values=masked_lm_loss_value)
      next_sentence_predictions = tf.argmax(
          next_sentence_log_probs, axis=-1, output_type=tf.int32)
      next_sentence_accuracy = tf.compat.v1.metrics.accuracy(
          labels=next_sentence_labels, predictions=next_sentence_predictions)
      next_sentence_mean_loss = tf.compat.v1.metrics.mean(
          values=next_sentence_loss_value)

      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
          "next_sentence_accuracy": next_sentence_accuracy,
          "next_sentence_loss": next_sentence_mean_loss,
      }

    eval_metrics = (metric_fn, [
        masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
        features["masked_lm_weights"], next_sentence_loss,
        next_sentence_log_probs, features["next_sentence_labels"]
    ])

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, eval_metrics=eval_metrics)

  else:
    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "log-probabilities": masked_lm_log_probs,
            "seq-embeddings": sequence_output
        })

  return output_spec
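# --- Illustration (not part of the repo): weighted masked-LM accuracy. ---
# masked_lm_weights zeroes out padded prediction slots, so the accuracy in
# metric_fn above is a weighted mean; a minimal NumPy sketch of the same idea:
import numpy as np

def _weighted_accuracy(labels, predictions, weights):
  # Mirrors tf.compat.v1.metrics.accuracy with weights: slots with weight 0
  # (padding) do not count toward the mean.
  correct = (labels == predictions).astype(float)
  return float((correct * weights).sum() / weights.sum())

_labels = np.array([5, 9, 2, 0])
_preds = np.array([5, 1, 2, 7])
_weights = np.array([1.0, 1.0, 1.0, 0.0])  # last slot is padding
assert abs(_weighted_accuracy(_labels, _preds, _weights) - 2 / 3) < 1e-9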
def __init__(self,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             use_bias=True,
             name=None):
  """Constructor of a decoder layer of a transformer in Pegasus style.

  Args:
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for the intermediate
      layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the
      hidden (output) layers.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    name: The name scope of this layer.
  """
  super(PrenormDecoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layers
  attention_head_size = hidden_size // num_attention_heads
  self.self_attn_layer = attention.MultiHeadedAttentionLayer(
      "original_full", use_bias=use_bias, name="self",
      num_attention_heads=num_attention_heads,
      size_per_head=attention_head_size,
      initializer_range=initializer_range,
      attention_probs_dropout_prob=attention_probs_dropout_prob)
  self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
      "original_full", use_bias=use_bias, name="encdec",
      num_attention_heads=num_attention_heads,
      size_per_head=attention_head_size,
      initializer_range=initializer_range,
      attention_probs_dropout_prob=attention_probs_dropout_prob)

  # Dense layers
  self.self_proj_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.cross_proj_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
  self.third_layer_norm = utils.NormLayer()
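# --- Illustration (not part of the repo): causal self-attention masking. ---
# A decoder's self-attention is typically combined with a lower-triangular
# (causal) mask, built elsewhere in the stack rather than in this
# constructor, so position i attends only to positions <= i. A minimal
# NumPy sketch of such a mask:
import numpy as np

_seq_len = 5
_causal_mask = np.tril(np.ones((_seq_len, _seq_len)))
# Row i has ones in columns 0..i and zeros after, i.e. no peeking ahead.
assert _causal_mask[2, 2] == 1.0 and _causal_mask[2, 3] == 0.0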
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""

  if isinstance(features, dict):
    if not labels and "labels" in features:
      labels = features["labels"]
    features = features["input_ids"]

  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.BertModel(bert_config)
  headl = ClassifierLossLayer(
      bert_config["num_labels"], bert_config["hidden_dropout_prob"],
      utils.create_initializer(bert_config["initializer_range"]),
      name=bert_config["scope"] + "/classifier")

  _, pooled_output = model(features, training=is_training)
  total_loss, log_probs = headl(pooled_output, labels, is_training)

  tvars = tf.compat.v1.trainable_variables()
  utils.log_variables(tvars, bert_config["ckpt_var_list"])

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    learning_rate = optimization.get_linear_warmup_linear_decay_lr(
        init_lr=bert_config["learning_rate"],
        num_train_steps=bert_config["num_train_steps"],
        num_warmup_steps=bert_config["num_warmup_steps"])

    optimizer = optimization.get_optimizer(bert_config, learning_rate)

    global_step = tf.compat.v1.train.get_or_create_global_step()

    gradients = optimizer.compute_gradients(total_loss, tvars)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=utils.add_scalars_to_summary(
            bert_config["output_dir"], {"learning_rate": learning_rate}))

  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(loss_value, label_ids, log_probs):
      loss = tf.compat.v1.metrics.mean(values=loss_value)

      predictions = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
      accuracy = tf.compat.v1.metrics.accuracy(
          labels=label_ids, predictions=predictions)
      p1, p1_op = tf.compat.v1.metrics.precision_at_k(
          labels=tf.cast(label_ids, tf.int64), predictions=log_probs, k=1)
      r1, r1_op = tf.compat.v1.metrics.recall_at_k(
          labels=tf.cast(label_ids, tf.int64), predictions=log_probs, k=1)
      f11 = tf.math.divide_no_nan(2 * p1 * r1, p1 + r1)

      metric_dict = {
          "P@1": (p1, p1_op),
          "R@1": (r1, r1_op),
          "f1@1": (f11, tf.no_op()),
          "classification_accuracy": accuracy,
          "classification_loss": loss,
      }

      return metric_dict

    eval_metrics = (metric_fn,
                    [tf.expand_dims(total_loss, 0), labels, log_probs])

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, eval_metrics=eval_metrics)

  else:
    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode, predictions={"log-probabilities": log_probs})

  return output_spec
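# --- Illustration (not part of the repo): the f1@1 formula in metric_fn. ---
# F1 is the harmonic mean of precision and recall; a minimal sketch with
# hypothetical values, guarding division by zero like tf.math.divide_no_nan:
def _f1(precision, recall):
  denom = precision + recall
  return 0.0 if denom == 0 else 2 * precision * recall / denom

assert abs(_f1(0.8, 0.6) - 0.96 / 1.4) < 1e-12  # harmonic mean = 0.6857...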
def __init__(self,
             attention_type,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             num_rand_blocks=3,
             block_size=64,
             use_bias=True,
             seed=None,
             name=None):
  """Constructor of an encoder layer of a transformer in BERT style.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for the intermediate
      layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the
      hidden (output) layers.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    block_size: (optional) int. Size of a block in the sequence.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    name: The name scope of this layer.
  """
  super(PostnormEncoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Define the attention layer.
  # Head size for multi-head attention, e.g. 768 // 12 = 64.
  attention_head_size = hidden_size // num_attention_heads
  self.attn_layer = attention.MultiHeadedAttentionLayer(
      attention_type, num_attention_heads, num_rand_blocks,  # e.g. block_sparse, 12, 3
      attention_head_size, initializer_range, block_size, block_size,  # e.g. 64, 0.01, 16, 16
      attention_probs_dropout_prob, use_bias, seed, name="self")
  # e.g. 0.01, True; the seed grows with the encoder layer index (0-11).

  # Dense layers: refine the attention output by 1) projecting,
  # 2) expanding, and 3) contracting the features.
  # 1) Layer that projects the attention output.
  self.projection_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,  # e.g. 12, 64
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  # 2) Define the expansion layer.
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  # 3) Define the contraction layer, which produces the final features.
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
def __init__(self,
             attention_type,
             num_attention_heads=1,
             num_rand_blocks=3,
             size_per_head=512,
             initializer_range=0.02,
             from_block_size=64,
             to_block_size=64,
             attention_probs_dropout_prob=0.0,
             use_bias=True,
             seed=None,
             query_act=None,
             key_act=None,
             value_act=None,
             name=None,
             **kwargs):
  """Constructor for a multi-headed attention layer.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    size_per_head: (optional) int. Size of each attention head.
    initializer_range: (optional) float. Range of the weight initializer.
    from_block_size: (optional) int. Size of a block in the from sequence.
    to_block_size: (optional) int. Size of a block in the to sequence.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    use_bias: Whether the layer uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    name: The name scope of this layer.
    **kwargs: other keyword arguments.
  """
  super(MultiHeadedAttentionLayer, self).__init__(name=name, **kwargs)

  self.query_layer = utils.Dense3dLayer(
      num_attention_heads, size_per_head,
      utils.create_initializer(initializer_range), query_act,
      "query", head_first=True, use_bias=use_bias)

  self.key_layer = utils.Dense3dLayer(
      num_attention_heads, size_per_head,
      utils.create_initializer(initializer_range), key_act,
      "key", head_first=True, use_bias=use_bias)

  self.value_layer = utils.Dense3dLayer(
      num_attention_heads, size_per_head,
      utils.create_initializer(initializer_range), value_act,
      "value", head_first=True, use_bias=use_bias)

  def attn_impl(query, key, value, attention_mask, band_mask, from_mask,
                to_mask, from_blocked_mask, to_blocked_mask, batch_size,
                from_seq_length, to_seq_length, training):
    if attention_type == "original_full":
      logging.info("**** Using original full attention ****")
      attn_fn = original_full_attention(
          query, key, value, attention_mask, size_per_head,
          attention_probs_dropout_prob if training else 0.0)
    elif attention_type == "simulated_sparse":
      logging.info("**** Using simulated sparse attention ****")
      attn_fn = bigbird_simulated_attention(
          query, key, value, attention_mask, num_attention_heads,
          num_rand_blocks, size_per_head, from_seq_length, to_seq_length,
          from_block_size, to_block_size, seed)
    elif attention_type == "block_sparse":
      logging.info("**** Using block sparse attention ****")
      attn_fn = bigbird_block_sparse_attention(
          query, key, value, band_mask, from_mask, to_mask,
          from_blocked_mask, to_blocked_mask, num_attention_heads,
          num_rand_blocks, size_per_head, batch_size, from_seq_length,
          to_seq_length, from_block_size, to_block_size, seed)
    else:
      raise NotImplementedError(
          "Attention type {} is not implemented".format(attention_type))
    return attn_fn

  self.attn_impl = attn_impl
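# --- Illustration (not part of the repo): what the "original_full" branch
# computes conceptually. All three attention types are variants of scaled
# dot-product attention, softmax(QK^T / sqrt(d))V; a minimal NumPy sketch
# with hypothetical shapes and no masking or dropout:
import numpy as np

def _full_attention(q, k, v, size_per_head):
  # q, k, v: [batch, heads, seq, size_per_head]
  scores = np.einsum("bhqd,bhkd->bhqk", q, k) / np.sqrt(size_per_head)
  scores -= scores.max(-1, keepdims=True)   # numerical stability
  probs = np.exp(scores)
  probs /= probs.sum(-1, keepdims=True)     # softmax over the "to" axis
  return np.einsum("bhqk,bhkd->bhqd", probs, v)

_q = _k = _v = np.random.randn(2, 12, 16, 64)
assert _full_attention(_q, _k, _v, 64).shape == (2, 12, 16, 64)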
def __init__(self, params, input_ids, token_type_ids=None, training=None):
  """Constructor for BertModel.

  Args:
    params: `BigBirdConfig` dictionary.
    input_ids: int32 Tensor of shape [batch_size, seq_length].
    token_type_ids: (optional) int32 Tensor of shape [batch_size,
      seq_length].
    training: Boolean indicating whether the call is training or inference.
  """
  self.params = copy.deepcopy(params)
  self.scope = params["scope"]
  with tf.compat.v1.variable_scope(
      self.scope, reuse=tf.compat.v1.AUTO_REUSE) as vs:
    # If no token types were provided, use token type 0 for every position
    # so the token-type embedding still has a well-defined input.
    if token_type_ids is None:
      token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32)
    # Build the input mask from input_ids: 1 for real tokens, 0 for padding.
    input_mask = tf.where(input_ids > 0,
                          tf.ones_like(input_ids), tf.zeros_like(input_ids))

    # 1) Embedding.
    # 1-1 Define the embedding layer.
    self.embeder = utils.EmbeddingLayer(
        vocab_size=self.params["vocab_size"],    # e.g. 50358
        emb_dim=self.params["hidden_size"],      # e.g. 768
        initializer=utils.create_initializer(
            self.params["initializer_range"]),   # truncated normal, range 0.02
        scale_emb=self.params["rescale_embedding"],      # False
        use_token_type=True,
        num_token_types=self.params["type_vocab_size"],  # e.g. 2
        use_position_embeddings=True,            # use position embeddings
        max_position_embeddings=self.params["max_position_embeddings"],  # e.g. 4096
        dropout_prob=self.params["hidden_dropout_prob"])  # 10% dropout
    # 1-2 Apply the embedding layer: token + token type + position.
    embedding_output = self.embeder.operation(
        input_ids,
        self.params["max_encoder_length"],
        token_type_ids=token_type_ids,
        training=training)

    # 2) Encoder.
    # 2-1 Define the encoder stack.
    self.encoder = encoder.EncoderStack(self.params)
    # 2-2 Run the encoder (sparse attention).
    self.sequence_output = self.encoder.operation(embedding_output,
                                                  input_mask, training)

    # 3) Pooling.
    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size]. This is necessary for segment-level
    # (or segment-pair-level) classification tasks where we need a fixed
    # dimensional representation of the segment.
    # Take the hidden state of the [CLS] token, e.g. shape (4, 768).
    first_token_tensor = self.sequence_output[:, 0, :]
    # We "pool" the model by simply taking the hidden state corresponding
    # to the first token. We assume that this has been pre-trained.
    # Dense projection back to hidden_size.
    self.pooler = tf.compat.v1.layers.Dense(
        units=self.params["hidden_size"],
        activation=tf.tanh,
        kernel_initializer=utils.create_initializer(
            self.params["initializer_range"]),
        name="pooler/dense")
    # Result -> e.g. (4, 768).
    self.pooled_output = self.pooler(first_token_tensor)
  super(BertModel, self).__init__(name=self.scope, _scope=vs)
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  # Define the BigBird model.
  model = modeling.BertModel(bert_config,
                             features["input_ids"],
                             training=is_training,
                             token_type_ids=features.get("segment_ids"))
  # Get the per-token (attention) features and the pooled [CLS] feature.
  sequence_output, pooled_output = model.get_output_feature()

  # Define the masked-LM head.
  masked_lm = MaskedLMLayer(
      bert_config["hidden_size"],
      bert_config["vocab_size"],
      model.embeder,
      input_tensor=sequence_output,
      label_ids=features.get("masked_lm_ids"),
      label_weights=features.get("masked_lm_weights"),
      masked_lm_positions=features.get("masked_lm_positions"),
      initializer=utils.create_initializer(bert_config["initializer_range"]),
      activation_fn=utils.get_activation(bert_config["hidden_act"]))
  masked_lm_loss, masked_lm_log_probs = masked_lm.get_mlm_loss()

  total_loss = masked_lm_loss

  tvars = tf.compat.v1.trainable_variables()
  utils.LogVariable(tvars, bert_config["ckpt_var_list"])

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Optimization: build the learning-rate schedule.
    opt_model = optimization.LinearWarmupLinearDecay(
        init_lr=bert_config["learning_rate"],
        num_train_steps=bert_config["num_train_steps"],
        num_warmup_steps=bert_config["num_warmup_steps"])
    learning_rate = opt_model.get_learning_rate()  # get the learning rate

    optimizer = optimization.Optimizer(bert_config, learning_rate)
    optimizer = optimizer.get_optimizer()

    global_step = tf.compat.v1.train.get_global_step()

    gradients = optimizer.compute_gradients(total_loss, tvars)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    logging_hook = [
        tf.compat.v1.train.LoggingTensorHook(
            {"loss is -> ": total_loss}, every_n_iter=256),
        tf.compat.v1.train.LoggingTensorHook(
            {"global step -> ": global_step}, every_n_iter=256),
        tf.compat.v1.train.LoggingTensorHook(
            {"learning rate -> ": learning_rate}, every_n_iter=256)
    ]

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        training_hooks=logging_hook,
        host_call=utils.add_scalars_to_summary(
            bert_config["output_dir"], {"learning_rate": learning_rate}))

  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(masked_lm_loss_value, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights):
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.compat.v1.metrics.mean(
          values=masked_lm_loss_value)

      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
      }

    eval_metrics = (metric_fn, [
        masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
        features["masked_lm_weights"]
    ])

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, eval_metrics=eval_metrics)

  else:
    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "log-probabilities": masked_lm_log_probs,
            "seq-embeddings": sequence_output
        })

  return output_spec