def __init__(self, params): name = "encoder" super(EncoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": # layer norm type을 설졍 encoder_class = PrenormEncoderLayer elif params["norm_type"] == "postnorm": # 기본 postnorm encoder 사 encoder_class = PostnormEncoderLayer else: raise NotImplementedError( "Norm type {} is not implemented".format(params["norm_type"])) # Encoder layers self.encoder_layers = [ encoder_class( # pylint: disable=g-complex-comprehension self.params["attention_type"], # block_sparse attention type 설정 self.params["hidden_size"], # 768 self.params["intermediate_size"], # intermediate_size utils.get_activation(self.params["hidden_act"]), # gelu activation function self.params["attention_probs_dropout_prob"], # 0.1 self.params["hidden_dropout_prob"], # 0.1 self.params["initializer_range"], # 0.02 self.params["num_attention_heads"], # num_attention_heads self.params["num_rand_blocks"], # rand block : 3 self.params["block_size"], # 16 self.params["use_bias"], # True seed=layer_idx, name="layer_%d" % layer_idx) for layer_idx in range(self.params["num_hidden_layers"]) # 개 encoder 12개를 list에 담음 ] # Normalization layer self.layer_norm = utils.NormLayer()
def __init__(self, params): name = "encoder" super(EncoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": encoder_class = PrenormEncoderLayer elif params["norm_type"] == "postnorm": encoder_class = PostnormEncoderLayer else: raise NotImplementedError("Norm type {} is not implemented".format( params["norm_type"])) # Encoder layers self.encoder_layers = [ encoder_class( # pylint: disable=g-complex-comprehension self.params["attention_type"], self.params["hidden_size"], self.params["intermediate_size"], utils.get_activation(self.params["hidden_act"]), self.params["attention_probs_dropout_prob"], self.params["hidden_dropout_prob"], self.params["initializer_range"], self.params["num_attention_heads"], self.params["num_rand_blocks"], self.params["block_size"], self.params["use_bias"], seed=layer_idx, name="layer_%d" % layer_idx) for layer_idx in range(self.params["num_hidden_layers"]) ] # Normalization layer self.layer_norm = utils.NormLayer()
def __init__(self, params): if params["couple_encoder_decoder"]: name = "encoder" super(DecoderStack, self).__init__(name=name) else: name = "decoder" super(DecoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": decoder_class = PrenormDecoderLayer elif params["norm_type"] == "postnorm": decoder_class = PostnormDecoderLayer else: raise NotImplementedError("Norm type {} is not implemented".format( params["norm_type"])) if params["use_gradient_checkpointing"]: decoder_class = add_gradient_recomputation(decoder_class) if self.params.get("num_decoder_layers", None) is not None: num_hidden_layers = self.params["num_decoder_layers"] else: num_hidden_layers = self.params["num_hidden_layers"] with tf.compat.v1.variable_scope(name): # Decoder layers self.decoder_layers = [ decoder_class( # pylint: disable=g-complex-comprehension self.params["hidden_size"], self.params["intermediate_size"], utils.get_activation(self.params["hidden_act"]), self.params["attention_probs_dropout_prob"], self.params["hidden_dropout_prob"], self.params["initializer_range"], self.params["num_attention_heads"], self.params["use_bias"], name="layer_%d" % layer_idx) for layer_idx in range(num_hidden_layers) ] # Normalization layer self.layer_norm = utils.NormLayer(self.params["hidden_size"])
def __init__(self, params): if params["couple_encoder_decoder"]: # encoder 같다 name = "encoder" with tf.compat.v1.variable_scope( name, reuse=tf.compat.v1.AUTO_REUSE) as scope: super(DecoderStack, self).__init__(name=name, _scope=scope) else: name = "decoder" super(DecoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": # decoder prenorm 선택 decoder_class = PrenormDecoderLayer elif params["norm_type"] == "postnorm": decoder_class = PostnormDecoderLayer else: raise NotImplementedError( "Norm type {} is not implemented".format(params["norm_type"])) if self.params.get("num_decoder_layers", None) is not None: # decoder number layer 를 설정했다 num_hidden_layers = self.params["num_decoder_layers"] else: num_hidden_layers = self.params["num_hidden_layers"]# 하지 않았다면 기존 number layer 사 # Decoder layers self.decoder_layers = [ decoder_class( # pylint: disable=g-complex-comprehension self.params["hidden_size"], self.params["intermediate_size"], utils.get_activation(self.params["hidden_act"]), self.params["attention_probs_dropout_prob"], self.params["hidden_dropout_prob"], self.params["initializer_range"], self.params["num_attention_heads"], self.params["use_bias"], name="layer_%d" % layer_idx) for layer_idx in range(num_hidden_layers) ] # Normalization layer self.layer_norm = utils.NormLayer()
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  model = modeling.BertModel(bert_config)
  masked_lm = MaskedLMLayer(
      bert_config["hidden_size"], bert_config["vocab_size"], model.embeder,
      initializer=utils.create_initializer(bert_config["initializer_range"]),
      activation_fn=utils.get_activation(bert_config["hidden_act"]))
  next_sentence = NSPLayer(
      bert_config["hidden_size"],
      initializer=utils.create_initializer(bert_config["initializer_range"]))

  sequence_output, pooled_output = model(
      features["input_ids"],
      training=is_training,
      token_type_ids=features.get("segment_ids"))

  masked_lm_loss, masked_lm_log_probs = masked_lm(
      sequence_output,
      label_ids=features.get("masked_lm_ids"),
      label_weights=features.get("masked_lm_weights"),
      masked_lm_positions=features.get("masked_lm_positions"))

  next_sentence_loss, next_sentence_log_probs = next_sentence(
      pooled_output, features.get("next_sentence_labels"))

  total_loss = masked_lm_loss
  if bert_config["use_nsp"]:
    total_loss += next_sentence_loss

  tvars = tf.compat.v1.trainable_variables()
  utils.log_variables(tvars, bert_config["ckpt_var_list"])

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    learning_rate = optimization.get_linear_warmup_linear_decay_lr(
        init_lr=bert_config["learning_rate"],
        num_train_steps=bert_config["num_train_steps"],
        num_warmup_steps=bert_config["num_warmup_steps"])

    optimizer = optimization.get_optimizer(bert_config, learning_rate)

    global_step = tf.compat.v1.train.get_global_step()

    gradients = optimizer.compute_gradients(total_loss, tvars)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        host_call=utils.add_scalars_to_summary(
            bert_config["output_dir"], {"learning_rate": learning_rate}))

  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(masked_lm_loss_value, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights, next_sentence_loss_value,
                  next_sentence_log_probs, next_sentence_labels):
      """Computes the loss and accuracy of the model."""
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.compat.v1.metrics.mean(
          values=masked_lm_loss_value)
      next_sentence_predictions = tf.argmax(
          next_sentence_log_probs, axis=-1, output_type=tf.int32)
      next_sentence_accuracy = tf.compat.v1.metrics.accuracy(
          labels=next_sentence_labels,
          predictions=next_sentence_predictions)
      next_sentence_mean_loss = tf.compat.v1.metrics.mean(
          values=next_sentence_loss_value)
      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
          "next_sentence_accuracy": next_sentence_accuracy,
          "next_sentence_loss": next_sentence_mean_loss,
      }

    eval_metrics = (metric_fn, [
        masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
        features["masked_lm_weights"], next_sentence_loss,
        next_sentence_log_probs, features["next_sentence_labels"]
    ])

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, eval_metrics=eval_metrics)
  else:
    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "log-probabilities": masked_lm_log_probs,
            "seq-embeddings": sequence_output
        })

  return output_spec
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
  """The `model_fn` for TPUEstimator."""
  is_training = (mode == tf.estimator.ModeKeys.TRAIN)

  # Build the BigBird model.
  model = modeling.BertModel(
      bert_config,
      features["input_ids"],
      training=is_training,
      token_type_ids=features.get("segment_ids"))
  # Fetch the attention features and the pooled feature for the CLS token.
  sequence_output, pooled_output = model.get_output_feature()

  # Layer that computes the masked-language-model output.
  masked_lm = MaskedLMLayer(
      bert_config["hidden_size"],
      bert_config["vocab_size"],
      model.embeder,
      input_tensor=sequence_output,
      label_ids=features.get("masked_lm_ids"),
      label_weights=features.get("masked_lm_weights"),
      masked_lm_positions=features.get("masked_lm_positions"),
      initializer=utils.create_initializer(bert_config["initializer_range"]),
      activation_fn=utils.get_activation(bert_config["hidden_act"]))
  masked_lm_loss, masked_lm_log_probs = masked_lm.get_mlm_loss()

  total_loss = masked_lm_loss

  tvars = tf.compat.v1.trainable_variables()
  utils.LogVariable(tvars, bert_config["ckpt_var_list"])

  output_spec = None
  if mode == tf.estimator.ModeKeys.TRAIN:
    # Build the optimizer: linear warmup followed by linear decay.
    opt_model = optimization.LinearWarmupLinearDecay(
        init_lr=bert_config["learning_rate"],
        num_train_steps=bert_config["num_train_steps"],
        num_warmup_steps=bert_config["num_warmup_steps"])
    learning_rate = opt_model.get_learning_rate()  # fetch the learning rate

    optimizer = optimization.Optimizer(bert_config, learning_rate)
    optimizer = optimizer.get_optimizer()

    global_step = tf.compat.v1.train.get_global_step()

    gradients = optimizer.compute_gradients(total_loss, tvars)
    train_op = optimizer.apply_gradients(gradients, global_step=global_step)

    logging_hook = [
        tf.compat.v1.train.LoggingTensorHook(
            {"loss is -> ": total_loss}, every_n_iter=256),
        tf.compat.v1.train.LoggingTensorHook(
            {"global step -> ": global_step}, every_n_iter=256),
        tf.compat.v1.train.LoggingTensorHook(
            {"learning rate -> ": learning_rate}, every_n_iter=256)
    ]

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        training_hooks=logging_hook,
        host_call=utils.add_scalars_to_summary(
            bert_config["output_dir"], {"learning_rate": learning_rate}))

  elif mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(masked_lm_loss_value, masked_lm_log_probs, masked_lm_ids,
                  masked_lm_weights):
      masked_lm_predictions = tf.argmax(
          masked_lm_log_probs, axis=-1, output_type=tf.int32)
      masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
          labels=masked_lm_ids,
          predictions=masked_lm_predictions,
          weights=masked_lm_weights)
      masked_lm_mean_loss = tf.compat.v1.metrics.mean(
          values=masked_lm_loss_value)
      return {
          "masked_lm_accuracy": masked_lm_accuracy,
          "masked_lm_loss": masked_lm_mean_loss,
      }

    eval_metrics = (metric_fn, [
        masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
        features["masked_lm_weights"]
    ])

    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode, loss=total_loss, eval_metrics=eval_metrics)
  else:
    output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        predictions={
            "log-probabilities": masked_lm_log_probs,
            "seq-embeddings": sequence_output
        })

  return output_spec
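For context, this is roughly how such a model_fn is handed to TPUEstimator; the RunConfig values, batch sizes, and the commented-out train_input_fn below are placeholders, not taken from the source.

import tensorflow as tf

# Hypothetical wiring; checkpoint cadence and batch sizes are placeholders.
run_config = tf.compat.v1.estimator.tpu.RunConfig(
    model_dir=bert_config["output_dir"],
    save_checkpoints_steps=1000,  # assumption
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(iterations_per_loop=1000))

estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=False,        # falls back to CPU/GPU when no TPU is available
    train_batch_size=32,  # assumption
    eval_batch_size=32)   # assumption

# estimator.train(input_fn=train_input_fn,  # train_input_fn: hypothetical
#                 max_steps=bert_config["num_train_steps"])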