def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    logits = model(inputs, targets)

    # When in prediction mode, labels/targets are None. The model output
    # is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits
    # is not a problem when computing xentropy, because
    # padded_cross_entropy_loss resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in
    # the targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # Host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))

      # domyoung 2019.10.1: register each metric as a TensorBoard scalar and
      # write summaries every 20 steps via a SummarySaverHook instead of
      # calling record_scalars().
      for key, value in metric_dict.items():
        tf.summary.scalar(name=key, tensor=value)
        tf.logging.info(key)
      summary_hook = tf.train.SummarySaverHook(
          save_steps=20,
          output_dir=params["model_dir"],
          summary_op=tf.summary.merge_all())
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, train_op=train_op,
          training_hooks=[summary_hook])
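# For context, a minimal sketch (not part of this file) of how a model_fn like
# the one above is handed to a standard (non-TPU) Estimator; the helper name
# is hypothetical, and the params dict is whatever the surrounding training
# script populates.
def construct_estimator(model_dir, params):
  """Builds a tf.estimator.Estimator around model_fn."""
  return tf.estimator.Estimator(
      model_fn=model_fn, model_dir=model_dir, params=params)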
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    logits = model(inputs, targets)

    # When in prediction mode, the labels/targets is None. The model output
    # is the prediction
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits
    # is not a problem when computing xentropy, because
    # padded_cross_entropy_loss resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in
    # the targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))

      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
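# record_scalars() is defined elsewhere in this repo; a minimal sketch
# consistent with how it is called above would simply register every metric
# as a TensorBoard scalar:
def record_scalars(metric_dict):
  for key, value in metric_dict.items():
    tf.summary.scalar(name=key, tensor=value)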
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Create model and get output logits.
    train = (mode == tf.estimator.ModeKeys.TRAIN)
    #model = transformer.Transformer(params, train)
    #model = transformer2.Transformer(params, train)
    model = transformer3.Transformer(params, train)
    logits, latent_sample, prior_mu, prior_logvar, recog_mu, recog_logvar = (
        model(inputs, targets))

    # When in prediction mode, labels/targets are None. The model output
    # is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "translate": tf.estimator.export.PredictOutput(logits)
          })

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits
    # is not a problem when computing xentropy, because
    # padded_cross_entropy_loss resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in
    # the targets. Shapes:
    #   xentropy: [batch_size, max(length_logits, length_labels)]
    #   weights:  [batch_size, max(length_logits, length_labels)], 0 or 1
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])

    real_batch_size = tf.to_float(tf.shape(logits)[0])  # get batch_size
    if params["word_avg"]:
      # 1. First average over words within each sentence;
      # 2. then average over samples within the batch.
      predict_loss_avg_in_sentence = (
          tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1))
      predict_loss = (
          tf.reduce_sum(predict_loss_avg_in_sentence) / real_batch_size)
    else:
      predict_loss = tf.reduce_sum(xentropy) / real_batch_size

    if train:  # train mode
      # If gaussian_kld_v2 is used, 'logvar' means standard deviation instead.
      if params["use_std"]:
        kl_loss = gaussian_kld_v2(recog_mu, recog_logvar, prior_mu,
                                  prior_logvar)
      else:
        kl_loss = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
      kl_loss = tf.reduce_sum(kl_loss) / real_batch_size
      tf.identity(kl_loss, "kl_loss")

      # KL annealing.
      if params["kl_weight"] == 'sigmoid':
        # Sigmoid weight.
        scaled_x = (tf.to_float(tf.train.get_or_create_global_step()) /
                    params["full_kl_steps"] - 0.5) * 20.0
        kl_loss_weight = 1.0 / (1 + tf.exp(-scaled_x))
      elif params["kl_weight"] == 'linear':
        # Linear weight.
        kl_loss_weight = tf.minimum(
            tf.to_float(tf.train.get_or_create_global_step()) /
            params["full_kl_steps"], 1.0)
      else:
        kl_loss_weight = 1.0
      weighted_kl_loss = kl_loss * kl_loss_weight
      tf.identity(weighted_kl_loss, "weighted_kl_loss")
      tf.identity(kl_loss_weight, "kl_loss_weight")

      if params["use_bow"]:
        bow_loss = compute_bow_loss(latent_sample, targets, params, train)
        loss = predict_loss + weighted_kl_loss + bow_loss  # total loss
        tf.identity(bow_loss, "bow_loss")
      else:
        loss = predict_loss + weighted_kl_loss
    else:  # eval and infer modes
      loss = predict_loss

    # Save losses as named tensors that will be logged with the logging hook.
    tf.identity(predict_loss, "predict_loss")
    tf.identity(loss, "cross_entropy")  # total loss

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # Host call functions should only have tensors as arguments.
        # functools.partial() pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = functools.partial(metrics.get_eval_metrics, params=params)
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["predict_loss"] = predict_loss
      metric_dict["kl_loss"] = kl_loss
      if params["use_bow"]:
        metric_dict["bow_loss"] = bow_loss
      if params["kl_weight"]:
        metric_dict["weighted_kl_loss"] = weighted_kl_loss
        metric_dict["kl_loss_weight"] = kl_loss_weight
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))

      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the
  transformer_classifier model."""
  with tf.variable_scope("model"):
    inputs = features

    # Create model and get output logits.
    model = transformer.TransformerClassifier(params, mode)
    logits = model(inputs)

    # When in prediction mode, the model output is the prediction.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions=logits,
          export_outputs={
              "classify": tf.estimator.export.PredictOutput(logits)
          })

    # Calculate model loss.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    loss = tf.reduce_mean(xentropy)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # Host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))

      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
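# A minimal sketch (an assumption, not this repo's get_eval_metrics) of the
# kind of accuracy metric the eval branch above would return for a classifier
# with [batch, num_classes] logits and integer labels.
def classifier_accuracy(logits, labels):
  predictions = tf.argmax(logits, axis=-1)
  return {"accuracy": tf.metrics.accuracy(
      labels=tf.cast(labels, tf.int64), predictions=predictions)}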
def model_fn(features, labels, mode, params):
  """Defines how to train, evaluate and predict from the transformer model."""
  with tf.variable_scope("model"):
    inputs, targets = features, labels

    # Float placeholders so the named tensors below always exist, even when
    # concrete gates are disabled.
    concrete_loss = tf.constant(0.0)
    total_loss = tf.constant(0.0)
    concrete_reg = tf.constant(0.0)
    sparsity_rate = tf.constant(0.0)
    gate_values = tf.constant(0.0)

    # =================== For concrete gates ==================================
    print("**** concrete heads has this : {} ****".format(
        params["concrete_heads"]))
    if params["concrete_coef"] != 0:
      tf.get_default_graph().clear_collection("CONCRETE")
      tf.get_default_graph().clear_collection(
          tf.GraphKeys.REGULARIZATION_LOSSES)
    # ==========================================================================

    # Create model and get output logits.
    model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
    logits = model(inputs, targets)

    # When in prediction mode, labels/targets are None, and the model returns
    # a dict of decoded outputs and their scores rather than raw logits.
    if mode == tf.estimator.ModeKeys.PREDICT:
      if params["use_tpu"]:
        raise NotImplementedError("Prediction is not yet supported on TPUs.")
      return tf.estimator.EstimatorSpec(
          tf.estimator.ModeKeys.PREDICT,
          predictions={"outputs": logits["outputs"],
                       "scores": logits["scores"]})

    # Explicitly set the shape of the logits for XLA (TPU). This is needed
    # because the logits are passed back to the host VM CPU for metric
    # evaluation, and the shape of [?, ?, vocab_size] is too vague. However
    # it is known from Transformer that the first two dimensions of logits
    # are the dimensions of targets. Note that the ambiguous shape of logits
    # is not a problem when computing xentropy, because
    # padded_cross_entropy_loss resolves the shape on the TPU.
    logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

    # Calculate model loss.
    # xentropy contains the cross entropy loss of every nonpadding token in
    # the targets.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, targets, params["label_smoothing"], params["vocab_size"])
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

    # Save loss as named tensor that will be logged with the logging hook.
    tf.identity(loss, "cross_entropy")

    # ============ Loss for concrete gates =================
    if params["concrete_coef"] != 0:
      concrete_coef = params["concrete_coef"]
      sparsity_rate = tf.reduce_mean(tf.get_collection("CONCRETE"))
      concrete_reg = tf.reduce_mean(
          tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
      concrete_loss = concrete_coef * tf.reduce_mean(concrete_reg)
      total_loss = loss + concrete_loss
      gate_values = tf.get_collection("GATEVALUES")
      loss = total_loss
    # Named tensors for the logging hook (zeros when gates are disabled).
    tf.identity(concrete_loss, "concrete_loss")
    tf.identity(total_loss, "total_loss")
    tf.identity(concrete_reg, "concrete_reg")
    tf.identity(sparsity_rate, "sparsity_rate")
    tf.identity(gate_values, "gate_values")
    # =======================================================

    if mode == tf.estimator.ModeKeys.EVAL:
      if params["use_tpu"]:
        # Host call functions should only have tensors as arguments.
        # This lambda pre-populates params so that metric_fn is
        # TPUEstimator compliant.
        metric_fn = lambda logits, labels: (
            metrics.get_eval_metrics(logits, labels, params=params))
        eval_metrics = (metric_fn, [logits, labels])
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, predictions={"predictions": logits},
            eval_metrics=eval_metrics)
      return tf.estimator.EstimatorSpec(
          mode=mode, loss=loss, predictions={"predictions": logits},
          eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
    else:
      train_op, metric_dict = get_train_op_and_metrics(loss, params)

      # Epochs can be quite long. This gives some intermediate information
      # in TensorBoard.
      metric_dict["minibatch_loss"] = loss
      if params["use_tpu"]:
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=loss, train_op=train_op,
            host_call=tpu_util.construct_scalar_host_call(
                metric_dict=metric_dict, model_dir=params["model_dir"],
                prefix="training/"))

      record_scalars(metric_dict)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
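# The "CONCRETE", GraphKeys.REGULARIZATION_LOSSES and "GATEVALUES" collections
# read above are populated inside the modified transformer. A minimal sketch
# (an assumption, not this repo's implementation) of such a gate, in the
# hard-concrete style of Louizos et al. (2018): the sampled gate multiplies an
# attention head, and its expected open probability is the differentiable
# L0-style penalty that concrete_coef scales.
def concrete_gate(name, temperature=0.33, stretch=(-0.1, 1.1)):
  low, high = stretch
  log_alpha = tf.get_variable(
      name + "_log_alpha", shape=[], initializer=tf.zeros_initializer())
  # Sample a stretched, clipped concrete variable for the forward pass.
  u = tf.random_uniform([], minval=1e-6, maxval=1.0 - 1e-6)
  s = tf.sigmoid((tf.log(u) - tf.log(1.0 - u) + log_alpha) / temperature)
  gate = tf.clip_by_value(s * (high - low) + low, 0.0, 1.0)
  # P(gate != 0): the sparsity penalty read via the collections above.
  p_open = tf.sigmoid(log_alpha - temperature * tf.log(-low / high))
  tf.add_to_collection("CONCRETE", p_open)
  tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, p_open)
  tf.add_to_collection("GATEVALUES", gate)
  return gate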