def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    with tf.variable_scope("model"):
        inputs, targets = features, labels

        # Create model and get output logits.
        model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
        output = model(inputs, targets)

        # When in prediction mode, the labels/targets are None and the model
        # output is the prediction.
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT, predictions=output)

        logits = output

        # Calculate model loss.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, targets, params.label_smoothing, params.vocab_size)
        loss = tf.reduce_sum(xentropy * weights) / tf.reduce_sum(weights)

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
        else:
            train_op = get_train_op(loss, params)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def build_no_teacher_discriminator(self, origin_inputs, gen_target, real_loss,
                                   margin=1.0):
    fake_attention_bias = model_utils.get_padding_bias(
        gen_target)  # [batch, 1, 1, src_len]
    fake_encoder_outputs = self.encode(
        gen_target, fake_attention_bias)  # [batch, src_len, hidden_size]
    _, fake_logits = self.argmax_predict(fake_encoder_outputs, fake_attention_bias)
    fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
        fake_logits, origin_inputs, self.params.label_smoothing,
        self.params.target_vocab_size)  # [batch, origin_length]
    # Per-example loss, averaged over non-padding tokens.
    fake_loss = tf.reduce_sum(fake_xentropy, axis=1) / tf.reduce_sum(
        fake_weights, axis=1)  # [batch]

    tf.identity(fake_loss[:5], "fake_loss")
    mean_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
    tf.summary.scalar("mean_fake_loss", mean_fake_loss)

    rewards = 1 / tf.maximum(margin, fake_loss / (real_loss + 1e-12) - 1)  # [batch]
    tf.identity(rewards[:5], "rewards")
    mean_rewards = tf.reduce_mean(rewards, name="mean_rewards")
    tf.summary.scalar("mean_rewards", mean_rewards)
    return rewards
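# Worked example of the reward formula above, with illustrative numbers (not
# from the source). With margin = 1.0 and a per-example real_loss of 2.0:
#
#   fake_loss = 2.5 -> fake_loss / real_loss - 1 = 0.25 -> max(1.0, 0.25) = 1.0 -> reward = 1.0
#   fake_loss = 6.0 -> fake_loss / real_loss - 1 = 2.00 -> max(1.0, 2.00) = 2.0 -> reward = 0.5
#
# Generated samples whose discriminator loss stays close to the real-data loss
# receive the capped maximum reward 1 / margin; samples the discriminator finds
# much harder to reconstruct are down-weighted toward zero.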
def evaluation(model, input_fn):
    tf.logging.info("!!!Build graph for evaluation!!!")
    logits = model.build_pretrain(input_fn.source, input_fn.target)
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, input_fn.target, params.label_smoothing,
        params.target_vocab_size)
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    return loss, logits, input_fn.target
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    with tf.variable_scope("model"):
        inputs, targets = features, labels

        # Create model and get output logits.
        model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
        logits = model(inputs, targets)

        # When in prediction mode, the labels/targets are None and the model
        # output is the prediction.
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs={
                    "translate": tf.estimator.export.PredictOutput(logits)
                })

        # Explicitly set the shape of the logits for XLA (TPU). This is needed
        # because the logits are passed back to the host VM CPU for metric
        # evaluation, and the shape of [?, ?, vocab_size] is too vague. However,
        # it is known from Transformer that the first two dimensions of logits
        # are the dimensions of targets. Note that the ambiguous shape of logits
        # is not a problem when computing xentropy, because
        # padded_cross_entropy_loss resolves the shape on the TPU.
        logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

        # Calculate model loss. xentropy contains the cross entropy loss of
        # every nonpadding token in the targets.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, targets, params["label_smoothing"], params["vocab_size"])
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

        # Save loss as named tensor that will be logged with the logging hook.
        tf.identity(loss, "cross_entropy")

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
        else:
            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            metric_dict["minibatch_loss"] = loss
            record_scalars(metric_dict)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
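# A minimal sketch of wiring the model_fn above into an Estimator. The
# hyperparameter values, model_dir, and train_input_fn below are hypothetical;
# a real params dict carries many more Transformer settings (hidden_size,
# number of layers, learning-rate schedule, and so on).
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir="/tmp/transformer",
    params={"label_smoothing": 0.1, "vocab_size": 33708})
estimator.train(input_fn=train_input_fn, max_steps=100000)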
def evaluation(model, input_fn):
    tf.logging.info("!!!Build graph for evaluation!!!")
    # model = transformer_5.Transformer(params, is_train=True)
    # predictions = model.build_pretrain(input_fn.source, targets=None)
    logits = model.build_pretrain(input_fn.source, input_fn.target)
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, input_fn.target, params.label_smoothing,
        params.target_vocab_size)
    loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    # return predictions, input_fn.target
    return loss, logits, input_fn.target
def get_loss(self, origin_inputs, targets):
    with tf.variable_scope("Discriminator", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        attention_bias = model_utils.get_padding_bias(targets)  # [batch, 1, 1, src_len]
        encoder_outputs = self.encode(targets, attention_bias)  # [batch, src_len, hidden_size]
        logits = self.decode(origin_inputs, encoder_outputs, attention_bias)
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, origin_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)  # [batch, origin_length]
        self.loss = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)  # [batch]
        # prediction = self.argmax_predict(encoder_outputs, attention_bias)  # [batch, max_len]
        return tf.reshape(self.loss, (-1, 1))  # [batch, 1]
def get_loss(self, gen_targets, real_inputs):
    with tf.variable_scope(self.name_scope, initializer=self.initializer,
                           reuse=tf.AUTO_REUSE):
        attention_bias = model_utils.get_padding_bias(gen_targets)
        encoder_outputs = self.encode(gen_targets, attention_bias)
        logits = self.decode(real_inputs, encoder_outputs, attention_bias)
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, real_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)
        loss = tf.reduce_sum(xentropy, axis=1) / tf.reduce_sum(weights, axis=1)  # [batch]
        return tf.reshape(loss, (-1, 1))  # [batch, 1]
def get_teach_real_loss(self, origin_inputs, origin_target):
    real_logits = self.build_pretrain(
        inputs=origin_target, targets=origin_inputs)  # [batch, tgt_len, vocab_size]
    real_xentropy, real_weights = metrics.padded_cross_entropy_loss(
        real_logits, origin_inputs, self.params.label_smoothing,
        self.params.target_vocab_size)
    real_loss = tf.reduce_sum(real_xentropy, axis=1) / tf.reduce_sum(
        real_weights, axis=1)  # [batch]
    tf.identity(real_loss[:5], "real_loss")
    mean_real_loss = tf.reduce_mean(real_loss, name="mean_real_loss")
    tf.summary.scalar("mean_real_loss", mean_real_loss)
    return real_loss
def train_step(batch_data):
    src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
    with tf.GradientTape() as tape:
        logits = model(batch_data, training=True)
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, tgt_output_ids, config.label_smoothing,
            vocab_size=tgt_vocab_size)
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    variables = model.Encoder.trainable_variables + model.Decoder.trainable_variables
    gradients = tape.gradient(target=loss, sources=variables)
    grads_and_vars = zip(gradients, variables)
    optimizer.apply_gradients(grads_and_vars)
    return loss
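# A minimal sketch of driving train_step above from a tf.data pipeline, in the
# same eager style as the eval() function further down. `train_dataset` and
# `config.num_epochs` are hypothetical names, not from the source.
for epoch in range(config.num_epochs):
    for batch_num, batch_data in enumerate(train_dataset):
        batch_loss = train_step(batch_data)
        if batch_num % 100 == 0:
            print("epoch {}, batch {}: loss = {:.4f}".format(
                epoch, batch_num, batch_loss.numpy()))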
def gan_tower_loss(scope, model, input_fn):
    """Calculates the total loss on a single tower running the train model.

    :param scope: name scope of the current tower.
    :param model: the generator model.
    :param input_fn: iterator providing `source` and `target` batches.
    :return: the total cross-entropy loss, the total generator loss, and the
        rewards baseline.
    """
    # Build inference Graph.
    logits = model.build_pretrain(input_fn.source, input_fn.target)

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, input_fn.target, params.label_smoothing,
        params.target_vocab_size)
    cross_entropy_mean = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    tf.add_to_collection("losses", cross_entropy_mean)

    losses = tf.get_collection('losses', scope)
    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    gen_samples = model.build_generator(input_fn.source)
    given_num, rewards_mb = model.get_one_reward_baseline(
        origin_inputs=input_fn.source,
        gen_targets=gen_samples,
        roll_num=flags_obj.roll_num)
    g_loss = model.get_one_g_loss(gen_targets=gen_samples,
                                  given_num=given_num,
                                  rewards=rewards_mb)
    tf.add_to_collection("g_losses", g_loss)
    g_losses = tf.get_collection("g_losses", scope)
    total_g_loss = tf.add_n(g_losses, name="total_g_loss")

    # Attach a scalar summary to all individual losses and the total loss; do
    # the same for the averaged version of the losses.
    for l in losses + [total_loss] + g_losses + [total_g_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
        # training session. This helps the clarity of presentation on
        # TensorBoard.
        loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss, total_g_loss, rewards_mb
def get_real_loss(self, origin_inputs, origin_target):
    with tf.variable_scope("Discriminator", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        real_attention_bias = model_utils.get_padding_bias(
            origin_target)  # [batch, 1, 1, src_len]
        real_encoder_outputs = self.encode(
            origin_target, real_attention_bias)  # [batch, src_len, hidden_size]
        real_logits = self.decode(origin_inputs, real_encoder_outputs,
                                  real_attention_bias)
        real_xentropy, real_weights = metrics.padded_cross_entropy_loss(
            real_logits, origin_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)
        # Batch-mean loss (scalar): reduce_sum here has no axis argument.
        self.real_loss = tf.reduce_sum(real_xentropy) / tf.reduce_sum(real_weights)
        return self.real_loss
def get_fake_loss(self, origin_inputs, gen_targets):
    # Each row of gen_targets ends at its first 0 (padding) token; argmin finds
    # that position, so inputs_length is the per-row length including the
    # terminating token.
    inputs_length = tf.argmin(gen_targets, axis=-1) + 1
    max_len = inputs_length[tf.argmax(inputs_length)]
    batch_size = tf.shape(gen_targets)[0]
    pad_gen_targets = tf.zeros([0, max_len], dtype=tf.int32)

    # Re-pad every row to the longest real length in the batch, dropping any
    # tokens the generator produced after its first padding token.
    def inner_loop(i, pad_inputs):
        ori_length = inputs_length[i]
        ori_input = tf.reshape(gen_targets[i][:ori_length], [1, -1])
        pad_input = tf.pad(ori_input, [[0, 0], [0, max_len - ori_length]])
        pad_inputs = tf.concat([pad_inputs, pad_input], axis=0)
        return i + 1, pad_inputs

    _, pad_gen_targets = tf.while_loop(
        cond=lambda i, _: i < batch_size,
        body=inner_loop,
        loop_vars=[tf.constant(0), pad_gen_targets],
        shape_invariants=[tf.TensorShape([]), tf.TensorShape([None, None])])
    gen_targets = pad_gen_targets

    with tf.variable_scope("Discriminator", initializer=self._initializer,
                           reuse=tf.AUTO_REUSE):
        fake_attention_bias = model_utils.get_padding_bias(
            gen_targets)  # [batch, 1, 1, src_len]
        fake_encoder_outputs = self.encode(
            gen_targets, fake_attention_bias)  # [batch, src_len, hidden_size]
        fake_logits = self.decode(origin_inputs, fake_encoder_outputs,
                                  fake_attention_bias)
        fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
            fake_logits, origin_inputs, self.params.label_smoothing,
            self.params.target_vocab_size)  # [batch, origin_length]
        # Batch-mean loss (scalar).
        self.fake_loss = tf.reduce_sum(fake_xentropy) / tf.reduce_sum(fake_weights)
        # fake_prediction = self.argmax_predict(fake_encoder_outputs, fake_attention_bias)  # [batch, max_len]
        return self.fake_loss
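# A vectorized alternative to the re-padding while_loop above (a sketch, not
# the author's code): zero out everything after each row's first padding token
# with tf.sequence_mask. Unlike the loop it keeps the original width, but the
# extra columns are all padding, which get_padding_bias masks anyway.
def trim_after_first_pad(gen_targets):
    # Position of the first 0 in each row; +1 keeps that terminating token.
    lengths = tf.argmin(gen_targets, axis=-1, output_type=tf.int32) + 1
    mask = tf.sequence_mask(lengths, maxlen=tf.shape(gen_targets)[1],
                            dtype=gen_targets.dtype)
    return gen_targets * mask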
def build_teach_force_discriminator(self, origin_inputs, gen_target, real_loss,
                                    margin=1.0):
    fake_logits = self.build_pretrain(
        inputs=gen_target, targets=origin_inputs)  # [batch, tgt_length, vocab_size]
    fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
        fake_logits, origin_inputs, self.params.label_smoothing,
        self.params.target_vocab_size)  # [batch, origin_length]
    fake_loss = tf.reduce_sum(fake_xentropy, axis=1) / tf.reduce_sum(
        fake_weights, axis=1)  # [batch]
    tf.identity(fake_loss[:5], "fake_loss")
    mean_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
    tf.summary.scalar("mean_fake_loss", mean_fake_loss)

    # Same reward shaping as build_no_teacher_discriminator above.
    rewards = 1 / tf.maximum(margin, fake_loss / (real_loss + 1e-12) - 1)  # [batch]
    tf.identity(rewards[:5], "rewards")
    mean_rewards = tf.reduce_mean(rewards, name="mean_rewards")
    tf.summary.scalar("mean_rewards", mean_rewards)
    return rewards
def eval():
    """Internal evaluation."""
    dev_dataset = dataset.get_train_dataset(src_file=config.eval_src_file,
                                            tgt_file=config.eval_tgt_file,
                                            tgt_vocab_table=tgt_vocab_table,
                                            batch_size=config.batch_size)
    total_cnt, total_loss, total_bleu = 0.0, 0.0, 0.0
    for batch_num, batch_data in enumerate(dev_dataset.take(config.debug_num)):
        src_inputs, tgt_input_ids, tgt_output_ids, src_path, src_len, tgt_len = batch_data
        # training=True keeps the teacher-forced decoding path here, so the
        # logits stay aligned with tgt_output_ids for loss and BLEU computation.
        logits = model(batch_data, training=True)
        bs = logits.shape[0]
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, tgt_output_ids, config.label_smoothing,
            vocab_size=tgt_vocab_size)
        batch_loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
        batch_bleu = metrics.bleu_score(logits=logits, labels=tgt_output_ids)
        # Weight the running totals by batch size.
        total_cnt += bs
        total_loss += bs * batch_loss
        total_bleu += bs * batch_bleu
    eval_loss = total_loss / total_cnt
    eval_bleu = total_bleu / total_cnt
    return eval_bleu, eval_loss
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    if params.frozen_graph and mode == tf.estimator.ModeKeys.PREDICT:
        print("Reading frozen graph from pb file", flush=True)
        input_map = {'input_tokens': features}
        output_names = [
            'model/Transformer/strided_slice_15',
            'model/Transformer/strided_slice_16'
        ]
        with tf.io.gfile.GFile(params.frozen_graph, "rb") as f:
            graph_def = tf.compat.v1.GraphDef()
            graph_def.ParseFromString(f.read())
        tf.graph_util.import_graph_def(graph_def, input_map, output_names, name="")
        output_tensors = [
            tf.compat.v1.get_default_graph().get_tensor_by_name(name + ":0")
            for name in output_names
        ]
        output = {'outputs': output_tensors[0], 'scores': output_tensors[1]}
        return tf.estimator.EstimatorSpec(tf.estimator.ModeKeys.PREDICT,
                                          predictions=output)
    else:
        with tf.compat.v1.variable_scope("model"):
            inputs, targets = features, labels

            # Create model and get output logits.
            model = transformer.Transformer(params,
                                            mode == tf.estimator.ModeKeys.TRAIN)
            output = model(inputs, targets)

            # When in prediction mode, the labels/targets are None and the
            # model output is the prediction.
            if mode == tf.estimator.ModeKeys.PREDICT:
                return tf.estimator.EstimatorSpec(
                    tf.estimator.ModeKeys.PREDICT, predictions=output)

            logits = output

            # Calculate model loss.
            xentropy, weights = metrics.padded_cross_entropy_loss(
                logits, targets, params.label_smoothing, params.vocab_size)
            loss = (tf.reduce_sum(input_tensor=xentropy * weights) /
                    tf.reduce_sum(input_tensor=weights))

            if mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.EstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
            else:
                train_op = get_train_op(loss, params)
                logging_hook = tf.compat.v1.train.LoggingTensorHook(
                    {"loss": loss}, every_n_iter=FLAGS.print_iter)
                return tf.estimator.EstimatorSpec(mode=mode,
                                                  loss=loss,
                                                  train_op=train_op,
                                                  training_hooks=[logging_hook])
def build_discriminator(self, origin_inputs, gen_target, margin, real_loss,
                        given_num=None, discount_factor=0.95):
    fake_logits = self.build_pretrain(
        inputs=gen_target, targets=origin_inputs)  # [batch, tgt_length, vocab_size]
    fake_xentropy, fake_weights = metrics.padded_cross_entropy_loss(
        fake_logits, origin_inputs, self.params.label_smoothing,
        self.params.target_vocab_size)  # [batch, origin_length]

    # An earlier (commented-out) version of this function discounted the
    # per-token loss after position `given_num` by
    # discount_factor ** (i - given_num), using a pair of tf.while_loops over
    # TensorArrays. That path is disabled, which leaves `given_num` and
    # `discount_factor` unused here.
    fake_loss = tf.reduce_sum(fake_xentropy, axis=1) / tf.reduce_sum(
        fake_weights, axis=1)
    tf.identity(fake_loss[:5], "fake_loss")
    mean_fake_loss = tf.reduce_mean(fake_loss, name="mean_fake_loss")
    tf.summary.scalar("mean_fake_loss", mean_fake_loss)

    # Note: the margin is hard-coded to 0.2 here; the `margin` argument is not
    # used.
    rewards = 1 / tf.maximum(0.2, fake_loss / (real_loss + 1e-12) - 1)  # [batch]
    tf.identity(rewards[:5], "rewards")
    mean_rewards = tf.reduce_mean(rewards, name="mean_rewards")
    tf.summary.scalar("mean_rewards", mean_rewards)
    return rewards
def model_fn(features, labels, mode: tf.estimator.ModeKeys, params: dict):
    """
    :param features: encode_inputs = features['encode_feature_name']
    :param labels: target token ids.
    :param mode: estimator mode key.
    :param params: hyperparameter dict.
    :return: a tf.estimator.EstimatorSpec for the given mode.
    """
    with tf.variable_scope('model'):
        inputs = features
        transformer = Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)
        logits = transformer(inputs, labels)

        # When in prediction mode, the labels and decode_inputs are None; the
        # model output is the prediction, a dict
        # {"outputs": top_decoded_ids, "scores": top_scores}.
        if mode == tf.estimator.ModeKeys.PREDICT:
            estimator = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=logits,
                export_outputs={
                    'translate': tf.estimator.export.PredictOutput(logits)
                })
            return estimator

        logits.set_shape(labels.shape.as_list() + logits.shape.as_list()[2:])

        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits=logits,
            labels=labels,
            smoothing=params.get('label_smoothing'),
            vocab_size=params.get('vocab_size'))
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
        tf.identity(loss, 'cross_entropy')

        if mode == tf.estimator.ModeKeys.EVAL:
            estimator = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={'predictions': logits},
                eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
            return estimator

        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op, metrics_dict = model_utils.get_train_op_and_metrics(loss, params)
            metrics_dict['mini_batch_loss'] = loss
            model_utils.record_scalars(metrics_dict)
            estimator = tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                                   train_op=train_op)
            return estimator
def get_loss(logits, labels, scope_name_1, scope_name_2):
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, labels, params.label_smoothing, params.target_vocab_size)
    cross_entropy_mean = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    tf.add_to_collection(scope_name_1, cross_entropy_mean)
    return tf.add_n(tf.get_collection(scope_name_1), name=scope_name_2)
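# Hypothetical usage of the helper above: route two different losses into
# separate graph collections and read back the accumulated totals. The
# gen_logits/dis_logits names are placeholders, not from the source.
total_g_loss = get_loss(gen_logits, targets, "g_losses", "total_g_loss")
total_d_loss = get_loss(dis_logits, targets, "d_losses", "total_d_loss")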
def model_fn(features, labels, mode, params):
    """Defines how to train, evaluate and predict from the transformer model."""
    with tf.variable_scope("model"):
        inputs, targets = features, labels

        # Create model and get output logits.
        model = transformer.Transformer(params, mode == tf.estimator.ModeKeys.TRAIN)

        # In prediction mode the call returns a dictionary:
        #   output: [batch_size, decoded length]
        #   score: [batch_size, float]
        # Otherwise it returns a float32 tensor with shape
        # [batch_size, target_length, vocab_size].
        logits = model(inputs, targets)

        # When in prediction mode, the labels/targets are None and the model
        # output is the prediction.
        if mode == tf.estimator.ModeKeys.PREDICT:
            if params["use_tpu"]:
                raise NotImplementedError("Prediction is not yet supported on TPUs.")
            return tf.estimator.EstimatorSpec(
                tf.estimator.ModeKeys.PREDICT,
                predictions=logits,
                export_outputs={
                    "translate": tf.estimator.export.PredictOutput(logits)
                })

        # Explicitly set the shape of the logits for XLA (TPU). This is needed
        # because the logits are passed back to the host VM CPU for metric
        # evaluation, and the shape of [?, ?, vocab_size] is too vague. However,
        # it is known from Transformer that the first two dimensions of logits
        # are the dimensions of targets. Note that the ambiguous shape of logits
        # is not a problem when computing xentropy, because
        # padded_cross_entropy_loss resolves the shape on the TPU.
        logits.set_shape(targets.shape.as_list() + logits.shape.as_list()[2:])

        # Calculate model loss. xentropy contains the cross entropy loss of
        # every nonpadding token in the targets. During training, the loss
        # weight of positions whose label is 0 (i.e. <PAD>) is set to 0.
        xentropy, weights = metrics.padded_cross_entropy_loss(
            logits, targets, params["label_smoothing"], params["vocab_size"])
        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

        # Save loss as named tensor that will be logged with the logging hook.
        tf.identity(loss, "cross_entropy")

        if mode == tf.estimator.ModeKeys.EVAL:
            if params["use_tpu"]:
                # Host call functions should only have tensors as arguments.
                # This closure pre-populates params so that metric_fn is
                # TPUEstimator compliant.
                def metric_fn(logits, labels):
                    return metrics.get_eval_metrics(logits, labels, params=params)

                eval_metrics = (metric_fn, [logits, labels])
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    predictions={"predictions": logits},
                    eval_metrics=eval_metrics)
            return tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                predictions={"predictions": logits},
                eval_metric_ops=metrics.get_eval_metrics(logits, labels, params))
        else:
            train_op, metric_dict = get_train_op_and_metrics(loss, params)

            # Epochs can be quite long. This gives some intermediate information
            # in TensorBoard.
            metric_dict["minibatch_loss"] = loss
            if params["use_tpu"]:
                return tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=loss,
                    train_op=train_op,
                    host_call=tpu_util.construct_scalar_host_call(
                        metric_dict=metric_dict,
                        model_dir=params["model_dir"],
                        prefix="training/"))
            record_scalars(metric_dict)
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
def train(params):
    with tf.Graph().as_default():
        latest_ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
        if latest_ckpt:
            # Resume the global step from the checkpoint filename suffix.
            global_step_value = int(latest_ckpt.split("-")[-1])
            global_step = tf.Variable(initial_value=global_step_value,
                                      dtype=tf.int32, trainable=False)
            print("Restoring global step:", global_step_value)
        else:
            global_step_value = 0
            global_step = tf.get_variable(
                'global_step', [],
                initializer=tf.constant_initializer(0), trainable=False)

        learning_rate = get_learning_rate(params.learning_rate,
                                          params.hidden_size,
                                          params.learning_rate_warmup_steps,
                                          global_step)
        optimizer = tf.contrib.opt.LazyAdamOptimizer(
            learning_rate,
            beta1=params.optimizer_adam_beta1,
            beta2=params.optimizer_adam_beta2,
            epsilon=params.optimizer_adam_epsilon)

        my_dataset = dataset.Dataset(params)
        train_iterator = my_dataset.train_input_fn(params)
        valid_iterator = my_dataset.eval_input_fn(params)

        tower_grads = []
        g_model = transformer_9.Transformer(params, is_train=True, mode=None,
                                            scope="Transformer")
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            for i in xrange(flags_obj.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                        tf.logging.info("Build graph on gpu:{}".format(i))
                        logits = g_model.inference(train_iterator.source,
                                                   train_iterator.target)
                        xentropy, weights = metrics.padded_cross_entropy_loss(
                            logits, train_iterator.target,
                            params.label_smoothing, params.target_vocab_size)
                        loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                        grads = optimizer.compute_gradients(loss)
                        tf.logging.info(
                            "total trainable variables number: {}".format(len(grads)))
                        tower_grads.append(grads)

                        if i == 0 and valid_iterator:
                            valid_pred = g_model.inference(
                                inputs=valid_iterator.source, targets=None)["outputs"]
                            valid_tgt = valid_iterator.target
                            valid_src = valid_iterator.source

        if len(tower_grads) > 1:
            grads = average_gradients(tower_grads)
        else:
            grads = tower_grads[0]

        summaries.append(tf.summary.scalar('learning_rate', learning_rate))
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))
        train_op = apply_gradient_op

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=20)
        init = tf.global_variables_initializer()
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True

        with tf.Session(config=sess_config) as sess:
            sess.run(init)
            sess.run(tf.local_variables_initializer())
            sess.run(train_iterator.initializer)
            ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
            tf.logging.info("ckpt {}".format(ckpt))
            if ckpt and tf.train.checkpoint_exists(ckpt):
                tf.logging.info("Reloading model parameters from {}".format(ckpt))
                saver.restore(sess, ckpt)
            else:
                tf.logging.info("Creating a new model in {}".format(flags_obj.model_dir))
            tf.train.start_queue_runners(sess=sess)
            summary_writer = tf.summary.FileWriter(flags_obj.model_dir, sess.graph)

            count = 0
            best_bleu = 0.0
            for step in xrange(global_step_value, flags_obj.train_steps):
                _, loss_value, lr_value = sess.run(
                    [train_op, loss, learning_rate],
                    feed_dict={g_model.dropout_rate: 0.1})
                if step % 200 == 0:
                    tf.logging.info("step: {}, loss = {:.4f}, lr = {:5f}".format(
                        step, loss_value, lr_value))
                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                # Evaluate more sparsely early in training.
                if step < 10000:
                    steps_between_evals = 2000
                else:
                    steps_between_evals = 1000

                if step % steps_between_evals == 0:
                    sess.run(valid_iterator.initializer)
                    tf.logging.info(
                        "------------------ Evaluation bleu -------------------------")
                    total_bleu = 0.0
                    total_size = 0
                    while True:
                        try:
                            val_pred, val_tgt, val_src = sess.run(
                                [valid_pred, valid_tgt, valid_src],
                                feed_dict={g_model.dropout_rate: 0.0})
                            val_bleu = metrics.compute_bleu(val_tgt, val_pred)
                            batch_size = val_pred.shape[0]
                            total_bleu += val_bleu * batch_size
                            total_size += batch_size
                        except tf.errors.OutOfRangeError:
                            break
                    total_bleu /= total_size
                    tf.logging.info("{}, Step: {}, Valid bleu : {:.6f}".format(
                        datetime.now(), step, total_bleu))
                    tf.logging.info(
                        "--------------------- Finish evaluation ------------------------")

                    # Save the model checkpoint periodically.
                    if step == 0:
                        total_bleu = 0.0
                    if total_bleu > best_bleu:
                        best_bleu = total_bleu
                        checkpoint_path = os.path.join(flags_obj.model_dir, 'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)
                        tf.logging.info("Saving model at {}".format(
                            checkpoint_path + "-" + str(step)))
                    elif total_bleu + 0.003 > best_bleu:
                        # Within tolerance of the best score: save, but do not
                        # update best_bleu.
                        checkpoint_path = os.path.join(flags_obj.model_dir, 'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)
                        tf.logging.info("Saving model at {}".format(
                            checkpoint_path + "-" + str(step)))
                    else:
                        count += 1
                        # Early stop after several evaluations without improvement.
                        if count > 5:
                            break
            tf.logging.info("Best bleu is {}".format(best_bleu))
def build_graph(params):
    my_dataset = dataset.Dataset(params)
    train_iterator = my_dataset.train_input_fn(params)
    valid_iterator = my_dataset.eval_input_fn(params)

    ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)
    if ckpt and tf.train.checkpoint_exists(ckpt):
        init_step = int(ckpt.split("-")[-1])
        global_step = tf.get_variable('global_step', initializer=init_step,
                                      trainable=False)
    else:
        init_step = 0
        global_step = tf.Variable(init_step, trainable=False, name="global_step")

    learning_rate = get_learning_rate(params.learning_rate, params.hidden_size,
                                      params.learning_rate_warmup_steps,
                                      global_step)
    optimizer = tf.contrib.opt.LazyAdamOptimizer(
        learning_rate,
        beta1=params.optimizer_adam_beta1,
        beta2=params.optimizer_adam_beta2,
        epsilon=params.optimizer_adam_epsilon)

    tower_grads = []
    g_tower_grads = []
    g_model = gen_and_dis.Generator(params, is_train=True,
                                    name_scope="Transformer")
    d_model = gen_and_dis.Discriminator(params, is_train=True,
                                        name_scope="Discriminator")
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        for i in xrange(flags_obj.num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    tf.logging.info("Build graph on gpu:{}".format(i))

                    # Pretraining (cross-entropy) loss.
                    logits = g_model.inference(train_iterator.source,
                                               train_iterator.target)
                    xentropy, weights = metrics.padded_cross_entropy_loss(
                        logits, train_iterator.target, params.label_smoothing,
                        params.target_vocab_size)
                    xen_loss = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)

                    # Generator (policy-gradient) loss.
                    gen_samples = g_model.inference(train_iterator.source,
                                                    None)["outputs"]
                    deal_samples = train_helper._trim_and_pad(gen_samples)
                    given_num, rewards, roll_mean_loss, real_mean_loss = g_model.get_reward(
                        real_inputs=train_iterator.source,
                        real_targets=train_iterator.target,
                        gen_targets=deal_samples,
                        roll_num=flags_obj.roll_num,
                        discriminator=d_model)
                    g_loss = g_model.g_loss(gen_targets=deal_samples,
                                            given_num=given_num,
                                            rewards=rewards)

                    xen_grads = optimizer.compute_gradients(xen_loss)
                    gen_grads = optimizer.compute_gradients(g_loss)
                    # Keep only the generator ("Transformer") variables.
                    g_grads = [(grad, var) for grad, var in gen_grads
                               if "Transformer" in var.name]
                    x_grads = [(grad, var) for grad, var in xen_grads
                               if "Transformer" in var.name]
                    tf.logging.info("total trainable variables number: {}, {}".format(
                        len(g_grads), len(x_grads)))
                    tower_grads.append(x_grads)
                    g_tower_grads.append(g_grads)

                    if i == 0 and valid_iterator:
                        val_pred = g_model.inference(inputs=valid_iterator.source,
                                                     targets=None)["outputs"]

    if len(tower_grads) > 1:
        x_grads = train_helper.average_gradients(tower_grads)
        g_grads = train_helper.average_gradients(g_tower_grads)
    else:
        x_grads = tower_grads[0]
        g_grads = g_tower_grads[0]

    apply_gradient_op = optimizer.apply_gradients(x_grads, global_step=global_step)
    g_apply_gradient_op = optimizer.apply_gradients(g_grads, global_step=global_step)
    train_op = tf.group(apply_gradient_op, g_apply_gradient_op)

    train_return = (train_op, global_step, g_loss, xen_loss, rewards,
                    learning_rate, init_step, roll_mean_loss, real_mean_loss)
    valid_return = (val_pred, valid_iterator.target, valid_iterator.source)
    dataset_iter = (train_iterator, valid_iterator)
    return g_model, d_model, train_return, valid_return, dataset_iter
def get_mono_loss(logits, labels):
    xentropy, weights = metrics.padded_cross_entropy_loss(
        logits, labels, params.label_smoothing, params.target_vocab_size)
    cross_entropy_mean = tf.reduce_sum(xentropy) / tf.reduce_sum(weights)
    tf.add_to_collection('mono_losses', cross_entropy_mean)
    return tf.add_n(tf.get_collection('mono_losses'), name='mono_total_loss')
for i in xrange(train_eval_iterations):
    gc.collect()
    print('Starting iteration', i + 1)
    print('Train:')
    for step in xrange(single_iteration_train_steps):
        tic = time()
        losses = 0
        mini_batch_train = dataset_train.get_mini_batch(batch_size=params.batch_size)
        # `inputs` (renamed from `input` to avoid shadowing the builtin) and
        # `targets` are lists with one shard per context in `ctx`.
        inputs = gluon.utils.split_and_load(mini_batch_train['input'], ctx)
        targets = gluon.utils.split_and_load(mini_batch_train['targets'], ctx)
        global_step = global_step + 1
        learning_rate = get_learning_rate(params.learning_rate, params.hidden_size,
                                          params.learning_rate_warmup_steps,
                                          global_step)
        with autograd.record():
            for j in xrange(num_gpu):
                loss = metrics.padded_cross_entropy_loss(
                    net(inputs[j], targets[j]), targets[j],
                    params.label_smoothing, params.vocab_size)
                loss.backward()
                losses = losses + loss
        trainer.set_learning_rate(learning_rate)
        trainer.step(params.batch_size)
        mx.ndarray.waitall()
        # The division by 4 assumes four GPUs; `losses` holds the sum of the
        # per-shard losses.
        print("\t step %d: Loss: %.3f, Time: %.1f seconds"
              % (global_step, losses.mean().asscalar() / 4, time() - tic))

    print('Evaluate:')
    uncased_score = translate_and_compute_bleu(net, subtokenizer, bleu_source, bleu_ref)
    print('\t uncased_score: %.3f' % uncased_score)
    print('\t best_bleu_score: %.3f' % best_bleu_score)
    if uncased_score > best_bleu_score:
        best_bleu_score = uncased_score
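# The multi-GPU loops above and the Gluon loop here all call
# get_learning_rate(learning_rate, hidden_size, warmup_steps, global_step).
# Below is a minimal sketch of the warmup-then-inverse-sqrt-decay schedule
# used by the reference Transformer; this is an assumption about the helper,
# not code from the source, so verify it against your local get_learning_rate.
def noam_learning_rate(learning_rate, hidden_size, warmup_steps, global_step):
    step = tf.to_float(global_step)
    warmup = tf.to_float(warmup_steps)
    rate = learning_rate * hidden_size ** -0.5
    rate *= tf.minimum(1.0, step / warmup)      # linear warmup
    rate *= tf.rsqrt(tf.maximum(step, warmup))  # inverse-sqrt decay afterwards
    return rate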