Example #1
 def nth_model(n):
     """Build the model for the n-th problem, plus some added variables."""
     model_class = registry.model(model)(
         my_hp, mode, my_hp.problems[n], n, dp,
         devices.ps_devices(all_workers=True))
     if mode == tf.contrib.learn.ModeKeys.INFER:
         return model_class.infer(
             features,
             beam_size=FLAGS.decode_beam_size,
             top_beams=(FLAGS.decode_beam_size
                        if FLAGS.decode_return_beams else 1),
             last_position_only=FLAGS.decode_use_last_position_only,
             alpha=FLAGS.decode_alpha,
             decode_length=FLAGS.decode_extra_length)
     # In distributed mode, we build graph for problem=0 and problem=worker_id.
     skipping_is_on = my_hp.problem_choice == "distributed" and train
     problem_worker_id = FLAGS.worker_id % len(my_hp.problems)
     skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id
     # On worker 0 also build graph for problems <= 1.
     # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
     skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1)
     if (FLAGS.eval_run_autoregressive
             and mode == tf.contrib.learn.ModeKeys.EVAL):
         sharded_logits, losses_dict = model_class.eval_autoregressive(
             features)
     else:
         sharded_logits, losses_dict = model_class.model_fn(
             features, skip=(skipping_is_on and skip_this_one))
     with tf.variable_scope("losses_avg"):
         total_loss, ops = 0.0, []
         for loss_key, loss_value in six.iteritems(losses_dict):
             loss_name = "problem_%d/%s_loss" % (n, loss_key)
             loss_moving_avg = tf.get_variable(loss_name,
                                               initializer=100.0,
                                               trainable=False)
             loss_variable_names.append(loss_name)
             ops.append(
                 loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                        loss_value * 0.1))
             total_loss += loss_value
         with tf.variable_scope(tf.get_variable_scope(), reuse=True):
             # Total loss was already constructed on input.
             loss_moving_avg = tf.get_variable(
                 "problem_%d/total_loss" % n)
         ops.append(
             loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                    total_loss * 0.1))
     with tf.variable_scope("train_stats"):  # Count steps for this problem.
         problem_steps = tf.get_variable("problem_%d_steps" % n,
                                         initializer=0,
                                         trainable=False)
         ops.append(problem_steps.assign_add(1))
     with tf.control_dependencies(ops):  # Make sure the ops run.
         # Ensure the loss is a scalar here.
         total_loss = tf.reshape(total_loss, [],
                                 name="total_loss_control_id")
     return [total_loss] + sharded_logits  # Need to flatten for cond later.
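
A note on the flattened return value above: tf.cond/tf.case require every branch to return the same flat list of tensors, which is why the loss and the sharded logits are packed into a single list. A minimal sketch (not from the source; the branch wiring is assumed) of how a caller might select the n-th problem's graph:

    # Sketch: pick one problem's subgraph with tf.case. Assumes
    # features["problem_choice"] is a scalar int32 tensor and that exactly
    # one predicate is true (no default branch given).
    pred_fn_pairs = [(tf.equal(features["problem_choice"], n),
                      lambda n=n: nth_model(n))
                     for n in range(len(my_hp.problems))]
    outputs = tf.case(pred_fn_pairs, exclusive=True)
    total_loss, sharded_logits = outputs[0], outputs[1:]
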
Example #2
 def nth_model(n):
   """Build the model for the n-th problem, plus some added variables."""
   model_class = registry.model(model)(
       hparams,
       mode,
       hparams.problems[n],
       n,
       dp,
       devices.ps_devices(all_workers=True),
       decode_hparams=decode_hparams)
   if mode == tf.estimator.ModeKeys.PREDICT:
     return model_class.infer(
         features,
         beam_size=decode_hp.beam_size,
         top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
         alpha=decode_hp.alpha,
         decode_length=decode_hp.extra_length)
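    # decode_hp above is a decoding-HParams object. A hypothetical construction
    # (not from the source) with only the fields used here:
    #   decode_hp = tf.contrib.training.HParams(
    #       beam_size=4, return_beams=False, alpha=0.6, extra_length=50)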
   # In distributed mode, we build graph for problem=0 and problem=worker_id.
   skipping_is_on = hparams.problem_choice == "distributed" and is_training
   problem_worker_id = worker_id % len(hparams.problems)
   skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
   # On worker 0 also build graph for problems <= 1.
   # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
   skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
   if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
     logits, losses_dict = model_class.eval_autoregressive(features)
   else:
     logits, losses_dict = model_class(
         features, skip=(skipping_is_on and skip_this_one))
   with tf.variable_scope("losses_avg"):
     total_loss, ops = 0.0, []
     for loss_key, loss_value in six.iteritems(losses_dict):
       loss_name = "problem_%d/%s_loss" % (n, loss_key)
       loss_moving_avg = tf.get_variable(
           loss_name, initializer=100.0, trainable=False)
       loss_variable_names.append(loss_name)
       ops.append(
           loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
       total_loss += loss_value
     try:  # Total loss avg might be reused or not, we try both.
       with tf.variable_scope(tf.get_variable_scope(), reuse=True):
         # Total loss was already constructed on input.
         loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
     except ValueError:
       loss_moving_avg = tf.get_variable(
           "problem_%d/total_loss" % n, initializer=100.0, trainable=False)
     ops.append(
         loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
   with tf.variable_scope("train_stats"):  # Count steps for this problem.
     problem_steps = tf.get_variable(
         "problem_%d_steps" % n, initializer=0, trainable=False)
     ops.append(problem_steps.assign_add(1))
   with tf.control_dependencies(ops):  # Make sure the ops run.
     # Ensure the loss is a scalar here.
     total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
   return [total_loss, logits]
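
The try/except around tf.get_variable in Example #2 above is a get-or-create idiom worth isolating. A standalone sketch of the same pattern (function name hypothetical):

    def get_or_create_loss_avg(name):
        """Reuse the variable if it already exists, else create it."""
        try:
            with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                return tf.get_variable(name)
        except ValueError:  # Not created yet in this scope.
            return tf.get_variable(name, initializer=100.0, trainable=False)
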
Example #3

    def __init__(self,
                 t2t_usr_dir,
                 src_vocab_size,
                 trg_vocab_size,
                 model_name,
                 problem_name,
                 hparams_set_name,
                 checkpoint_dir,
                 t2t_unk_id=None,
                 single_cpu_thread=False):
        """Creates a new T2T predictor. The constructor prepares the
        TensorFlow session for predict_next() calls. This includes:
        - Load hyperparameters from the given set (hparams)
        - Update registry, load T2T model
        - Create TF placeholders for source sequence and target prefix
        - Create computation graph for computing log probs.
        - Create a MonitoredSession object, which also handles 
          restoring checkpoints.
        
        Args:
            t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
            src_vocab_size (int): Source vocabulary size.
            trg_vocab_size (int): Target vocabulary size.
            model_name (string): T2T model name.
            problem_name (string): T2T problem name.
            hparams_set_name (string): T2T hparams set name.
            checkpoint_dir (string): Path to the T2T checkpoint
                                     directory. The predictor will load
                                     the topmost checkpoint in the
                                     `checkpoints` file.
            t2t_unk_id (int): If set, use this ID to get UNK scores. If
                              None, UNK is always scored with -inf.
            single_cpu_thread (bool): If true, prevent TensorFlow from
                                      using multiple threads.
        """
        super(T2TPredictor, self).__init__(t2t_usr_dir, checkpoint_dir,
                                           t2t_unk_id, single_cpu_thread)
        self.consumed = []
        self.src_sentence = []
        predictor_graph = tf.Graph()
        with predictor_graph.as_default():
            hparams = self._create_hparams(src_vocab_size, trg_vocab_size,
                                           hparams_set_name, problem_name)
            p_hparams = hparams.problems[0]
            self._inputs_var = tf.placeholder(dtype=tf.int32,
                                              shape=[None],
                                              name="sgnmt_inputs")
            self._targets_var = tf.placeholder(dtype=tf.int32,
                                               shape=[None],
                                               name="sgnmt_targets")

            def expand_input_dims_for_t2t(t):
                """Expand a [length] vector to T2T's 4-D [1, length, 1, 1] format."""
                t = tf.expand_dims(t, 0)   # Add batch dimension.
                t = tf.expand_dims(t, -1)  # Add modality dimension.
                t = tf.expand_dims(t, -1)  # Add channel dimension.
                return t

            features = {
                "problem_choice": tf.constant(0),
                "input_space_id": tf.constant(p_hparams.input_space_id),
                "target_space_id": tf.constant(p_hparams.target_space_id),
                "inputs": expand_input_dims_for_t2t(self._inputs_var),
                "targets": expand_input_dims_for_t2t(self._targets_var)
            }

            model = registry.model(model_name)(
                hparams, tf.estimator.ModeKeys.PREDICT, hparams.problems[0], 0,
                devices.data_parallelism(),
                devices.ps_devices(all_workers=True))
            sharded_logits, _ = model.model_fn(features,
                                               last_position_only=True)
            self._log_probs = log_prob_from_logits(sharded_logits[0])
            self.mon_sess = self.create_session()
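
For context, a minimal sketch (the method body is assumed, not from the source) of how predict_next() might evaluate the graph built above, using the attributes set up in the constructor:

    def predict_next(self):
        """Score the next target token given the consumed prefix (sketch)."""
        log_probs = self.mon_sess.run(
            self._log_probs,
            feed_dict={self._inputs_var: self.src_sentence,
                       self._targets_var: self.consumed})
        return log_probs  # Log probabilities over the target vocabulary.
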
Example #4
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 model_name,
                 problem_name,
                 hparams_set_name,
                 t2t_usr_dir,
                 checkpoint_dir,
                 t2t_unk_id=None,
                 single_cpu_thread=False,
                 max_terminal_id=-1,
                 pop_id=-1):
        """Creates a new simultaneous T2T predictor. The constructor prepares
        the TensorFlow session for predict_next() calls. This includes:
        - Load hyperparameters from the given set (hparams)
        - Update registry, load T2T model
        - Create TF placeholders for source sequence and target prefix
        - Create computation graph for computing log probs.
        - Create a MonitoredSession object, which also handles
          restoring checkpoints.

        Args:
            src_vocab_size (int): Source vocabulary size.
            trg_vocab_size (int): Target vocabulary size.
            model_name (string): T2T model name.
            problem_name (string): T2T problem name.
            hparams_set_name (string): T2T hparams set name.
            t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
            checkpoint_dir (string): Path to the T2T checkpoint
                                     directory. The predictor will load
                                     the topmost checkpoint in the
                                     `checkpoints` file.
            t2t_unk_id (int): If set, use this ID to get UNK scores. If
                              None, UNK is always scored with -inf.
            single_cpu_thread (bool): If true, prevent TensorFlow from
                                      using multiple threads.
            max_terminal_id (int): If positive, maximum terminal ID. Needs to
                be set for syntax-based T2T models.
            pop_id (int): If positive, ID of the POP or closing bracket symbol.
                Needs to be set for syntax-based T2T models.
        """
        super(SimT2TPredictor_v2, self).__init__(t2t_usr_dir, checkpoint_dir,
                                                 t2t_unk_id, single_cpu_thread)
        self.consumed = []
        self.src_sentence = []
        self.pop_id = pop_id
        self.max_terminal_id = max_terminal_id
        self.previous_encode = -1
        self.previous_decode = -1
        predictor_graph = tf.Graph()
        with predictor_graph.as_default():
            hparams = self._create_hparams(src_vocab_size, trg_vocab_size,
                                           hparams_set_name, problem_name)
            p_hparams = hparams.problems[0]
            self._inputs_var = tf.placeholder(dtype=tf.int32,
                                              shape=[None],
                                              name="sgnmt_inputs")
            self._targets_var = tf.placeholder(dtype=tf.int32,
                                               shape=[None],
                                               name="sgnmt_targets")
            features = {
                "problem_choice": tf.constant(0),
                "input_space_id": tf.constant(p_hparams.input_space_id),
                "target_space_id": tf.constant(p_hparams.target_space_id),
                "inputs": expand_input_dims_for_t2t(self._inputs_var),
                "targets": expand_input_dims_for_t2t(self._targets_var)
            }

            model = registry.model(model_name)(
                hparams, tf.estimator.ModeKeys.PREDICT, hparams.problems[0], 0,
                devices.data_parallelism(),
                devices.ps_devices(all_workers=True))
            sharded_logits, _ = model.model_fn(features)
            self._log_probs = log_prob_from_logits(sharded_logits[0])
            self._encoder_output = model.encoder_output
            self._encoder_decoder_attention_bias = model.attention_bias
            self._decoder_output = model.decoder_output

            self.mon_sess = self.create_session()
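
The encoder/decoder tensors cached above expose the model internals needed for simultaneous decoding. A hypothetical usage (not from the source) that encodes the source once and reuses the result:

    # Sketch: run the encoder once per source sentence; src_ids is an
    # assumed list of source token IDs.
    enc_out, att_bias = self.mon_sess.run(
        [self._encoder_output, self._encoder_decoder_attention_bias],
        feed_dict={self._inputs_var: src_ids})
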
Example #5
 def nth_model(n):
     """Build the model for the n-th problem, plus some added variables."""
     model_class = registry.model(model)(
         hparams,
         mode,
         hparams.problems[n],
         n,
         dp,
         devices.ps_devices(all_workers=True),
         decode_hparams=decode_hparams
     )  # initialize transformer model class: hparams, modalities
     if mode == tf.estimator.ModeKeys.PREDICT:
         return model_class.infer(features,
                                  beam_size=decode_hp.beam_size,
                                  top_beams=(decode_hp.beam_size if
                                             decode_hp.return_beams else 1),
                                  alpha=decode_hp.alpha,
                                  decode_length=decode_hp.extra_length)
     # In distributed mode, we build graph for problem=0 and problem=worker_id.
     skipping_is_on = hparams.problem_choice == "distributed" and is_training
     problem_worker_id = worker_id % len(hparams.problems)
     skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
     # On worker 0 also build graph for problems <= 1.
     # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
     skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
     mrt_samples = getattr(hparams, 'mrt_samples', None)
     if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:  # evaluation mode
         sharded_logits, losses_dict = model_class.eval_autoregressive(
             features)
     else:  # training mode
         if hparams.rl:
             # Generate sample data; it will automatically be sharded.
             # Samples shape: [batch, time, 1, 1].
             if model_class._num_datashards == 1:  # work on single GPU cards, fast sample
                 print("###Work on Single GPU card, Use Fast Decode.###")
                 train_beam = getattr(hparams, 'train_beam', None)
                 if mrt_samples:
                     samples, _ = model_class._fast_decode(
                         features,
                         decode_length=50,
                         beam_size=mrt_samples,
                         top_beams=mrt_samples)
                     inputs = tf.squeeze(
                         tf.squeeze(features["inputs"], axis=-1), axis=-1)
                     targets = tf.squeeze(
                         tf.squeeze(features["targets"], axis=-1), axis=-1)
                     batch_size = tf.shape(inputs)[0]
                     inputs_len = tf.shape(inputs)[1]
                     targets_len = tf.shape(targets)[1]
                     inputs_tile = tf.tile(inputs, [1, mrt_samples])
                     targets_tile = tf.tile(targets, [1, mrt_samples])
                     inputs_reshape = tf.reshape(
                         inputs_tile,
                         [batch_size * mrt_samples, inputs_len])
                     targets_reshape = tf.reshape(
                         targets_tile,
                         [batch_size * mrt_samples, targets_len])
                     inputs_feed = tf.expand_dims(
                         tf.expand_dims(inputs_reshape, axis=-1), axis=-1)
                     targets_feed = tf.expand_dims(
                         tf.expand_dims(targets_reshape, axis=-1), axis=-1)
                     features["inputs"] = inputs_feed
                     features["targets"] = targets_feed
                 elif train_beam and train_beam != 1:  # Beam search with beam size hparams.train_beam; return the top sample.
                     samples, _ = model_class._fast_decode(
                         features,
                         decode_length=50,
                         beam_size=hparams.train_beam)
                 else:
                     targets_beam = getattr(hparams, 'targets_beam', None)
                     if targets_beam:
                         targets_samples, _ = model_class._fast_decode(
                             features,
                             decode_length=50,
                             beam_size=4,
                             sampling_method='argmax')
                         targets_samples = tf.reshape(
                             targets_samples, [
                                 tf.shape(targets_samples)[0],
                                 tf.shape(targets_samples)[1], 1, 1
                             ])
                         features["targets"] = targets_samples
                     samples, _ = model_class._fast_decode(features,
                                                           decode_length=50)
                 # Add two dimensions so samples match T2T's 4-D feature format.
                 samples = tf.expand_dims(samples, axis=-1)
                 samples = tf.expand_dims(samples, axis=-1)
             else:  # work on multi GPU cards, only support slow sample
                 print("###Work on Multi GPU cards, Use Slow Decode.###")
                 samples, _, _ = model_class._slow_greedy_infer(
                     features,
                     decode_length=50)  # default decode_length = 50
             samples = tf.stop_gradient(samples)
             # calculate bleu score use metric_fn
             # train_metric_fn = "approx_bleu_train_score"
             train_metric_fn = metrics.METRICS_FNS[
                 metrics.Metrics.APPROX_BLEU_TRAIN]
             labels = features.get("targets", None)
             samples.set_shape([None, None, 1, 1])
             # hparams.delta_reward = True for delta reward; False for total reward.
             metric_value = train_metric_fn(
                 samples, labels, delta_reward=hparams.delta_reward)
             metric_value = tf.stop_gradient(
                 metric_value)  # Be strict: no gradient through the reward.
             metric_value.set_shape([None, None, 1, 1])
             """Accodring to the metrics.py: The tf.metrics.mean function assures correct aggregation."""
             # metric_value is total_reward: scalar
             features["samples"] = samples
             features["values"] = metric_value
             # del samples
             # del labels
         sharded_logits, losses_dict = model_class.model_fn(
             features,
             skip=(skipping_is_on and skip_this_one),
             mrt=mrt_samples)
         # if hparams.rl:
         #     training_loss = losses_dict["training"] * metric_value  # losses_dict["training"]: [batch, timesteps]
         #     training_loss_sum = tf.reduce_sum(training_loss)  # sum the training_loss
         #     losses_dict["training"] = training_loss_sum  # log_prob * r (current r is total_reward)
     with tf.variable_scope("losses_avg"):
         total_loss, ops = 0.0, []
         for loss_key, loss_value in six.iteritems(losses_dict):
             if hparams.rl:
                 baseline_loss_weight = getattr(hparams,
                                                'baseline_loss_weight', 1.0)
                 training_loss_weight = getattr(hparams,
                                                'training_loss_weight', 1.0)
                 mle_training_loss_weight = getattr(
                     hparams, 'mle_training_loss_weight', 0.3)
                 if loss_key == "training":
                     loss_value = loss_value * training_loss_weight
                 elif loss_key == "training_baseline":
                     loss_value = loss_value * baseline_loss_weight
                 elif loss_key == "mle_training":
                     loss_value = loss_value * mle_training_loss_weight
             loss_name = "problem_%d/%s_loss" % (n, loss_key)
             loss_moving_avg = tf.get_variable(loss_name,
                                               initializer=100.0,
                                               trainable=False)
             loss_variable_names.append(loss_name)
             ops.append(
                 loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                        loss_value * 0.1))
             total_loss += loss_value
         try:  # Total loss avg might be reused or not, we try both.
             with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                 # Total loss was already constructed on input.
                 loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                   n)
         except ValueError:
             loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                               initializer=100.0,
                                               trainable=False)
         ops.append(
             loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                    total_loss * 0.1))
     with tf.variable_scope("train_stats"):  # Count steps for this problem.
         problem_steps = tf.get_variable("problem_%d_steps" % n,
                                         initializer=0,
                                         trainable=False)
         ops.append(problem_steps.assign_add(1))
     with tf.control_dependencies(ops):  # Make sure the ops run.
         # Ensure the loss is a scalar here.
         total_loss = tf.reshape(total_loss, [],
                                 name="total_loss_control_id")
     return [total_loss, tf.concat(sharded_logits, 0)]
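
The commented-out block near the model_fn call in Example #5 hints at how the reward would enter the objective. A REINFORCE-style sketch under that assumption (not active in the source):

    # Scale the per-token log-prob loss by the stopped-gradient reward,
    # then sum: loss = sum(log_prob * reward).
    training_loss = losses_dict["training"] * metric_value  # [batch, timesteps]
    losses_dict["training"] = tf.reduce_sum(training_loss)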