Example #1
def create_experiment_components(hparams, output_dir, data_dir, model_name):
  """Constructs and returns Estimator and train/eval input functions."""
  tf.logging.info("Creating experiment, storing model files in %s", output_dir)

  num_datashards = devices.data_parallelism().n
  train_input_fn = input_fn_builder.build_input_fn(
      mode=tf.contrib.learn.ModeKeys.TRAIN,
      hparams=hparams,
      data_file_patterns=get_data_filepatterns(data_dir,
                                               tf.contrib.learn.ModeKeys.TRAIN),
      num_datashards=num_datashards)

  eval_input_fn = input_fn_builder.build_input_fn(
      mode=tf.contrib.learn.ModeKeys.EVAL,
      hparams=hparams,
      data_file_patterns=get_data_filepatterns(data_dir,
                                               tf.contrib.learn.ModeKeys.EVAL),
      num_datashards=num_datashards)
  estimator = tf.contrib.learn.Estimator(
      model_fn=model_builder.build_model_fn(model_name, hparams=hparams),
      model_dir=output_dir,
      config=tf.contrib.learn.RunConfig(
          master=FLAGS.master,
          model_dir=output_dir,
          gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction,
          session_config=session_config(),
          keep_checkpoint_max=FLAGS.keep_checkpoint_max))
  # Store the hparams in the estimator as well
  estimator.hparams = hparams
  return estimator, {
      tf.contrib.learn.ModeKeys.TRAIN: train_input_fn,
      tf.contrib.learn.ModeKeys.EVAL: eval_input_fn
  }
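The pair returned above (an Estimator plus a dict of input functions keyed by mode) is presumably meant to be wrapped in a tf.contrib.learn Experiment. A minimal sketch of that wiring, assuming TF 1.x with tf.contrib.learn available; the paths and step counts are placeholders and hparams is assumed to exist:

estimator, input_fns = create_experiment_components(
    hparams=hparams,                  # assumed to be an existing HParams object
    output_dir="/tmp/t2t_model",      # placeholder path
    data_dir="/tmp/t2t_data",         # placeholder path
    model_name="transformer")
experiment = tf.contrib.learn.Experiment(
    estimator=estimator,
    train_input_fn=input_fns[tf.contrib.learn.ModeKeys.TRAIN],
    eval_input_fn=input_fns[tf.contrib.learn.ModeKeys.EVAL],
    train_steps=1000,                 # placeholder
    eval_steps=100)                   # placeholder
experiment.train_and_evaluate()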
Example #2
def create_experiment_components(hparams, output_dir, data_dir, model_name):
    """Constructs and returns Estimator and train/eval input functions."""
    tf.logging.info("Creating experiment, storing model files in %s",
                    output_dir)

    num_datashards = devices.data_parallelism().n
    train_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.TRAIN,
        hparams=hparams,
        data_file_patterns=get_data_filepatterns(data_dir,
                                                 tf.estimator.ModeKeys.TRAIN),
        num_datashards=num_datashards,
        worker_replicas=FLAGS.worker_replicas,
        worker_id=FLAGS.worker_id)

    eval_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.EVAL,
        hparams=hparams,
        data_file_patterns=get_data_filepatterns(data_dir,
                                                 tf.estimator.ModeKeys.EVAL),
        num_datashards=num_datashards,
        worker_replicas=FLAGS.worker_replicas,
        worker_id=FLAGS.worker_id)

    autotune = False
    objective = None
    if hasattr(FLAGS, "autotune"):
        autotune = FLAGS.autotune
        objective = FLAGS.objective
    model_fn = model_builder.build_model_fn(
        model_name,
        problem_names=FLAGS.problems.split("-"),
        train_steps=FLAGS.train_steps,
        worker_id=FLAGS.worker_id,
        worker_replicas=FLAGS.worker_replicas,
        eval_run_autoregressive=FLAGS.eval_run_autoregressive,
        decode_hparams=decoding.decode_hparams(FLAGS.decode_hparams),
        autotune=autotune,
        objective=objective)
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=output_dir,
        params=hparams,
        config=tf.contrib.learn.RunConfig(
            master=FLAGS.master,
            gpu_memory_fraction=FLAGS.worker_gpu_memory_fraction,
            session_config=session_config(),
            keep_checkpoint_max=FLAGS.keep_checkpoint_max,
            keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours,
            save_checkpoints_secs=FLAGS.save_checkpoints_secs))

    return estimator, {
        tf.estimator.ModeKeys.TRAIN: train_input_fn,
        tf.estimator.ModeKeys.EVAL: eval_input_fn
    }
Example #3
def create_experiment_components(data_dir, model_name, hparams, run_config):
    """Constructs and returns Estimator and train/eval input functions."""
    tf.logging.info("Creating experiment, storing model files in %s",
                    run_config.model_dir)

    add_problem_hparams(hparams, FLAGS.problems)

    # hparams batch_size is used as minibatch size instead of tokens in batch
    batch_size = (hparams.use_fixed_batch_size and hparams.batch_size) or None
    num_datashards = devices.data_parallelism().n
    train_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.TRAIN,
        hparams=hparams,
        data_dir=data_dir,
        num_datashards=num_datashards,
        worker_replicas=FLAGS.worker_replicas,
        worker_id=FLAGS.worker_id,
        batch_size=batch_size)  # return feature_map, feature_map["targets"]

    eval_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.EVAL,
        hparams=hparams,
        data_dir=data_dir,
        num_datashards=num_datashards,
        worker_replicas=FLAGS.worker_replicas,
        worker_id=FLAGS.worker_id,
        dataset_split="test"
        if FLAGS.eval_use_test_set else None)  # evaluate on test dataset
    # input_fn return feature_map

    model_fn = model_builder.build_model_fn(
        model_name,
        problem_names=FLAGS.problems.split("-"),
        train_steps=FLAGS.train_steps,
        worker_id=FLAGS.worker_id,
        worker_replicas=FLAGS.worker_replicas,
        eval_run_autoregressive=FLAGS.eval_run_autoregressive,
        decode_hparams=decoding.decode_hparams(FLAGS.decode_hparams))

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=run_config.model_dir,
                                       params=hparams,
                                       config=run_config)

    return estimator, {
        tf.estimator.ModeKeys.TRAIN: train_input_fn,
        tf.estimator.ModeKeys.EVAL: eval_input_fn
    }
Example #4
def create_experiment_components(data_dir, model_name, hparams, run_config):
  """Constructs and returns Estimator and train/eval input functions."""
  tf.logging.info("Creating experiment, storing model files in %s",
                  run_config.model_dir)

  add_problem_hparams(hparams, FLAGS.problems)

  # hparams batch_size is used as minibatch size instead of tokens in batch
  batch_size = (hparams.use_fixed_batch_size and hparams.batch_size) or None
  num_datashards = devices.data_parallelism(hparams).n
  train_input_fn = input_fn_builder.build_input_fn(
      mode=tf.estimator.ModeKeys.TRAIN,
      hparams=hparams,
      data_dir=data_dir,
      num_datashards=num_datashards,
      worker_replicas=FLAGS.worker_replicas,
      worker_id=FLAGS.worker_id,
      batch_size=batch_size)

  eval_input_fn = input_fn_builder.build_input_fn(
      mode=tf.estimator.ModeKeys.EVAL,
      hparams=hparams,
      data_dir=data_dir,
      num_datashards=num_datashards,
      worker_replicas=FLAGS.worker_replicas,
      worker_id=FLAGS.worker_id,
      dataset_split="test" if FLAGS.eval_use_test_set else None)

  model_fn = model_builder.build_model_fn(
      model_name,
      problem_names=FLAGS.problems.split("-"),
      train_steps=FLAGS.train_steps,
      worker_id=FLAGS.worker_id,
      worker_replicas=FLAGS.worker_replicas,
      eval_run_autoregressive=FLAGS.eval_run_autoregressive,
      decode_hparams=decoding.decode_hparams(FLAGS.decode_hparams))

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      model_dir=run_config.model_dir,
      params=hparams,
      config=run_config)

  return estimator, {
      tf.estimator.ModeKeys.TRAIN: train_input_fn,
      tf.estimator.ModeKeys.EVAL: eval_input_fn
  }
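Examples 3 and 4 take an existing run_config and return the same (estimator, input_fns) pair. A minimal sketch of driving them with the core tf.estimator training loop, assuming hparams, run_config, and FLAGS.train_steps already exist as in the surrounding code; the path and model name are placeholders:

estimator, input_fns = create_experiment_components(
    data_dir="/tmp/t2t_data",         # placeholder path
    model_name="transformer",
    hparams=hparams,
    run_config=run_config)
train_spec = tf.estimator.TrainSpec(
    input_fn=input_fns[tf.estimator.ModeKeys.TRAIN],
    max_steps=FLAGS.train_steps)
eval_spec = tf.estimator.EvalSpec(
    input_fn=input_fns[tf.estimator.ModeKeys.EVAL])
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)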
Example #5
def create_experiment_components(data_dir, model_name, hparams, run_config):
    """Constructs and returns Estimator and train/eval input functions."""
    tf.logging.info("Creating experiment, storing model files in %s",
                    run_config.model_dir)

    hparams = add_problem_hparams(hparams, FLAGS.problems)

    num_datashards = devices.data_parallelism().n
    train_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.TRAIN,
        hparams=hparams,
        data_file_patterns=get_data_filepatterns(data_dir,
                                                 tf.estimator.ModeKeys.TRAIN),
        num_datashards=num_datashards,
        worker_replicas=FLAGS.worker_replicas,
        worker_id=FLAGS.worker_id)

    eval_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.EVAL,
        hparams=hparams,
        data_file_patterns=get_data_filepatterns(data_dir,
                                                 tf.estimator.ModeKeys.EVAL),
        num_datashards=num_datashards,
        worker_replicas=FLAGS.worker_replicas,
        worker_id=FLAGS.worker_id)

    model_fn = model_builder.build_model_fn(
        model_name,
        problem_names=FLAGS.problems.split("-"),
        train_steps=FLAGS.train_steps,
        worker_id=FLAGS.worker_id,
        worker_replicas=FLAGS.worker_replicas,
        eval_run_autoregressive=FLAGS.eval_run_autoregressive,
        decode_hparams=decoding.decode_hparams(FLAGS.decode_hparams))

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       model_dir=run_config.model_dir,
                                       params=hparams,
                                       config=run_config)

    return estimator, {
        tf.estimator.ModeKeys.TRAIN: train_input_fn,
        tf.estimator.ModeKeys.EVAL: eval_input_fn
    }
Example #6
def create_run_config(master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      shard_to_cpu=False,
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      sync=False,
                      use_tpu=False):
    """Create RunConfig, TPUConfig, and Parallelism object."""
    session_config = create_session_config(
        log_device_placement=log_device_placement,
        enable_graph_rewriter=enable_graph_rewriter,
        gpu_mem_fraction=gpu_mem_fraction,
        use_tpu=use_tpu)
    run_config_args = {
        "master": master,
        "model_dir": model_dir,
        "session_config": session_config,
        "save_summary_steps": 0,
        "save_checkpoints_steps": save_checkpoints_steps,
        "keep_checkpoint_max": keep_checkpoint_max,
        "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
    }
    run_config_cls = tf.contrib.learn.RunConfig

    # If using TPU, use TPU RunConfig, add TPUConfig, and add additional args
    if use_tpu:
        run_config_cls = tf.contrib.tpu.RunConfig
        tpu_config = tf.contrib.tpu.TPUConfig(
            iterations_per_loop=iterations_per_loop,
            num_shards=num_shards,
            per_host_input_for_training=(num_shards <= 8))
        run_config_args["tpu_config"] = tpu_config

    config = run_config_cls(**run_config_args)

    # If not using TPU, add device info for data_parallelism
    config.use_tpu = use_tpu
    if not use_tpu:
        config.t2t_device_info = {
            "num_async_replicas": num_async_replicas,
        }
        if no_data_parallelism:
            config.data_parallelism = expert_utils.Parallelism([""])
        else:
            config.data_parallelism = devices.data_parallelism(
                daisy_chain_variables=daisy_chain_variables,
                ps_replicas=ps_replicas,
                ps_job=ps_job,
                ps_gpu=ps_gpu,
                schedule=schedule,
                sync=sync,
                worker_gpu=num_gpus,
                worker_replicas=num_async_replicas,
                worker_id=worker_id,
                gpu_order=gpu_order,
                locally_shard_to_cpu=shard_to_cpu,
                worker_job=worker_job)

    return config
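A minimal usage sketch for create_run_config, assuming a model_fn built elsewhere (e.g. with model_builder.build_model_fn) and an existing hparams object; the path and device count are placeholders:

run_config = create_run_config(
    model_dir="/tmp/t2t_model",       # placeholder path
    num_gpus=2,                       # placeholder device count
    save_checkpoints_steps=2000)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,                # assumed to exist already
    model_dir="/tmp/t2t_model",
    params=hparams,                   # assumed to exist already
    config=run_config)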
Example #7
    def __init__(self,
                 src_vocab_size,
                 trg_vocab_size,
                 model_name,
                 problem_name,
                 hparams_set_name,
                 t2t_usr_dir,
                 checkpoint_dir,
                 t2t_unk_id=None,
                 single_cpu_thread=False,
                 max_terminal_id=-1,
                 pop_id=-1):
        """Creates a new simultaneous T2T predictor. The constructor prepares
        the TensorFlow session for predict_next() calls. This includes:
        - Load hyper parameters from the given set (hparams)
        - Update registry, load T2T model
        - Create TF placeholders for source sequence and target prefix
        - Create computation graph for computing log probs.
        - Create a MonitoredSession object, which also handles
          restoring checkpoints.

        Args:
            src_vocab_size (int): Source vocabulary size.
            trg_vocab_size (int): Target vocabulary size.
            model_name (string): T2T model name.
            problem_name (string): T2T problem name.
            hparams_set_name (string): T2T hparams set name.
            t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
            checkpoint_dir (string): Path to the T2T checkpoint
                                     directory. The predictor will load
                                     the top most checkpoint in the
                                     `checkpoints` file.
            t2t_unk_id (int): If set, use this ID to get UNK scores. If
                              None, UNK is always scored with -inf.
            single_cpu_thread (bool): If true, prevent tensorflow from
                                      doing multithreading.
            max_terminal_id (int): If positive, maximum terminal ID. Needs to
                be set for syntax-based T2T models.
            pop_id (int): If positive, ID of the POP or closing bracket symbol.
                Needs to be set for syntax-based T2T models.
        """
        super(SimT2TPredictor_v2, self).__init__(t2t_usr_dir, checkpoint_dir,
                                                 t2t_unk_id, single_cpu_thread)
        self.consumed = []
        self.src_sentence = []
        self.pop_id = pop_id
        self.max_terminal_id = max_terminal_id
        self.previous_encode = -1
        self.previous_decode = -1
        predictor_graph = tf.Graph()
        with predictor_graph.as_default() as g:
            hparams = self._create_hparams(src_vocab_size, trg_vocab_size,
                                           hparams_set_name, problem_name)
            p_hparams = hparams.problems[0]
            self._inputs_var = tf.placeholder(dtype=tf.int32,
                                              shape=[None],
                                              name="sgnmt_inputs")
            self._targets_var = tf.placeholder(dtype=tf.int32,
                                               shape=[None],
                                               name="sgnmt_targets")
            features = {
                "problem_choice": tf.constant(0),
                "input_space_id": tf.constant(p_hparams.input_space_id),
                "target_space_id": tf.constant(p_hparams.target_space_id),
                "inputs": expand_input_dims_for_t2t(self._inputs_var),
                "targets": expand_input_dims_for_t2t(self._targets_var)
            }

            model = registry.model(model_name)(
                hparams, tf.estimator.ModeKeys.PREDICT, hparams.problems[0], 0,
                devices.data_parallelism(),
                devices.ps_devices(all_workers=True))
            sharded_logits, _ = model.model_fn(features)
            self._log_probs = log_prob_from_logits(sharded_logits[0])
            self._encoder_output = model.encoder_output
            self._encoder_decoder_attention_bias = model.attention_bias
            self._decoder_output = model.decoder_output

            self.mon_sess = self.create_session()
Example #8
    def __init__(self,
                 t2t_usr_dir,
                 src_vocab_size,
                 trg_vocab_size,
                 model_name,
                 problem_name,
                 hparams_set_name,
                 checkpoint_dir,
                 t2t_unk_id=None,
                 single_cpu_thread=False):
        """Creates a new T2T predictor. The constructor prepares the
        TensorFlow session for predict_next() calls. This includes:
        - Load hyper parameters from the given set (hparams)
        - Update registry, load T2T model
        - Create TF placeholders for source sequence and target prefix
        - Create computation graph for computing log probs.
        - Create a MonitoredSession object, which also handles 
          restoring checkpoints.
        
        Args:
            t2t_usr_dir (string): See --t2t_usr_dir in tensor2tensor.
            src_vocab_size (int): Source vocabulary size.
            trg_vocab_size (int): Target vocabulary size.
            model_name (string): T2T model name.
            problem_name (string): T2T problem name.
            hparams_set_name (string): T2T hparams set name.
            checkpoint_dir (string): Path to the T2T checkpoint 
                                     directory. The predictor will load
                                     the top most checkpoint in the 
                                     `checkpoints` file.
            t2t_unk_id (int): If set, use this ID to get UNK scores. If
                              None, UNK is always scored with -inf.
            single_cpu_thread (bool): If true, prevent tensorflow from
                                      doing multithreading.
        """
        super(T2TPredictor, self).__init__(t2t_usr_dir, checkpoint_dir,
                                           t2t_unk_id, single_cpu_thread)
        self.consumed = []
        self.src_sentence = []
        predictor_graph = tf.Graph()
        with predictor_graph.as_default() as g:
            hparams = self._create_hparams(src_vocab_size, trg_vocab_size,
                                           hparams_set_name, problem_name)
            p_hparams = hparams.problems[0]
            self._inputs_var = tf.placeholder(dtype=tf.int32,
                                              shape=[None],
                                              name="sgnmt_inputs")
            self._targets_var = tf.placeholder(dtype=tf.int32,
                                               shape=[None],
                                               name="sgnmt_targets")

            def expand_input_dims_for_t2t(t):
                t = tf.expand_dims(t, 0)  # Because of batch_size
                t = tf.expand_dims(t, -1)  # Because of modality
                t = tf.expand_dims(t, -1)  # Because of random reason X
                return t

            features = {
                "problem_choice": tf.constant(0),
                "input_space_id": tf.constant(p_hparams.input_space_id),
                "target_space_id": tf.constant(p_hparams.target_space_id),
                "inputs": expand_input_dims_for_t2t(self._inputs_var),
                "targets": expand_input_dims_for_t2t(self._targets_var)
            }

            model = registry.model(model_name)(
                hparams, tf.estimator.ModeKeys.PREDICT, hparams.problems[0], 0,
                devices.data_parallelism(),
                devices.ps_devices(all_workers=True))
            sharded_logits, _ = model.model_fn(features,
                                               last_position_only=True)
            self._log_probs = log_prob_from_logits(sharded_logits[0])
            self.mon_sess = self.create_session()
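Both predictor constructors above only build the graph; scoring happens later against self._log_probs. A hypothetical helper showing how that graph could be queried (this is a sketch, not the original predict_next implementation):

def predict_next_sketch(predictor):
    """Hypothetical helper: feed the placeholders built in __init__ above."""
    return predictor.mon_sess.run(
        predictor._log_probs,
        feed_dict={predictor._inputs_var: predictor.src_sentence,   # source token IDs
                   predictor._targets_var: predictor.consumed})     # target prefix so far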
Example #9
    def model_fn(features, labels, mode, params):
        """Creates the prediction, loss, and train ops.

    Args:
      features: A dictionary of tensors keyed by the feature name.
      labels: A tensor representing the labels.
      mode: The execution mode, as defined in tf.estimator.ModeKeys.
      params: model HParams.

    Returns:
      An EstimatorSpec.
    """
        hparams = params
        # Deep-copy the model hparams between modes to eliminate
        # side-effects caused by abuse of the linked problem_hparams
        # objects which are used to share modality objects between
        # problems.  We do not want to share the modality objects between
        # modes, since the modality objects may decide to do something
        # mode-specific.  A better fix would be to stop abusing the
        # hparams in this way and instead use a separate dictionary to
        # share the modality objects between problems.  This dictionary
        # could be created once per mode and passed to the constructor of
        # t2t_model.
        my_hp = copy.deepcopy(hparams)

        def initializer():
            if hparams.initializer == "orthogonal":
                return tf.orthogonal_initializer(gain=hparams.initializer_gain)
            elif hparams.initializer == "uniform":
                max_val = 0.1 * hparams.initializer_gain
                return tf.random_uniform_initializer(-max_val, max_val)
            elif hparams.initializer == "normal_unit_scaling":
                return init_ops.variance_scaling_initializer(
                    hparams.initializer_gain,
                    mode="fan_avg",
                    distribution="normal")
            elif hparams.initializer == "uniform_unit_scaling":
                return init_ops.variance_scaling_initializer(
                    hparams.initializer_gain,
                    mode="fan_avg",
                    distribution="uniform")
            else:
                raise ValueError("Unrecognized initializer: %s" %
                                 hparams.initializer)

        def learning_rate_decay():
            """Inverse-decay learning rate until warmup_steps, then decay."""
            warmup_steps = tf.to_float(hparams.learning_rate_warmup_steps *
                                       FLAGS.worker_replicas)
            step = tf.to_float(tf.contrib.framework.get_global_step())
            if hparams.learning_rate_decay_scheme == "noam":
                return 5000.0 * hparams.hidden_size**-0.5 * tf.minimum(
                    (step + 1) * warmup_steps**-1.5, (step + 1)**-0.5)
            elif hparams.learning_rate_decay_scheme == "exp100k":
                return 0.94**(step // 100000)
            elif hparams.learning_rate_decay_scheme == "cosine":
                cycle_steps = hparams.learning_rate_cosine_cycle_steps
                return 0.5 * (1 + tf.cos(np.pi *
                                         (step % cycle_steps) / cycle_steps))
            elif hparams.learning_rate_decay_scheme == "cyclelinear10x":
                # Cycle the rate linearly by 10x every warmup_steps, up and down.
                cycle_steps = hparams.learning_rate_warmup_steps
                cycle_position = step % (2 * cycle_steps)
                cycle_position = tf.to_float(  # Normalize to the interval [-1, 1].
                    cycle_position - cycle_steps) / float(cycle_steps)
                cycle_position = 1.0 - tf.abs(
                    cycle_position)  # 0 to 1 and back to 0.
                return (cycle_position +
                        0.1) * 3.0  # 10x difference each cycle (0.3-3).

            inv_base = tf.exp(tf.log(0.01) / warmup_steps)
            inv_decay = inv_base**(warmup_steps - step)
            if hparams.learning_rate_decay_scheme == "sqrt":
                decay = _sqrt_decay(step - warmup_steps)
            elif hparams.learning_rate_decay_scheme == "exp10k":
                decay = _exp_decay_after(
                    step - warmup_steps, 0.9995,
                    FLAGS.train_steps - warmup_steps - 10000)
            elif hparams.learning_rate_decay_scheme == "exp50k":
                decay = _exp_decay_after(
                    step - warmup_steps, 0.99995,
                    FLAGS.train_steps - warmup_steps - 50000)
            elif hparams.learning_rate_decay_scheme == "exp500k":
                decay = _exp_decay_after(
                    step - warmup_steps, 0.9999955,
                    FLAGS.train_steps - warmup_steps - 500000)
            elif hparams.learning_rate_decay_scheme == "none":
                decay = tf.constant(1.0)
            else:
                raise ValueError(
                    "Unrecognized learning rate decay scheme: %s" %
                    hparams.learning_rate_decay_scheme)
            return tf.cond(step < warmup_steps,
                           lambda: inv_decay,
                           lambda: decay,
                           name="learning_rate_decay_warump_cond")

        if labels is not None:
            features["targets"] = labels

        dp = devices.data_parallelism()

        tf.get_variable_scope().set_initializer(initializer())
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Add input statistics for incoming features.
        with tf.name_scope("input_stats"):
            for (k, v) in six.iteritems(features):
                if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                    tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                    tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                    nonpadding = tf.to_float(tf.not_equal(v, 0))
                    nonpadding_tokens = tf.reduce_sum(nonpadding)
                    if k == "targets":
                        targets_nonpadding_tokens = nonpadding_tokens
                    tf.summary.scalar("%s_nonpadding_tokens" % k,
                                      nonpadding_tokens)
                    tf.summary.scalar("%s_nonpadding_fraction" % k,
                                      tf.reduce_mean(nonpadding))

            if is_training:
                # The new data reader occasionally emits very small batches, which
                # cause the examples in those batches to be grossly overweighted.
                # We decrease the loss proportionally to the ratio of the size of this
                # batch to the size of the largest training batch ever.
                # TODO(noam): to be more sophisticated, we could keep separate
                # maxima based on problem choice.
                max_nonpadding_var = tf.get_variable(
                    "max_nonpadding",
                    shape=[],
                    initializer=tf.ones_initializer(),
                    trainable=False)
                max_nonpadding = tf.maximum(max_nonpadding_var,
                                            targets_nonpadding_tokens)
                with tf.control_dependencies(
                    [tf.assign(max_nonpadding_var, max_nonpadding)]):
                    small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
                tf.summary.scalar("small_batch_multiplier",
                                  small_batch_multiplier)

        # Get multi-problem logits and loss based on features["problem_choice"].
        loss_variable_names = []

        def nth_model(n):
            """Build the model for the n-th problem, plus some added variables."""
            model_class = registry.model(model)(
                my_hp, mode, my_hp.problems[n], n, dp,
                devices.ps_devices(all_workers=True))
            if mode == tf.estimator.ModeKeys.PREDICT:
                return model_class.infer(
                    features,
                    beam_size=FLAGS.decode_beam_size,
                    top_beams=(FLAGS.decode_beam_size
                               if FLAGS.decode_return_beams else 1),
                    last_position_only=FLAGS.decode_use_last_position_only,
                    alpha=FLAGS.decode_alpha,
                    decode_length=FLAGS.decode_extra_length)
            # In distributed mode, we build graph for problem=0 and problem=worker_id.
            skipping_is_on = my_hp.problem_choice == "distributed" and is_training
            problem_worker_id = FLAGS.worker_id % len(my_hp.problems)
            skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id
            # On worker 0 also build graph for problems <= 1.
            # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
            skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1)
            if (FLAGS.eval_run_autoregressive
                    and mode == tf.estimator.ModeKeys.EVAL):
                sharded_logits, losses_dict = model_class.eval_autoregressive(
                    features)
            else:
                sharded_logits, losses_dict = model_class.model_fn(
                    features, skip=(skipping_is_on and skip_this_one))
            with tf.variable_scope("losses_avg"):
                total_loss, ops = 0.0, []
                for loss_key, loss_value in six.iteritems(losses_dict):
                    loss_name = "problem_%d/%s_loss" % (n, loss_key)
                    loss_moving_avg = tf.get_variable(loss_name,
                                                      initializer=100.0,
                                                      trainable=False)
                    loss_variable_names.append(loss_name)
                    ops.append(
                        loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                               loss_value * 0.1))
                    total_loss += loss_value
                try:  # Total loss avg might be reused or not, we try both.
                    with tf.variable_scope(tf.get_variable_scope(),
                                           reuse=True):
                        # Total loss was already constructed on input.
                        loss_moving_avg = tf.get_variable(
                            "problem_%d/total_loss" % n)
                except ValueError:
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                      n,
                                                      initializer=100.0,
                                                      trainable=False)
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                           total_loss * 0.1))
            with tf.variable_scope(
                    "train_stats"):  # Count steps for this problem.
                problem_steps = tf.get_variable("problem_%d_steps" % n,
                                                initializer=0,
                                                trainable=False)
                ops.append(problem_steps.assign_add(1))
            with tf.control_dependencies(ops):  # Make sure the ops run.
                # Ensure the loss is a scalar here.
                total_loss = tf.reshape(total_loss, [],
                                        name="total_loss_control_id")
            return [total_loss
                    ] + sharded_logits  # Need to flatten for cond later.

        result_list = input_fn_builder.cond_on_index(
            nth_model, features["problem_choice"], 0,
            len(my_hp.problems) - 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # Beam search in sequence models returns both decodes with the key
            # "outputs" and scores with the key "scores". If the return value is
            # a dict, we expect it to have the key "outputs", a tensor of int32,
            # and "scores", a tensor of floats. This is useful if we want to
            # return scores from estimator.predict.
            if not isinstance(result_list, dict):
                predictions = {"outputs": result_list}
            else:
                predictions = {
                    "outputs": result_list["outputs"],
                    "scores": result_list["scores"]
                }

            if "inputs" in features:
                predictions["inputs"] = features["inputs"]
            if "infer_targets" in features:
                predictions["targets"] = features["infer_targets"]
            predictions["problem_choice"] = (
                features["problem_choice"] * tf.ones(
                    (tf.shape(features["inputs"])[0], ), dtype=tf.int32))

            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        sharded_logits, total_loss = result_list[1:], result_list[0]
        if mode == tf.estimator.ModeKeys.EVAL:
            # For evaluation, return the logits layer as our predictions.
            logits = tf.concat(sharded_logits, 0)

            eval_metrics_fns = metrics.create_evaluation_metrics(
                zip(FLAGS.problems.split("-"), hparams.problem_instances),
                hparams)
            _check_autotune_metrics(eval_metrics_fns)

            eval_metrics = {}
            for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
                eval_metrics[metric_name] = metric_fn(
                    logits, labels, features["problem_choice"])

            return tf.estimator.EstimatorSpec(
                mode,
                predictions={"predictions": logits},
                eval_metric_ops=eval_metrics,
                loss=total_loss)

        assert mode == tf.estimator.ModeKeys.TRAIN

        # Some training statistics.
        with tf.name_scope("training_stats"):
            learning_rate = my_hp.learning_rate * learning_rate_decay()
            learning_rate /= math.sqrt(float(FLAGS.worker_replicas))
            tf.summary.scalar("learning_rate", learning_rate)
            global_step = tf.to_float(tf.contrib.framework.get_global_step())
            for n in xrange(len(my_hp.problems)):
                names_and_vars = []
                with tf.variable_scope("losses_avg", reuse=True):
                    total_loss_var = tf.get_variable("problem_%d/total_loss" %
                                                     n)
                    names_and_vars.append(("total_loss", total_loss_var))
                with tf.variable_scope("losses_avg", reuse=True):
                    for loss_name in loss_variable_names:
                        if loss_name.startswith("problem_%d/" % n):
                            loss_var = tf.get_variable(loss_name)
                            loss_suffix = loss_name[loss_name.index("/") + 1:]
                            names_and_vars.append((loss_suffix, loss_var))
                for (loss_name, loss_var) in names_and_vars:
                    tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name),
                                      loss_var)
                with tf.variable_scope("train_stats", reuse=True):
                    nth_steps = tf.get_variable("problem_%d_steps" % n,
                                                dtype=tf.int32)
                tf.summary.scalar("problem_%d_frequency" % n,
                                  tf.to_float(nth_steps) / (global_step + 1.0))

        # Log trainable weights and add decay.
        total_size, weight_decay_loss = 0, 0.0
        all_weights = {v.name: v for v in tf.trainable_variables()}
        for v_name in sorted(list(all_weights)):
            v = all_weights[v_name]
            v_size = int(np.prod(np.array(v.shape.as_list())))
            total_size += v_size
            if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
                # Add weight regularization if set and the weight is not a bias (dim>1).
                with tf.device(v._ref().device):  # pylint: disable=protected-access
                    v_loss = tf.nn.l2_loss(v) / v_size
                weight_decay_loss += v_loss
            is_body = len(v_name) > 5 and v_name[:5] == "body/"
            if my_hp.weight_noise > 0.0 and is_body:
                # Add weight noise if set in my_hp.
                with tf.device(v._ref().device):  # pylint: disable=protected-access
                    scale = learning_rate * 0.001
                    noise = tf.truncated_normal(
                        v.shape) * my_hp.weight_noise * scale
                    noise_op = v.assign_add(noise)
                with tf.control_dependencies([noise_op]):
                    total_loss = tf.identity(total_loss)
        if my_hp.weight_decay > 0.0:
            total_loss += weight_decay_loss * my_hp.weight_decay
        if is_training:
            total_loss *= small_batch_multiplier
        total_loss = tf.identity(total_loss, name="total_loss")
        log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
        diet_vars = [
            v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
        ]
        log_variable_sizes(diet_vars, "Diet Varaibles")
        # Define the train_op for the TRAIN mode.
        opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp)
        tf.logging.info("Computing gradients for global model_fn.")
        opt_summaries = ["learning_rate", "loss"]
        if hparams.summarize_grads:
            opt_summaries.extend(["gradients", "gradient_norm"])
        train_op = tf.contrib.layers.optimize_loss(
            name="training",
            loss=total_loss,
            global_step=tf.train.get_global_step(),
            learning_rate=learning_rate,
            clip_gradients=my_hp.clip_grad_norm or None,
            gradient_noise_scale=hparams.grad_noise_scale or None,
            optimizer=opt,
            summaries=opt_summaries,
            colocate_gradients_with_ops=True)

        # Remove summaries that will fail to run because they are in conditionals.
        # TODO(cwhipkey): Test with this code removed, later in 2017.
        summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
        for i in range(len(summaries) - 1, -1, -1):
            if summaries[i].name.startswith("cond_"):
                del summaries[i]

        tf.logging.info("Global model_fn finished.")
        return tf.estimator.EstimatorSpec(
            mode,
            predictions={"problem_choice": features["problem_choice"]},
            loss=total_loss,
            train_op=train_op)
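The "noam" branch of learning_rate_decay above is essentially the Transformer ("Attention Is All You Need") schedule: warmup followed by inverse-square-root decay. A plain-Python restatement for intuition; the hidden_size and warmup_steps defaults are illustrative, not taken from the example:

def noam_learning_rate_factor(step, hidden_size=512, warmup_steps=4000.0):
    # Mirrors: 5000.0 * hidden_size**-0.5 * min((step + 1) * warmup_steps**-1.5,
    #                                           (step + 1)**-0.5)
    step = float(step)
    return 5000.0 * hidden_size ** -0.5 * min(
        (step + 1) * warmup_steps ** -1.5, (step + 1) ** -0.5)

# The factor rises during warmup, then decays roughly as 1/sqrt(step); in the
# example above it is multiplied by hparams.learning_rate in training_stats.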
Example #10
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
    """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec
  """
    assert len(problem_names) == len(hparams.problem_instances)
    decode_hp = decode_hparams

    # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
    dp = devices.data_parallelism()

    tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    with tf.name_scope("input_stats"):
        for (k, v) in six.iteritems(features):
            if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                nonpadding = tf.to_float(tf.not_equal(v, 0))
                nonpadding_tokens = tf.reduce_sum(nonpadding)
                if k == "targets":
                    targets_nonpadding_tokens = nonpadding_tokens
                tf.summary.scalar("%s_nonpadding_tokens" % k,
                                  nonpadding_tokens)
                tf.summary.scalar("%s_nonpadding_fraction" % k,
                                  tf.reduce_mean(nonpadding))

    # Get multi-problem logits and loss based on features["problem_choice"].
    loss_variable_names = []

    def nth_model(n):
        """Build the model for the n-th problem, plus some added variables."""
        model_class = registry.model(model)(
            hparams, mode, hparams.problems[n], n, dp,
            devices.ps_devices(all_workers=True))
        if mode == tf.estimator.ModeKeys.PREDICT:
            return model_class.infer(
                features,
                beam_size=decode_hp.beam_size,
                top_beams=(decode_hp.beam_size
                           if decode_hp.return_beams else 1),
                last_position_only=decode_hp.use_last_position_only,
                alpha=decode_hp.alpha,
                decode_length=decode_hp.extra_length)
        # In distributed mode, we build graph for problem=0 and problem=worker_id.
        skipping_is_on = hparams.problem_choice == "distributed" and is_training
        problem_worker_id = worker_id % len(hparams.problems)
        skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
        # On worker 0 also build graph for problems <= 1.
        # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
        skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
        if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
            sharded_logits, losses_dict = model_class.eval_autoregressive(
                features)
        else:
            sharded_logits, losses_dict = model_class.model_fn(
                features, skip=(skipping_is_on and skip_this_one))
        with tf.variable_scope("losses_avg"):
            total_loss, ops = 0.0, []
            for loss_key, loss_value in six.iteritems(losses_dict):
                loss_name = "problem_%d/%s_loss" % (n, loss_key)
                loss_moving_avg = tf.get_variable(loss_name,
                                                  initializer=100.0,
                                                  trainable=False)
                loss_variable_names.append(loss_name)
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                           loss_value * 0.1))
                total_loss += loss_value
            try:  # Total loss avg might be reused or not, we try both.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    # Total loss was already constructed on input.
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                      n)
            except ValueError:
                loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                                  initializer=100.0,
                                                  trainable=False)
            ops.append(
                loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                       total_loss * 0.1))
        with tf.variable_scope("train_stats"):  # Count steps for this problem.
            problem_steps = tf.get_variable("problem_%d_steps" % n,
                                            initializer=0,
                                            trainable=False)
            ops.append(problem_steps.assign_add(1))
        with tf.control_dependencies(ops):  # Make sure the ops run.
            # Ensure the loss is a scalar here.
            total_loss = tf.reshape(total_loss, [],
                                    name="total_loss_control_id")
        return [total_loss, tf.concat(sharded_logits, 0)]

    model_output = input_fn_builder.cond_on_index(
        nth_model,
        index_tensor=features["problem_choice"],
        max_idx=len(hparams.problems) - 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # If beam searching, model_output will be a dict with keys "outputs" and
        # "scores".
        if isinstance(model_output, dict):
            outputs = model_output["outputs"]
            scores = model_output["scores"]
        else:
            outputs = model_output
            scores = None

        batched_problem_choice = (features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0], ), dtype=tf.int32))
        predictions = {
            "outputs": outputs,
            "scores": scores,
            "inputs": features.get("inputs", None),
            "targets": features.get("infer_targets", None),
            "problem_choice": batched_problem_choice,
        }
        _del_dict_nones(predictions)

        export_out = {"outputs": predictions["outputs"]}
        if "scores" in predictions:
            export_out["scores"] = predictions["scores"]

        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                "output": tf.estimator.export.PredictOutput(export_out)
            })

    total_loss, logits = model_output

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_fns = metrics.create_evaluation_metrics(
            zip(problem_names, hparams.problem_instances), hparams)

        eval_metrics = {}
        for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
            eval_metrics[metric_name] = metric_fn(logits, features)

        return tf.estimator.EstimatorSpec(mode,
                                          predictions={"predictions": logits},
                                          eval_metric_ops=eval_metrics,
                                          loss=total_loss)

    assert mode == tf.estimator.ModeKeys.TRAIN

    # Set learning rate
    learning_rate = hparams.learning_rate * learning_rate_decay(
        hparams,
        num_worker_replicas=worker_replicas,
        num_train_steps=train_steps)
    learning_rate /= math.sqrt(float(worker_replicas))

    # Get global step
    global_step = tf.train.get_or_create_global_step()

    # Some training statistics.
    with tf.name_scope("training_stats"):
        tf.summary.scalar("learning_rate", learning_rate)
        for n in xrange(len(hparams.problems)):
            names_and_vars = []
            with tf.variable_scope("losses_avg", reuse=True):
                total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
                names_and_vars.append(("total_loss", total_loss_var))
            with tf.variable_scope("losses_avg", reuse=True):
                for loss_name in loss_variable_names:
                    if loss_name.startswith("problem_%d/" % n):
                        loss_var = tf.get_variable(loss_name)
                        loss_suffix = loss_name[loss_name.index("/") + 1:]
                        names_and_vars.append((loss_suffix, loss_var))
            for (loss_name, loss_var) in names_and_vars:
                tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
            with tf.variable_scope("train_stats", reuse=True):
                nth_steps = tf.get_variable("problem_%d_steps" % n,
                                            dtype=tf.int32)
            tf.summary.scalar(
                "problem_%d_frequency" % n,
                tf.to_float(nth_steps) / (tf.to_float(global_step) + 1.0))

    # Add weight decay and noise.
    total_size, weight_decay_loss = 0, 0.0
    all_weights = {v.name: v for v in tf.trainable_variables()}
    for v_name in sorted(list(all_weights)):
        v = all_weights[v_name]
        v_size = int(np.prod(np.array(v.shape.as_list())))
        total_size += v_size
        if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
            # Add weight regularization if set and the weight is not a bias (dim>1).
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                v_loss = tf.nn.l2_loss(v) / v_size
            weight_decay_loss += v_loss
        is_body = len(v_name) > 5 and v_name[:5] == "body/"
        if hparams.weight_noise > 0.0 and is_body:
            # Add weight noise if set in hparams.
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                scale = learning_rate * 0.001
                noise = tf.truncated_normal(
                    v.shape) * hparams.weight_noise * scale
                noise_op = v.assign_add(noise)
            with tf.control_dependencies([noise_op]):
                total_loss = tf.identity(total_loss)
    if hparams.weight_decay > 0.0:
        total_loss += weight_decay_loss * hparams.weight_decay

    # The new data reader occasionally emits very small batches, which
    # cause the examples in those batches to be grossly overweighted.
    # We decrease the loss proportionally to the ratio of the size of this
    # batch to the size of the largest training batch ever.
    # TODO(noam): to be more sophisticated, we could keep separate
    # maxima based on problem choice.
    max_nonpadding_var = tf.get_variable("max_nonpadding",
                                         shape=[],
                                         initializer=tf.ones_initializer(),
                                         trainable=False)
    max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
    with tf.control_dependencies(
        [tf.assign(max_nonpadding_var, max_nonpadding)]):
        small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
    tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
    total_loss *= small_batch_multiplier

    # Log variable sizes
    _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    _log_variable_sizes(diet_vars, "Diet Variables")

    # Optimize
    total_loss = tf.identity(total_loss, name="total_loss")
    opt = ConditionalOptimizer(hparams.optimizer, learning_rate, hparams)
    opt_summaries = ["learning_rate", "loss"]
    if hparams.summarize_grads:
        opt_summaries.extend(["gradients", "gradient_norm"])
    tf.logging.info("Computing gradients for global model_fn.")
    train_op = tf.contrib.layers.optimize_loss(
        name="training",
        loss=total_loss,
        global_step=global_step,
        learning_rate=learning_rate,
        clip_gradients=hparams.clip_grad_norm or None,
        gradient_noise_scale=hparams.grad_noise_scale or None,
        optimizer=opt,
        summaries=opt_summaries,
        colocate_gradients_with_ops=True)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    for i in reversed(range(len(summaries))):
        if summaries[i].name.startswith("cond_"):
            del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"problem_choice": features["problem_choice"]},
        loss=total_loss,
        train_op=train_op)
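The model_fn above takes extra builder arguments, so it cannot be handed to tf.estimator.Estimator directly. A minimal adapter sketch, assuming a registered model name and an illustrative problem name, and that hparams.problem_instances matches the problem list (decode_hparams would also be needed for PREDICT mode):

def estimator_model_fn(features, labels, mode, params):
    if labels is not None:
        features["targets"] = labels  # T2T-style input_fns keep targets in features
    return model_fn(
        model="transformer",                        # assumed registered model
        features=features,
        mode=mode,
        hparams=params,
        problem_names=["translate_ende_wmt32k"],    # illustrative problem name
        train_steps=100000,
        worker_id=0,
        worker_replicas=1)

estimator = tf.estimator.Estimator(
    model_fn=estimator_model_fn,
    model_dir="/tmp/t2t_model",                     # placeholder path
    params=hparams)                                 # assumed to exist already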
Example #11
  def model_fn(features, targets, mode):
    """Creates the prediction, loss, and train ops.

    Args:
      features: A dictionary of tensors keyed by the feature name.
      targets: A tensor representing the labels (targets).
      mode: The execution mode, as defined in tf.contrib.learn.ModeKeys.

    Returns:
      A tuple consisting of the prediction, loss, and train_op.
    """
    # Deep-copy the model hparams between modes to eliminate
    # side-effects caused by abuse of the linked problem_hparams
    # objects which are used to share modality objects between
    # problems.  We do not want to share the modality objects between
    # modes, since the modality objects may decide to do something
    # mode-specific.  A better fix would be to stop abusing the
    # hparams in this way and instead use a separate dictionary to
    # share the modality objects between problems.  This dictionary
    # could be created once per mode and passed to the constructor of
    # t2t_model.
    my_hp = copy.deepcopy(hparams)
    if mode == tf.contrib.learn.ModeKeys.INFER:
      if FLAGS.decode_interactive:
        features = _interactive_input_tensor_to_features_dict(features, my_hp)
      elif FLAGS.decode_from_file:
        features = _decode_input_tensor_to_features_dict(features, my_hp)
    # A dictionary containing:
    #  - problem_choice: A Tensor containing an integer indicating which problem
    #                    was selected for this run.
    #  - predictions: A Tensor containing the model's output predictions.
    run_info = dict()
    run_info["problem_choice"] = features["problem_choice"]

    if targets is not None:
      features["targets"] = targets

    dp = devices.data_parallelism()

    tf.get_variable_scope().set_initializer(initializer())
    is_training = mode == tf.contrib.learn.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    with tf.name_scope("input_stats"):
      for (k, v) in six.iteritems(features):
        if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
          tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
          tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
          nonpadding = tf.to_float(tf.not_equal(v, 0))
          nonpadding_tokens = tf.reduce_sum(nonpadding)
          if k == "targets":
            targets_nonpadding_tokens = nonpadding_tokens
          tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
          tf.summary.scalar("%s_nonpadding_fraction" % k,
                            tf.reduce_mean(nonpadding))

      # The new data reader occasionally emits very small batches, which
      # cause the examples in those batches to be grossly overweighted.
      # We decrease the loss proportionally to the ratio of the size of this
      # batch to the size of the largest training batch ever.
      # TODO(noam): to be more sophisticated, we could keep separate
      # maxima based on problem choice.
      max_nonpadding_var = tf.get_variable(
          "max_nonpadding", shape=[],
          initializer=tf.ones_initializer(), trainable=False)
      max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
      if is_training:
        with tf.control_dependencies(
            [tf.assign(max_nonpadding_var, max_nonpadding)]):
          small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
        tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)

    # Get multi-problem logits and loss based on features["problem_choice"].
    loss_variable_names = []
    def nth_model(n):
      """Build the model for the n-th problem, plus some added variables."""
      model_class = registry.model(model)(
          my_hp,
          mode,
          my_hp.problems[n],
          n,
          dp,
          devices.ps_devices(all_workers=True))
      if mode == tf.contrib.learn.ModeKeys.INFER:
        return model_class.infer(
            features,
            beam_size=FLAGS.decode_beam_size,
            top_beams=(FLAGS.decode_beam_size
                       if FLAGS.decode_return_beams else 1),
            last_position_only=FLAGS.decode_use_last_position_only,
            alpha=FLAGS.decode_alpha,
            decode_length=FLAGS.decode_extra_length)
      # In distributed mode, we build graph for problem=0 and problem=worker_id.
      skipping_is_on = my_hp.problem_choice == "distributed" and is_training
      problem_worker_id = FLAGS.worker_id % len(my_hp.problems)
      skip_this_one = n != 0 and n % FLAGS.worker_replicas != problem_worker_id
      # On worker 0 also build graph for problems <= 1.
      # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
      skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1)
      if (FLAGS.eval_run_autoregressive and
          mode == tf.contrib.learn.ModeKeys.EVAL):
        sharded_logits, losses_dict = model_class.eval_autoregressive(features)
      else:
        sharded_logits, losses_dict = model_class.model_fn(
            features, skip=(skipping_is_on and skip_this_one))
      with tf.variable_scope("losses_avg"):
        total_loss, ops = 0.0, []
        for loss_key, loss_value in six.iteritems(losses_dict):
          loss_name = "problem_%d/%s_loss" % (n, loss_key)
          loss_moving_avg = tf.get_variable(
              loss_name, initializer=100.0, trainable=False)
          loss_variable_names.append(loss_name)
          ops.append(
              loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
          total_loss += loss_value
        try:  # Total loss avg might be reused or not, we try both.
          with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            # Total loss was already constructed on input.
            loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
        except ValueError:
          loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                            initializer=100.0, trainable=False)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
      with tf.variable_scope("train_stats"):  # Count steps for this problem.
        problem_steps = tf.get_variable(
            "problem_%d_steps" % n, initializer=0, trainable=False)
        ops.append(problem_steps.assign_add(1))
      with tf.control_dependencies(ops):  # Make sure the ops run.
        # Ensure the loss is a scalar here.
        total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
      return [total_loss] + sharded_logits  # Need to flatten for cond later.

    result_list = input_fn_builder.cond_on_index(nth_model,
                                                 features["problem_choice"], 0,
                                                 len(my_hp.problems) - 1)

    if mode == tf.contrib.learn.ModeKeys.INFER:
      # Beam search in a sequence model returns both decodes, with the key
      # "outputs", and scores, with the key "scores". If the result is a dict,
      # we expect it to have the keys "outputs" (an int32 tensor) and "scores"
      # (a float tensor). This is useful if we want to return scores from
      # estimator.predict.
      if not isinstance(result_list, dict):
        ret = {"outputs": result_list}, None, None
      else:
        ret = {
            "outputs": result_list["outputs"],
            "scores": result_list["scores"]
        }, None, None
      if "inputs" in features:
        ret[0]["inputs"] = features["inputs"]
      if "infer_targets" in features:
        ret[0]["targets"] = features["infer_targets"]
      return ret

    sharded_logits, total_loss = result_list[1:], result_list[0]
    if mode == tf.contrib.learn.ModeKeys.EVAL:
      logits = tf.concat(sharded_logits, 0)
      # For evaluation, return the logits layer as our predictions.
      run_info["predictions"] = logits
      train_op = None
      return run_info, total_loss, None

    assert mode == tf.contrib.learn.ModeKeys.TRAIN

    # Some training statistics.
    with tf.name_scope("training_stats"):
      learning_rate = my_hp.learning_rate * learning_rate_decay()
      learning_rate /= math.sqrt(float(FLAGS.worker_replicas))
      tf.summary.scalar("learning_rate", learning_rate)
      global_step = tf.to_float(tf.contrib.framework.get_global_step())
      for n in xrange(len(my_hp.problems)):
        names_and_vars = []
        with tf.variable_scope("losses_avg", reuse=True):
          total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
          names_and_vars.append(("total_loss", total_loss_var))
        with tf.variable_scope("losses_avg", reuse=True):
          for loss_name in loss_variable_names:
            if loss_name.startswith("problem_%d/" % n):
              loss_var = tf.get_variable(loss_name)
              loss_suffix = loss_name[loss_name.index("/") + 1:]
              names_and_vars.append((loss_suffix, loss_var))
        for (loss_name, loss_var) in names_and_vars:
          tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
        with tf.variable_scope("train_stats", reuse=True):
          nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
        tf.summary.scalar("problem_%d_frequency" % n,
                          tf.to_float(nth_steps) / (global_step + 1.0))

    # Log trainable weights and add decay.
    total_size, weight_decay_loss = 0, 0.0
    all_weights = {v.name: v for v in tf.trainable_variables()}
    for v_name in sorted(list(all_weights)):
      v = all_weights[v_name]
      v_size = int(np.prod(np.array(v.shape.as_list())))
      total_size += v_size
      if my_hp.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
        # Add weight regularization if set and the weight is not a bias (dim>1).
        with tf.device(v._ref().device):  # pylint: disable=protected-access
          v_loss = tf.nn.l2_loss(v) / v_size
        weight_decay_loss += v_loss
      is_body = len(v_name) > 5 and v_name[:5] == "body/"
      if my_hp.weight_noise > 0.0 and is_body:
        # Add weight noise if set in my_hp.
        with tf.device(v._ref().device):  # pylint: disable=protected-access
          scale = learning_rate * 0.001
          noise = tf.truncated_normal(v.shape) * my_hp.weight_noise * scale
          noise_op = v.assign_add(noise)
        with tf.control_dependencies([noise_op]):
          total_loss = tf.identity(total_loss)
    if my_hp.weight_decay > 0.0:
      total_loss += weight_decay_loss * my_hp.weight_decay
    if is_training:
      total_loss *= small_batch_multiplier
    total_loss = tf.identity(total_loss, name="total_loss")
    log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    diet_vars = [v for v in tf.global_variables() if hasattr(v, "optimizer")]
    log_variable_sizes(diet_vars, "Diet Variables")
    # Define the train_op for the TRAIN mode.
    opt = _ConditionalOptimizer(my_hp.optimizer, learning_rate, my_hp)
    tf.logging.info("Computing gradients for global model_fn.")
    opt_summaries = ["learning_rate", "loss"]
    if hparams.summarize_grads:
      opt_summaries.extend(["gradients", "gradient_norm"])
    train_op = tf.contrib.layers.optimize_loss(
        name="training",
        loss=total_loss,
        global_step=tf.train.get_global_step(),
        learning_rate=learning_rate,
        clip_gradients=my_hp.clip_grad_norm or None,
        gradient_noise_scale=hparams.grad_noise_scale or None,
        optimizer=opt,
        summaries=opt_summaries,
        colocate_gradients_with_ops=True)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    for i in range(len(summaries) - 1, -1, -1):
      if summaries[i].name.startswith("cond_"):
        del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return run_info, total_loss, train_op
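Note: the "losses_avg" variables above keep an exponential moving average of each loss with decay 0.9. A minimal plain-Python sketch of that update (the values are illustrative, not from the source):

# Hedged sketch: the running-average update used for the "losses_avg" variables.
loss_moving_avg = 100.0  # same initializer as in the example above
for loss_value in [4.2, 3.9, 3.7]:
  loss_moving_avg = loss_moving_avg * 0.9 + loss_value * 0.1
print(loss_moving_avg)  # decays from 100.0 toward the recent losses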
Example No. 12
def decode_from_dataset(estimator,
                        problem_names,
                        decode_hp,
                        decode_to_file=None,
                        dataset_split=None):
  tf.logging.info("Performing local inference from dataset for %s.",
                  str(problem_names))
  hparams = estimator.params
  # We assume that worker_id corresponds to shard number.
  shard = decode_hp.shard_id if decode_hp.shards > 1 else None

  for problem_idx, problem_name in enumerate(problem_names):
    # Build the inference input function
    infer_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.PREDICT,
        hparams=hparams,
        data_dir=hparams.data_dir,
        num_datashards=devices.data_parallelism().n,
        fixed_problem=problem_idx,
        batch_size=decode_hp.batch_size,
        dataset_split=dataset_split,
        shard=shard)

    # Get the predictions as an iterable
    predictions = estimator.predict(infer_input_fn)

    # Prepare output file writers if decode_to_file passed
    if decode_to_file:
      if decode_hp.shards > 1:
        decode_filename = decode_to_file + ("%.2d" % decode_hp.shard_id)
      else:
        decode_filename = decode_to_file
      output_filepath = _decode_filename(decode_filename, problem_name,
                                         decode_hp)
      parts = output_filepath.split(".")
      parts[-1] = "targets"
      target_filepath = ".".join(parts)

      output_file = tf.gfile.Open(output_filepath, "w")
      target_file = tf.gfile.Open(target_filepath, "w")

    problem_hparams = hparams.problems[problem_idx]
    # Inputs vocabulary is set to targets if there are no inputs in the problem,
    # e.g., for language models where the inputs are just a prefix of targets.
    has_input = "inputs" in problem_hparams.vocabulary
    inputs_vocab_key = "inputs" if has_input else "targets"
    inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key]
    targets_vocab = problem_hparams.vocabulary["targets"]
    for num_predictions, prediction in enumerate(predictions, start=1):
      inputs = prediction["inputs"]
      targets = prediction["targets"]
      outputs = prediction["outputs"]

      # Log predictions
      decoded_outputs = []
      if decode_hp.return_beams:
        output_beams = np.split(outputs, decode_hp.beam_size, axis=0)
        for i, beam in enumerate(output_beams):
          tf.logging.info("BEAM %d:" % i)
          decoded = log_decode_results(
              inputs,
              beam,
              problem_name,
              num_predictions,
              inputs_vocab,
              targets_vocab,
              save_images=decode_hp.save_images,
              model_dir=estimator.model_dir,
              identity_output=decode_hp.identity_output,
              targets=targets)
          decoded_outputs.append(decoded)
      else:
        decoded = log_decode_results(
            inputs,
            outputs,
            problem_name,
            num_predictions,
            inputs_vocab,
            targets_vocab,
            save_images=decode_hp.save_images,
            model_dir=estimator.model_dir,
            identity_output=decode_hp.identity_output,
            targets=targets)
        decoded_outputs.append(decoded)

      # Write out predictions if decode_to_file passed
      if decode_to_file:
        for decoded_output, decoded_target in decoded_outputs:
          output_file.write(str(decoded_output) + decode_hp.delimiter)
          target_file.write(str(decoded_target) + decode_hp.delimiter)

      if (decode_hp.num_samples >= 0 and
          num_predictions >= decode_hp.num_samples):
        break

    if decode_to_file:
      output_file.close()
      target_file.close()

    tf.logging.info("Completed inference on %d samples." % num_predictions)  # pylint: disable=undefined-loop-variable
Example No. 13
def create_run_config(model_name,
                      master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      save_checkpoints_secs=None,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      optionally_use_dist_strat=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      random_seed=None,
                      sync=False,
                      tpu_infeed_sleep_secs=None,
                      use_tpu=False,
                      use_tpu_estimator=False,
                      xla_jit_level=tf.OptimizerOptions.OFF,
                      inter_op_parallelism_threads=0,
                      log_step_count_steps=100,
                      intra_op_parallelism_threads=0,
                      tpu_config_extra_kwargs=None,
                      cloud_tpu_name=""):
    """Create RunConfig, TPUConfig, and Parallelism object."""
    session_config = create_session_config(
        log_device_placement=log_device_placement,
        enable_graph_rewriter=enable_graph_rewriter,
        gpu_mem_fraction=gpu_mem_fraction,
        use_tpu=use_tpu,
        xla_jit_level=xla_jit_level,
        inter_op_parallelism_threads=inter_op_parallelism_threads,
        intra_op_parallelism_threads=intra_op_parallelism_threads)
    run_config_args = {
        "master": master,
        "evaluation_master": master,
        "model_dir": model_dir,
        "session_config": session_config,
        "save_summary_steps": 100,
        "save_checkpoints_steps": save_checkpoints_steps,
        "save_checkpoints_secs": save_checkpoints_secs,
        "keep_checkpoint_max": keep_checkpoint_max,
        "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
        "tf_random_seed": random_seed,
        "log_step_count_steps": log_step_count_steps
    }
    if save_checkpoints_secs:
        del run_config_args["save_checkpoints_steps"]
    run_config_cls = tf.contrib.learn.RunConfig

    if use_tpu or use_tpu_estimator:
        # If using TPUEstimator, use TPU RunConfig, add TPUConfig, and add
        # additional args.
        tpu_config_kwargs = {
            "iterations_per_loop": iterations_per_loop,
            "num_shards": num_shards,
            "per_host_input_for_training": True,
            "initial_infeed_sleep_secs": tpu_infeed_sleep_secs,
        }
        if tpu_config_extra_kwargs is not None:
            tpu_config_kwargs.update(tpu_config_extra_kwargs)
        run_config_cls = tf.contrib.tpu.RunConfig
        tpu_config = tf.contrib.tpu.TPUConfig(**tpu_config_kwargs)
        run_config_args["tpu_config"] = tpu_config
        if not master and "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS" in os.environ:
            # If running on TPU but no master is set and the KUBE env var is present
            # then we're running on ML Engine. Set the master.
            run_config_args["master"] = os.environ[
                "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"]
            run_config_args["evaluation_master"] = run_config_args["master"]
        elif not master and cloud_tpu_name:
            # Update run_config to use cluster instead of master/evaluation_master
            # as we need the cluster spec to use Cloud Pods
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                cloud_tpu_name)
            run_config_args["cluster"] = tpu_cluster_resolver
            del run_config_args["master"]
            del run_config_args["evaluation_master"]
    elif is_cloud_async_distributed():
        run_config_cls = tf.estimator.RunConfig
        del run_config_args["master"]
        del run_config_args["evaluation_master"]

    config = run_config_cls(**run_config_args)

    # If not using TPU, add device info for data_parallelism
    config.use_tpu = use_tpu
    if not use_tpu:
        config.t2t_device_info = {
            "num_async_replicas": num_async_replicas,
        }
        use_distribution_strategy = (
            optionally_use_dist_strat
            and t2t_model.T2TModel.has_symmetric_shards(model_name)
            and not no_data_parallelism and ps_replicas == 0 and ps_gpu == 0
            and num_async_replicas == 1)

        if use_distribution_strategy:
            tf.logging.info(
                "Configuring MirroredStrategy DistributionStrategy to replicate the "
                "model.")
            distribution = tf.contrib.distribute.MirroredStrategy()
            config = config.replace(train_distribute=distribution)
            config.data_parallelism = None
        else:
            tf.logging.info(
                "Configuring DataParallelism to replicate the model.")
            config.data_parallelism = devices.data_parallelism(
                daisy_chain_variables=daisy_chain_variables,
                ps_replicas=ps_replicas,
                ps_job=ps_job,
                ps_gpu=ps_gpu,
                schedule=schedule,
                sync=sync,
                worker_gpu=num_gpus,
                worker_replicas=num_async_replicas,
                worker_id=worker_id,
                gpu_order=gpu_order,
                worker_job=worker_job,
                no_data_parallelism=no_data_parallelism)

    return config
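A minimal usage sketch for this function (assumptions: a registered model name "transformer", a local model_dir, and a model_fn built elsewhere; none of these names come from the source):

# Hedged sketch: building a RunConfig for a single-GPU local run and handing
# it to an Estimator. "transformer", the directory, and my_model_fn are
# illustrative placeholders.
run_config = create_run_config(
    "transformer",
    model_dir="/tmp/t2t_train",
    num_gpus=1,
    save_checkpoints_steps=2000)
estimator = tf.estimator.Estimator(
    model_fn=my_model_fn,  # e.g. produced by model_builder.build_model_fn
    model_dir="/tmp/t2t_train",
    config=run_config)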
Example No. 14
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
  """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec
  """
  assert len(problem_names) == len(hparams.problem_instances)
  decode_hp = decode_hparams

  # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
  dp = devices.data_parallelism(hparams)

  tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
  is_training = mode == tf.estimator.ModeKeys.TRAIN

  # Add input statistics for incoming features.
  with tf.name_scope("input_stats"):
    for (k, v) in six.iteritems(features):
      if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
        tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
        tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
        nonpadding = tf.to_float(tf.not_equal(v, 0))
        nonpadding_tokens = tf.reduce_sum(nonpadding)
        if k == "targets":
          targets_nonpadding_tokens = nonpadding_tokens
        tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens)
        tf.summary.scalar("%s_nonpadding_fraction" % k,
                          tf.reduce_mean(nonpadding))

  # Get multi-problem logits and loss based on features["problem_choice"].
  loss_variable_names = []

  def nth_model(n):
    """Build the model for the n-th problem, plus some added variables."""
    model_class = registry.model(model)(
        hparams,
        mode,
        hparams.problems[n],
        n,
        dp,
        devices.ps_devices(all_workers=True),
        decode_hparams=decode_hparams)
    if mode == tf.estimator.ModeKeys.PREDICT:
      return model_class.infer(
          features,
          beam_size=decode_hp.beam_size,
          top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
          alpha=decode_hp.alpha,
          decode_length=decode_hp.extra_length)
    # In distributed mode, we build graph for problem=0 and problem=worker_id.
    skipping_is_on = hparams.problem_choice == "distributed" and is_training
    problem_worker_id = worker_id % len(hparams.problems)
    skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
    # On worker 0 also build graph for problems <= 1.
    # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
    skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
    if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:
      logits, losses_dict = model_class.eval_autoregressive(features)
    else:
      logits, losses_dict = model_class(
          features, skip=(skipping_is_on and skip_this_one))
    with tf.variable_scope("losses_avg"):
      total_loss, ops = 0.0, []
      for loss_key, loss_value in six.iteritems(losses_dict):
        loss_name = "problem_%d/%s_loss" % (n, loss_key)
        loss_moving_avg = tf.get_variable(
            loss_name, initializer=100.0, trainable=False)
        loss_variable_names.append(loss_name)
        ops.append(
            loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1))
        total_loss += loss_value
      try:  # Total loss avg might be reused or not, we try both.
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
          # Total loss was already constructed on input.
          loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n)
      except ValueError:
        loss_moving_avg = tf.get_variable(
            "problem_%d/total_loss" % n, initializer=100.0, trainable=False)
      ops.append(
          loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1))
    with tf.variable_scope("train_stats"):  # Count steps for this problem.
      problem_steps = tf.get_variable(
          "problem_%d_steps" % n, initializer=0, trainable=False)
      ops.append(problem_steps.assign_add(1))
    with tf.control_dependencies(ops):  # Make sure the ops run.
      # Ensure the loss is a scalar here.
      total_loss = tf.reshape(total_loss, [], name="total_loss_control_id")
    return [total_loss, logits]

  model_output = input_fn_builder.cond_on_index(
      nth_model,
      index_tensor=features["problem_choice"],
      max_idx=len(hparams.problems) - 1)

  if mode == tf.estimator.ModeKeys.PREDICT:
    # If beam searching, model_output will be a dict with keys "outputs" and
    # "scores".
    if isinstance(model_output, dict):
      outputs = model_output["outputs"]
      scores = model_output["scores"]
    else:
      outputs = model_output
      scores = None

    batched_problem_choice = (
        features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0],), dtype=tf.int32))
    predictions = {
        "outputs": outputs,
        "scores": scores,
        "inputs": features.get("inputs", None),
        "targets": features.get("infer_targets", None),
        "problem_choice": batched_problem_choice,
    }
    _del_dict_nones(predictions)

    export_out = {"outputs": predictions["outputs"]}
    if "scores" in predictions:
      export_out["scores"] = predictions["scores"]

    return tf.estimator.EstimatorSpec(
        mode,
        predictions=predictions,
        export_outputs={
            "output": tf.estimator.export.PredictOutput(export_out)
        })

  total_loss, logits = model_output

  if mode == tf.estimator.ModeKeys.EVAL:
    eval_metrics_fns = metrics.create_evaluation_metrics(
        hparams.problem_instances, hparams)

    eval_metrics = {}
    for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
      eval_metrics[metric_name] = metric_fn(logits, features)

    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"predictions": logits},
        eval_metric_ops=eval_metrics,
        loss=total_loss)

  assert mode == tf.estimator.ModeKeys.TRAIN

  # Set learning rate
  learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
      hparams, num_worker_replicas=worker_replicas, num_train_steps=train_steps)
  learning_rate /= math.sqrt(float(worker_replicas))

  # Get global step
  global_step = tf.train.get_or_create_global_step()

  # Some training statistics.
  with tf.name_scope("training_stats"):
    tf.summary.scalar("learning_rate", learning_rate)
    for n in xrange(len(hparams.problems)):
      names_and_vars = []
      with tf.variable_scope("losses_avg", reuse=True):
        total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
        names_and_vars.append(("total_loss", total_loss_var))
      with tf.variable_scope("losses_avg", reuse=True):
        for loss_name in loss_variable_names:
          if loss_name.startswith("problem_%d/" % n):
            loss_var = tf.get_variable(loss_name)
            loss_suffix = loss_name[loss_name.index("/") + 1:]
            names_and_vars.append((loss_suffix, loss_var))
      for (loss_name, loss_var) in names_and_vars:
        tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
      with tf.variable_scope("train_stats", reuse=True):
        nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32)
      tf.summary.scalar("problem_%d_frequency" % n,
                        tf.to_float(nth_steps) /
                        (tf.to_float(global_step) + 1.0))

  # Add weight decay and noise.
  total_size, weight_decay_loss = 0, 0.0
  all_weights = {v.name: v for v in tf.trainable_variables()}
  for v_name in sorted(list(all_weights)):
    v = all_weights[v_name]
    v_size = int(np.prod(np.array(v.shape.as_list())))
    total_size += v_size
    if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
      # Add weight regularization if set and the weight is not a bias (dim>1).
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        v_loss = tf.nn.l2_loss(v) / v_size
      weight_decay_loss += v_loss
    is_body = len(v_name) > 5 and v_name[:5] == "body/"
    if hparams.weight_noise > 0.0 and is_body:
      # Add weight noise if set in hparams.
      with tf.device(v._ref().device):  # pylint: disable=protected-access
        scale = learning_rate * 0.001
        noise = tf.truncated_normal(v.shape) * hparams.weight_noise * scale
        noise_op = v.assign_add(noise)
      with tf.control_dependencies([noise_op]):
        total_loss = tf.identity(total_loss)
  if hparams.weight_decay > 0.0:
    total_loss += weight_decay_loss * hparams.weight_decay

  # The new data reader occasionally emits very small batches, which
  # cause the examples in those batches to be grossly overweighted.
  # We decrease the loss proportionally to the ratio of the size of this
  # batch to the size of the largest training batch ever.
  # TODO(noam): to be more sophisticated, we could keep separate
  # maxima based on problem choice.
  max_nonpadding_var = tf.get_variable(
      "max_nonpadding",
      shape=[],
      initializer=tf.ones_initializer(),
      trainable=False)
  max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
  with tf.control_dependencies([tf.assign(max_nonpadding_var, max_nonpadding)]):
    small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
  tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
  total_loss *= small_batch_multiplier

  # Log variable sizes
  _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
  diet_vars = [
      v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
  ]
  _log_variable_sizes(diet_vars, "Diet Variables")

  # Optimize
  train_op = optimize.optimize(total_loss, learning_rate, hparams)

  # Remove summaries that will fail to run because they are in conditionals.
  # TODO(cwhipkey): Test with this code removed, later in 2017.
  summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
  for i in reversed(range(len(summaries))):
    if summaries[i].name.startswith("cond_"):
      del summaries[i]

  tf.logging.info("Global model_fn finished.")
  return tf.estimator.EstimatorSpec(
      mode,
      predictions={"problem_choice": features["problem_choice"]},
      loss=total_loss,
      train_op=train_op)
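The multi-problem dispatch above hinges on input_fn_builder.cond_on_index, which builds nth_model(n) for every problem and selects one result at run time from the problem_choice tensor. One plausible shape of such a helper, sketched with nested tf.cond (an illustration only, not the actual tensor2tensor implementation; it mirrors the four-argument call used in the earlier example, with the keyword-only variant presumably starting at index 0):

# Hedged sketch: selecting among per-problem subgraphs by an index tensor.
import tensorflow as tf

def cond_on_index_sketch(fn, index_tensor, cur_idx, max_idx):
  """Return fn(i) for the runtime value i of index_tensor, cur_idx <= i <= max_idx."""
  if cur_idx == max_idx:
    return fn(cur_idx)
  return tf.cond(
      tf.equal(index_tensor, cur_idx),
      lambda: fn(cur_idx),
      lambda: cond_on_index_sketch(fn, index_tensor, cur_idx + 1, max_idx))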
Example No. 15
def decode_from_dataset(estimator):
    hparams = estimator.hparams
    for i, problem in enumerate(FLAGS.problems.split("-")):
        inputs_vocab = hparams.problems[i].vocabulary.get("inputs", None)
        targets_vocab = hparams.problems[i].vocabulary["targets"]
        tf.logging.info("Performing local inference.")
        infer_problems_data = data_reader.get_data_filepatterns(
            FLAGS.problems, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER)

        infer_input_fn = input_fn_builder.build_input_fn(
            mode=tf.contrib.learn.ModeKeys.INFER,
            hparams=hparams,
            data_file_patterns=infer_problems_data,
            num_datashards=devices.data_parallelism().n,
            fixed_problem=i)

        def log_fn(inputs,
                   targets,
                   outputs,
                   problem,
                   j,
                   inputs_vocab=inputs_vocab,
                   targets_vocab=targets_vocab):
            """Log inference results."""
            if "image" in problem and FLAGS.decode_save_images:
                save_path = os.path.join(estimator.model_dir,
                                         "%s_prediction_%d.jpg" % (problem, j))
                show_and_save_image(inputs / 255., save_path)
            elif inputs_vocab:
                decoded_inputs = inputs_vocab.decode(
                    _save_until_eos(inputs.flatten()))
                tf.logging.info("Inference results INPUT: %s" % decoded_inputs)

            if FLAGS.identity_output:
                decoded_outputs = " ".join(map(str, outputs.flatten()))
                decoded_targets = " ".join(map(str, targets.flatten()))
            else:
                decoded_outputs = targets_vocab.decode(
                    _save_until_eos(outputs.flatten()))
                decoded_targets = targets_vocab.decode(
                    _save_until_eos(targets.flatten()))

            tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs)
            tf.logging.info("Inference results TARGET: %s" % decoded_targets)
            if FLAGS.decode_to_file:
                output_filepath = FLAGS.decode_to_file + ".outputs." + problem
                output_file = tf.gfile.Open(output_filepath, "a")
                output_file.write(decoded_outputs + "\n")
                target_filepath = FLAGS.decode_to_file + ".targets." + problem
                target_file = tf.gfile.Open(target_filepath, "a")
                target_file.write(decoded_targets + "\n")

        result_iter = estimator.predict(input_fn=infer_input_fn,
                                        as_iterable=True)
        count = 0
        for result in result_iter:
            # Predictions from the test input; we use them to log inputs and decodes.
            inputs = result["inputs"]
            targets = result["targets"]
            outputs = result["outputs"]
            if FLAGS.decode_return_beams:
                output_beams = np.split(outputs,
                                        FLAGS.decode_beam_size,
                                        axis=0)
                for k, beam in enumerate(output_beams):
                    tf.logging.info("BEAM %d:" % k)
                    log_fn(inputs, targets, beam, problem, count)
            else:
                log_fn(inputs, targets, outputs, problem, count)

            count += 1
            if FLAGS.decode_num_samples != -1 and count >= FLAGS.decode_num_samples:
                break
        tf.logging.info("Completed inference on %d samples." % count)
Example No. 16
def model_fn(model,
             features,
             mode,
             hparams,
             problem_names,
             train_steps=100000,
             worker_id=0,
             worker_replicas=1,
             eval_run_autoregressive=False,
             decode_hparams=None):
    """Builds the model for all modes.

  * TRAIN: Constructs loss and train_op
  * EVAL: Constructs the loss and eval metrics
  * PREDICT: Constructs the predictions

  Args:
    model: str, name of model.
    features: dict<feature name, Tensor>. Expected to have keys
      {inputs, targets, problem_choice}.
    mode: tf.estimator.ModeKeys.
    hparams: model HParams.
    problem_names: list of str, names of the problems.
    train_steps: int, total number of training steps. Used to compute learning
      rate decay.
    worker_id: int, id of this worker.
    worker_replicas: int, number of workers.
    eval_run_autoregressive: bool, whether to run evaluation autoregressively.
    decode_hparams: HParams for decode settings. Used when mode == PREDICT.

  Returns:
    tf.estimator.EstimatorSpec
  """
    assert len(problem_names) == len(hparams.problem_instances)
    decode_hp = decode_hparams

    # TODO(rsepassi): This still depends on FLAGS. Rm eventually.
    dp = devices.data_parallelism()

    tf.get_variable_scope().set_initializer(_get_variable_initializer(hparams))
    # Set the variable initializer for the current variable scope.
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Add input statistics for incoming features.
    with tf.name_scope("input_stats"):
        for (k, v) in six.iteritems(features):
            if isinstance(v, tf.Tensor) and v.get_shape().ndims > 1:
                tf.summary.scalar("%s_batch" % k, tf.shape(v)[0] // dp.n)
                tf.summary.scalar("%s_length" % k, tf.shape(v)[1])
                nonpadding = tf.to_float(tf.not_equal(v, 0))
                nonpadding_tokens = tf.reduce_sum(
                    nonpadding)  # non-zero tokens
                if k == "targets":
                    targets_nonpadding_tokens = nonpadding_tokens
                tf.summary.scalar("%s_nonpadding_tokens" % k,
                                  nonpadding_tokens)
                tf.summary.scalar("%s_nonpadding_fraction" % k,
                                  tf.reduce_mean(nonpadding))

    # Get multi-problem logits and loss based on features["problem_choice"].
    loss_variable_names = []

    def nth_model(n):
        """Build the model for the n-th problem, plus some added variables."""
        model_class = registry.model(model)(
            hparams,
            mode,
            hparams.problems[n],
            n,
            dp,
            devices.ps_devices(all_workers=True),
            decode_hparams=decode_hparams
        )  # instantiate the registered model class with hparams and problem modalities
        if mode == tf.estimator.ModeKeys.PREDICT:
            return model_class.infer(features,
                                     beam_size=decode_hp.beam_size,
                                     top_beams=(decode_hp.beam_size if
                                                decode_hp.return_beams else 1),
                                     alpha=decode_hp.alpha,
                                     decode_length=decode_hp.extra_length)
        # In distributed mode, we build graph for problem=0 and problem=worker_id.
        skipping_is_on = hparams.problem_choice == "distributed" and is_training
        problem_worker_id = worker_id % len(hparams.problems)
        skip_this_one = n != 0 and n % worker_replicas != problem_worker_id
        # On worker 0 also build graph for problems <= 1.
        # TODO(lukaszkaiser): why is this hack needed for variables init? Repair.
        skip_this_one = skip_this_one and (worker_id != 0 or n > 1)
        mrt_samples = getattr(hparams, 'mrt_samples', None)
        if eval_run_autoregressive and mode == tf.estimator.ModeKeys.EVAL:  # evaluation mode
            sharded_logits, losses_dict = model_class.eval_autoregressive(
                features)
        else:  # training mode
            if hparams.rl:
                # Generate sample data; it will automatically be sharded. Samples shape: [batch, time, 1, 1].
                if model_class._num_datashards == 1:  # single GPU card: fast sampling
                    print("###Work on Single GPU card, Use Fast Decode.###")
                    train_beam = getattr(hparams, 'train_beam', None)
                    if mrt_samples:
                        samples, _ = model_class._fast_decode(
                            features,
                            decode_length=50,
                            beam_size=mrt_samples,
                            top_beams=mrt_samples)
                        inputs = tf.squeeze(tf.squeeze(features["inputs"],
                                                       axis=-1),
                                            axis=-1)
                        targets = tf.squeeze(tf.squeeze(features["targets"],
                                                        axis=-1),
                                             axis=-1)
                        batch_size = tf.shape(inputs)[0]
                        inputs_len = tf.shape(inputs)[1]
                        targets_len = tf.shape(targets)[1]
                        inputs_tile = tf.tile(inputs, [1, mrt_samples])
                        targets_tile = tf.tile(targets, [1, mrt_samples])
                        inputs_reshape = tf.reshape(
                            inputs_tile,
                            [batch_size * mrt_samples, inputs_len])
                        targets_reshape = tf.reshape(
                            targets_tile,
                            [batch_size * mrt_samples, targets_len])
                        inputs_feed = tf.expand_dims(tf.expand_dims(
                            inputs_reshape, axis=-1),
                                                     axis=-1)
                        targets_feed = tf.expand_dims(tf.expand_dims(
                            targets_reshape, axis=-1),
                                                      axis=-1)
                        features["inputs"] = inputs_feed
                        features["targets"] = targets_feed
                    elif train_beam and train_beam != 1:  # beam search with beam size hparams.train_beam; return the top sample
                        samples, _ = model_class._fast_decode(
                            features,
                            decode_length=50,
                            beam_size=hparams.train_beam)
                    else:
                        targets_beam = getattr(hparams, 'targets_beam', None)
                        if targets_beam:
                            targets_samples, _ = model_class._fast_decode(
                                features,
                                decode_length=50,
                                beam_size=4,
                                sampling_method='argmax')
                            targets_samples = tf.reshape(
                                targets_samples, [
                                    tf.shape(targets_samples)[0],
                                    tf.shape(targets_samples)[1], 1, 1
                                ])
                            features["targets"] = targets_samples
                        samples, _ = model_class._fast_decode(features,
                                                              decode_length=50)
                    samples = tf.expand_dims(samples, axis=-1)
                    samples = tf.expand_dims(
                        samples, axis=-1
                    )  # add two additional dimensions to make it compatible.
                else:  # multiple GPU cards: only slow sampling is supported
                    print("###Work on Multi GPU cards, Use Slow Decode.###")
                    samples, _, _ = model_class._slow_greedy_infer(
                        features,
                        decode_length=50)  # default decode_length = 50
                samples = tf.stop_gradient(samples)
                # Calculate the BLEU score using metric_fn.
                # train_metric_fn = "approx_bleu_train_score"
                train_metric_fn = metrics.METRICS_FNS[
                    metrics.Metrics.APPROX_BLEU_TRAIN]
                labels = features.get("targets", None)
                samples.set_shape([None, None, 1, 1])
                # hparams.delta_reward = True for delta reward; False for total reward
                metric_value = train_metric_fn(
                    samples, labels, delat_reward=hparams.delta_reward)
                metric_value = tf.stop_gradient(
                    metric_value)  # make sure no gradient flows through the reward
                metric_value.set_shape([None, None, 1, 1])
                """Accodring to the metrics.py: The tf.metrics.mean function assures correct aggregation."""
                # metric_value is total_reward: scalar
                features["samples"] = samples
                features["values"] = metric_value
                # del samples
                # del labels
            sharded_logits, losses_dict = model_class.model_fn(
                features,
                skip=(skipping_is_on and skip_this_one),
                mrt=mrt_samples)
            # if hparams.rl:
            #     training_loss = losses_dict["training"] * metric_value  # losses_dict["training"]: [batch, timesteps]
            #     training_loss_sum = tf.reduce_sum(training_loss)  # sum the training_loss
            #     losses_dict["training"] = training_loss_sum  # log_prob * r (current r is total_reward)
        with tf.variable_scope("losses_avg"):
            total_loss, ops = 0.0, []
            for loss_key, loss_value in six.iteritems(losses_dict):
                if hparams.rl:
                    baseline_loss_weight = getattr(hparams,
                                                   'baseline_loss_weight', 1.0)
                    training_loss_weight = getattr(hparams,
                                                   'training_loss_weight', 1.0)
                    mle_training_loss_weight = getattr(
                        hparams, 'mle_training_loss_weight', 0.3)
                    if loss_key == "training":
                        loss_value = loss_value * training_loss_weight
                    elif loss_key == "training_baseline":
                        loss_value = loss_value * baseline_loss_weight
                    elif loss_key == "mle_training":
                        loss_value = loss_value * mle_training_loss_weight
                loss_name = "problem_%d/%s_loss" % (n, loss_key)
                loss_moving_avg = tf.get_variable(loss_name,
                                                  initializer=100.0,
                                                  trainable=False)
                loss_variable_names.append(loss_name)
                ops.append(
                    loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                           loss_value * 0.1))
                total_loss += loss_value
            try:  # Total loss avg might be reused or not, we try both.
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    # Total loss was already constructed on input.
                    loss_moving_avg = tf.get_variable("problem_%d/total_loss" %
                                                      n)
            except ValueError:
                loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n,
                                                  initializer=100.0,
                                                  trainable=False)
            ops.append(
                loss_moving_avg.assign(loss_moving_avg * 0.9 +
                                       total_loss * 0.1))
        with tf.variable_scope("train_stats"):  # Count steps for this problem.
            problem_steps = tf.get_variable("problem_%d_steps" % n,
                                            initializer=0,
                                            trainable=False)
            ops.append(problem_steps.assign_add(1))
        with tf.control_dependencies(ops):  # Make sure the ops run.
            # Ensure the loss is a scalar here.
            total_loss = tf.reshape(total_loss, [],
                                    name="total_loss_control_id")
        return [total_loss, tf.concat(sharded_logits, 0)]

    model_output = input_fn_builder.cond_on_index(
        nth_model,
        index_tensor=features["problem_choice"],
        max_idx=len(hparams.problems) - 1)  # total_loss and sharded_logits

    if mode == tf.estimator.ModeKeys.PREDICT:
        # If beam searching, model_output will be a dict with keys "outputs" and
        # "scores".
        if isinstance(model_output, dict):  # beam search
            outputs = model_output["outputs"]
            scores = model_output["scores"]
        else:
            outputs = model_output
            scores = None

        batched_problem_choice = (features["problem_choice"] * tf.ones(
            (tf.shape(features["inputs"])[0], ), dtype=tf.int32))
        predictions = {
            "outputs": outputs,
            "scores": scores,
            "inputs": features.get("inputs", None),
            "targets": features.get("infer_targets", None),
            "problem_choice": batched_problem_choice,
        }
        _del_dict_nones(predictions)  # remove None-valued entries from predictions

        export_out = {"outputs": predictions["outputs"]}
        if "scores" in predictions:
            export_out["scores"] = predictions["scores"]

        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={
                "output": tf.estimator.export.PredictOutput(export_out)
            })

    total_loss, logits = model_output

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics_fns = metrics.create_evaluation_metrics(
            hparams.problem_instances, hparams)

        eval_metrics = {}
        for metric_name, metric_fn in six.iteritems(eval_metrics_fns):
            eval_metrics[metric_name] = metric_fn(logits, features)

        return tf.estimator.EstimatorSpec(mode,
                                          predictions={"predictions": logits},
                                          eval_metric_ops=eval_metrics,
                                          loss=total_loss)

    assert mode == tf.estimator.ModeKeys.TRAIN

    # Set learning rate
    learning_rate = hparams.learning_rate * optimize.learning_rate_decay(
        hparams,
        num_worker_replicas=worker_replicas,
        num_train_steps=train_steps)
    learning_rate /= math.sqrt(float(worker_replicas))

    # Get global step
    global_step = tf.train.get_or_create_global_step()

    # Some training statistics.
    with tf.name_scope("training_stats"):
        tf.summary.scalar("learning_rate", learning_rate)
        for n in xrange(len(hparams.problems)):
            names_and_vars = []
            with tf.variable_scope("losses_avg", reuse=True):
                total_loss_var = tf.get_variable("problem_%d/total_loss" % n)
                names_and_vars.append(("total_loss", total_loss_var))
            with tf.variable_scope("losses_avg", reuse=True):
                for loss_name in loss_variable_names:
                    if loss_name.startswith("problem_%d/" % n):
                        loss_var = tf.get_variable(loss_name)
                        loss_suffix = loss_name[loss_name.index("/") + 1:]
                        names_and_vars.append((loss_suffix, loss_var))
            for (loss_name, loss_var) in names_and_vars:
                tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var)
            with tf.variable_scope("train_stats", reuse=True):
                nth_steps = tf.get_variable("problem_%d_steps" % n,
                                            dtype=tf.int32)
            tf.summary.scalar(
                "problem_%d_frequency" % n,
                tf.to_float(nth_steps) / (tf.to_float(global_step) + 1.0))

    # Add weight decay and noise.
    total_size, weight_decay_loss = 0, 0.0
    all_weights = {v.name: v for v in tf.trainable_variables()}
    for v_name in sorted(list(all_weights)):
        v = all_weights[v_name]
        v_size = int(np.prod(np.array(v.shape.as_list())))
        total_size += v_size
        if hparams.weight_decay > 0.0 and len(v.shape.as_list()) > 1:
            # Add weight regularization if set and the weight is not a bias (dim>1).
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                v_loss = tf.nn.l2_loss(v) / v_size
            weight_decay_loss += v_loss
        is_body = len(v_name) > 5 and v_name[:5] == "body/"
        if hparams.weight_noise > 0.0 and is_body:
            # Add weight noise if set in hparams.
            with tf.device(v._ref().device):  # pylint: disable=protected-access
                scale = learning_rate * 0.001
                noise = tf.truncated_normal(
                    v.shape) * hparams.weight_noise * scale
                noise_op = v.assign_add(noise)
            with tf.control_dependencies([noise_op]):
                total_loss = tf.identity(total_loss)
    if hparams.weight_decay > 0.0:
        total_loss += weight_decay_loss * hparams.weight_decay

    # The new data reader occasionally emits very small batches, which
    # cause the examples in those batches to be grossly overweighted.
    # We decrease the loss proportionally to the ratio of the size of this
    # batch to the size of the largest training batch ever.
    # TODO(noam): to be more sophisticated, we could keep separate
    # maxima based on problem choice.
    max_nonpadding_var = tf.get_variable("max_nonpadding",
                                         shape=[],
                                         initializer=tf.ones_initializer(),
                                         trainable=False)
    max_nonpadding = tf.maximum(max_nonpadding_var, targets_nonpadding_tokens)
    with tf.control_dependencies(
        [tf.assign(max_nonpadding_var, max_nonpadding)]):
        small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
    tf.summary.scalar("small_batch_multiplier", small_batch_multiplier)
    total_loss *= small_batch_multiplier

    # Log variable sizes
    _log_variable_sizes(tf.trainable_variables(), "Trainable Variables")
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    _log_variable_sizes(diet_vars, "Diet Variables")

    # Optimize
    train_op = optimize.optimize(total_loss, learning_rate, hparams)

    # Remove summaries that will fail to run because they are in conditionals.
    # TODO(cwhipkey): Test with this code removed, later in 2017.
    summaries = tf.get_collection_ref(tf.GraphKeys.SUMMARIES)
    for i in reversed(range(len(summaries))):
        if summaries[i].name.startswith("cond_"):
            del summaries[i]

    tf.logging.info("Global model_fn finished.")
    return tf.estimator.EstimatorSpec(
        mode,
        predictions={"problem_choice": features["problem_choice"]},
        loss=total_loss,
        train_op=train_op)
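The small-batch correction in this example scales the loss by the ratio of the current batch's non-padding target tokens to the largest such count seen so far. A plain-Python sketch of that bookkeeping (the numbers are illustrative):

# Hedged sketch: the small-batch loss multiplier, outside the graph.
max_nonpadding = 1.0  # same initializer as the max_nonpadding variable
for targets_nonpadding_tokens, loss in [(4096.0, 3.2), (512.0, 3.5)]:
  max_nonpadding = max(max_nonpadding, targets_nonpadding_tokens)
  small_batch_multiplier = targets_nonpadding_tokens / max_nonpadding
  print(loss * small_batch_multiplier)  # the 512-token batch is down-weighted 8x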
Example No. 17
def create_run_config(master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      shard_to_cpu=False,
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      sync=False,
                      use_tpu=False):
  """Create RunConfig, TPUConfig, and Parallelism object."""
  session_config = create_session_config(
      log_device_placement=log_device_placement,
      enable_graph_rewriter=enable_graph_rewriter,
      gpu_mem_fraction=gpu_mem_fraction,
      use_tpu=use_tpu)
  run_config_args = {
      "master": master,
      "model_dir": model_dir,
      "session_config": session_config,
      "save_summary_steps": 0,
      "save_checkpoints_steps": save_checkpoints_steps,
      "keep_checkpoint_max": keep_checkpoint_max,
      "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
  }
  run_config_cls = tf.contrib.learn.RunConfig

  # If using TPU, use TPU RunConfig, add TPUConfig, and add additional args
  if use_tpu:
    run_config_cls = tf.contrib.tpu.RunConfig
    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=iterations_per_loop,
        num_shards=num_shards,
        per_host_input_for_training=(num_shards <= 8))
    run_config_args["tpu_config"] = tpu_config

  config = run_config_cls(**run_config_args)

  # If not using TPU, add device info for data_parallelism
  config.use_tpu = use_tpu
  if not use_tpu:
    config.t2t_device_info = {
        "num_async_replicas": num_async_replicas,
    }
    if no_data_parallelism:
      config.data_parallelism = expert_utils.Parallelism([""])
    else:
      config.data_parallelism = devices.data_parallelism(
          daisy_chain_variables=daisy_chain_variables,
          ps_replicas=ps_replicas,
          ps_job=ps_job,
          ps_gpu=ps_gpu,
          schedule=schedule,
          sync=sync,
          worker_gpu=num_gpus,
          worker_replicas=num_async_replicas,
          worker_id=worker_id,
          gpu_order=gpu_order,
          locally_shard_to_cpu=shard_to_cpu,
          worker_job=worker_job)

  return config
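A short usage sketch of this variant on a TPU (the master address and bucket path are placeholders, not taken from the source):

# Hedged sketch: building a TPU RunConfig with the function above.
run_config = create_run_config(
    master="grpc://10.0.0.2:8470",   # illustrative TPU master address
    model_dir="gs://my-bucket/t2t",  # illustrative GCS directory
    iterations_per_loop=100,
    num_shards=8,
    use_tpu=True)
# run_config.tpu_config now carries the TPUConfig assembled inside the function.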
Example No. 18
def create_run_config(model_name,
                      master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      save_checkpoints_secs=None,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      optionally_use_dist_strat=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      random_seed=None,
                      sync=False,
                      tpu_infeed_sleep_secs=None,
                      use_tpu=False,
                      use_tpu_estimator=False,
                      xla_jit_level=tf.OptimizerOptions.OFF,
                      inter_op_parallelism_threads=0,
                      log_step_count_steps=100,
                      intra_op_parallelism_threads=0,
                      tpu_config_extra_kwargs=None,
                      cloud_tpu_name=""):
  """Create RunConfig, TPUConfig, and Parallelism object."""
  session_config = create_session_config(
      log_device_placement=log_device_placement,
      enable_graph_rewriter=enable_graph_rewriter,
      gpu_mem_fraction=gpu_mem_fraction,
      use_tpu=use_tpu,
      xla_jit_level=xla_jit_level,
      inter_op_parallelism_threads=inter_op_parallelism_threads,
      intra_op_parallelism_threads=intra_op_parallelism_threads)
  run_config_args = {
      "master": master,
      "evaluation_master": master,
      "model_dir": model_dir,
      "session_config": session_config,
      "save_summary_steps": 100,
      "save_checkpoints_steps": save_checkpoints_steps,
      "save_checkpoints_secs": save_checkpoints_secs,
      "keep_checkpoint_max": keep_checkpoint_max,
      "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
      "tf_random_seed": random_seed,
      "log_step_count_steps": log_step_count_steps
  }
  if save_checkpoints_secs:
    del run_config_args["save_checkpoints_steps"]
  run_config_cls = tf.contrib.learn.RunConfig

  if use_tpu or use_tpu_estimator:
    # If using TPUEstimator, use TPU RunConfig, add TPUConfig, and add
    # additional args.
    tpu_config_kwargs = {
        "iterations_per_loop": iterations_per_loop,
        "num_shards": num_shards,
        "per_host_input_for_training": True,
        "initial_infeed_sleep_secs": tpu_infeed_sleep_secs,
    }
    if tpu_config_extra_kwargs is not None:
      tpu_config_kwargs.update(tpu_config_extra_kwargs)
    run_config_cls = tf.contrib.tpu.RunConfig
    tpu_config = tf.contrib.tpu.TPUConfig(**tpu_config_kwargs)
    run_config_args["tpu_config"] = tpu_config
    if not master and "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS" in os.environ:
      # If running on TPU but no master is set and the KUBE env var is present
      # then we're running on ML Engine. Set the master.
      run_config_args["master"] = os.environ[
          "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"]
      run_config_args["evaluation_master"] = run_config_args["master"]
    elif not master and cloud_tpu_name:
      # Update run_config to use cluster instead of master/evaluation_master
      # as we need the cluster spec to use Cloud Pods
      tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
          cloud_tpu_name)
      run_config_args["cluster"] = tpu_cluster_resolver
      del run_config_args["master"]
      del run_config_args["evaluation_master"]
  elif is_cloud_async_distributed():
    run_config_cls = tf.estimator.RunConfig
    del run_config_args["master"]
    del run_config_args["evaluation_master"]

  config = run_config_cls(**run_config_args)

  # If not using TPU, add device info for data_parallelism
  config.use_tpu = use_tpu
  if not use_tpu:
    config.t2t_device_info = {
        "num_async_replicas": num_async_replicas,
    }
    use_distribution_strategy = (
        optionally_use_dist_strat and
        t2t_model.T2TModel.has_symmetric_shards(model_name) and
        not no_data_parallelism and ps_replicas == 0 and ps_gpu == 0 and
        num_async_replicas == 1)

    if use_distribution_strategy:
      tf.logging.info(
          "Configuring MirroredStrategy DistributionStrategy to replicate the "
          "model."
      )
      distribution = tf.contrib.distribute.MirroredStrategy()
      config = config.replace(train_distribute=distribution)
      config.data_parallelism = None
    else:
      tf.logging.info("Configuring DataParallelism to replicate the model.")
      config.data_parallelism = devices.data_parallelism(
          daisy_chain_variables=daisy_chain_variables,
          ps_replicas=ps_replicas,
          ps_job=ps_job,
          ps_gpu=ps_gpu,
          schedule=schedule,
          sync=sync,
          worker_gpu=num_gpus,
          worker_replicas=num_async_replicas,
          worker_id=worker_id,
          gpu_order=gpu_order,
          worker_job=worker_job,
          no_data_parallelism=no_data_parallelism)

  return config
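# Hedged usage sketch, not part of the original example. The model name, path,
# and flag values below are hypothetical; "transformer" is only an illustration
# of a registered T2T model name.
run_config = create_run_config(
    model_name="transformer",
    model_dir="/tmp/t2t_train",        # hypothetical output directory
    num_gpus=2,
    save_checkpoints_steps=2000,
    optionally_use_dist_strat=True)    # MirroredStrategy only if the model
                                       # reports symmetric shards
# On this non-TPU path, run_config.use_tpu is False and run_config either
# carries a devices.data_parallelism(...) object or, under MirroredStrategy,
# has data_parallelism set to None.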
Example No. 19
def create_run_config(master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      save_checkpoints_secs=None,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      shard_to_cpu=False,
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      random_seed=None,
                      sync=False,
                      tpu_infeed_sleep_secs=None,
                      use_tpu=False,
                      inter_op_parallelism_threads=0,
                      log_step_count_steps=100,
                      intra_op_parallelism_threads=0,
                      tpu_config_extra_kwargs=None):
  """Create RunConfig, TPUConfig, and Parallelism object."""
  session_config = create_session_config(
      log_device_placement=log_device_placement,
      enable_graph_rewriter=enable_graph_rewriter,
      gpu_mem_fraction=gpu_mem_fraction,
      use_tpu=use_tpu,
      inter_op_parallelism_threads=inter_op_parallelism_threads,
      intra_op_parallelism_threads=intra_op_parallelism_threads)
  run_config_args = {
      "master": master,
      "evaluation_master": master,
      "model_dir": model_dir,
      "session_config": session_config,
      "save_summary_steps": 100,
      "save_checkpoints_steps": save_checkpoints_steps,
      "keep_checkpoint_max": keep_checkpoint_max,
      "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
      "tf_random_seed": random_seed,
      "log_step_count_steps": log_step_count_steps
  }
  if save_checkpoints_secs:
    del run_config_args["save_checkpoints_steps"]
    run_config_args["save_checkpoints_secs"] = save_checkpoints_secs
  run_config_cls = tf.contrib.learn.RunConfig

  # If using TPU, use TPU RunConfig, add TPUConfig, and add additional args
  if use_tpu:
    if tpu_config_extra_kwargs is None:
      tpu_config_extra_kwargs = {}
    run_config_cls = tf.contrib.tpu.RunConfig
    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=iterations_per_loop,
        num_shards=num_shards,
        per_host_input_for_training=True,
        initial_infeed_sleep_secs=tpu_infeed_sleep_secs,
        **tpu_config_extra_kwargs)
    run_config_args["tpu_config"] = tpu_config

  config = run_config_cls(**run_config_args)

  # If not using TPU, add device info for data_parallelism
  config.use_tpu = use_tpu
  if not use_tpu:
    config.t2t_device_info = {
        "num_async_replicas": num_async_replicas,
    }
    config.data_parallelism = devices.data_parallelism(
        daisy_chain_variables=daisy_chain_variables,
        ps_replicas=ps_replicas,
        ps_job=ps_job,
        ps_gpu=ps_gpu,
        schedule=schedule,
        sync=sync,
        worker_gpu=num_gpus,
        worker_replicas=num_async_replicas,
        worker_id=worker_id,
        gpu_order=gpu_order,
        locally_shard_to_cpu=shard_to_cpu,
        worker_job=worker_job,
        no_data_parallelism=no_data_parallelism)

  return config
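# Hedged sketch, not part of the original example; the values are hypothetical.
# Compared with Example No. 18, this variant exposes shard_to_cpu, which is
# forwarded to devices.data_parallelism as locally_shard_to_cpu.
debug_config = create_run_config(
    model_dir="/tmp/t2t_debug",     # hypothetical
    num_gpus=0,
    shard_to_cpu=True,
    save_checkpoints_secs=600)      # switches checkpointing from steps to seconds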
Example No. 20
def decode_from_dataset(estimator,
                        problem_names,
                        return_beams=False,
                        beam_size=1,
                        max_predictions=-1,
                        decode_to_file=None,
                        save_images=False,
                        identity_output=False):
    tf.logging.info("Performing local inference from dataset for %s.",
                    str(problem_names))
    hparams = estimator.hparams

    for problem_idx, problem_name in enumerate(problem_names):
        # Build the inference input function
        infer_problems_data = data_reader.get_data_filepatterns(
            problem_name, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER)

        infer_input_fn = input_fn_builder.build_input_fn(
            mode=tf.contrib.learn.ModeKeys.INFER,
            hparams=hparams,
            data_file_patterns=infer_problems_data,
            num_datashards=devices.data_parallelism().n,
            fixed_problem=problem_idx)

        # Get the predictions as an iterable
        predictions = estimator.predict(input_fn=infer_input_fn,
                                        as_iterable=True)

        # Prepare output file writers if decode_to_file passed
        if decode_to_file:
            output_filepath = decode_to_file + ".outputs." + problem_name
            target_filepath = decode_to_file + ".targets." + problem_name

            output_file = tf.gfile.Open(output_filepath, "w")
            target_file = tf.gfile.Open(target_filepath, "w")

        problem_hparams = hparams.problems[problem_idx]
        inputs_vocab = problem_hparams.vocabulary.get("inputs", None)
        targets_vocab = problem_hparams.vocabulary["targets"]
        for num_predictions, prediction in enumerate(predictions):
            inputs = prediction["inputs"]
            targets = prediction["targets"]
            outputs = prediction["outputs"]

            # Log predictions
            decoded_outputs = []
            if return_beams:
                output_beams = np.split(outputs, beam_size, axis=0)
                for i, beam in enumerate(output_beams):
                    tf.logging.info("BEAM %d:" % i)
                    decoded = _decode_from_dataset_log_results(
                        inputs, targets, beam, problem_name, num_predictions,
                        inputs_vocab, targets_vocab, save_images,
                        estimator.model_dir, identity_output)
                    decoded_outputs.append(decoded)
            else:
                decoded = _decode_from_dataset_log_results(
                    inputs, targets, outputs, problem_name, num_predictions,
                    inputs_vocab, targets_vocab, save_images,
                    estimator.model_dir, identity_output)
                decoded_outputs.append(decoded)

            # Write out predictions if decode_to_file passed
            if decode_to_file:
                for decoded_output, decoded_target in decoded_outputs:
                    output_file.write(str(decoded_output) + "\n")
                    target_file.write(str(decoded_target) + "\n")

            if max_predictions >= 0 and num_predictions >= max_predictions:
                break

        if decode_to_file:
            output_file.close()
            target_file.close()

        tf.logging.info("Completed inference on %d samples." % num_predictions)  # pylint: disable=undefined-loop-variable
Example No. 21
def decode_from_dataset(estimator,
                        problem_names,
                        decode_hp,
                        decode_to_file=None):
    tf.logging.info("Performing local inference from dataset for %s.",
                    str(problem_names))
    hparams = estimator.params

    for problem_idx, problem_name in enumerate(problem_names):
        # Build the inference input function
        infer_problems_data = data_reader.get_data_filepatterns(
            problem_name, hparams.data_dir, tf.estimator.ModeKeys.PREDICT)

        infer_input_fn = input_fn_builder.build_input_fn(
            mode=tf.estimator.ModeKeys.PREDICT,
            hparams=hparams,
            data_file_patterns=infer_problems_data,
            num_datashards=devices.data_parallelism().n,
            fixed_problem=problem_idx)

        # Get the predictions as an iterable
        predictions = estimator.predict(infer_input_fn)

        # Prepare output file writers if decode_to_file passed
        if decode_to_file:
            output_filepath = _decode_filename(decode_to_file, problem_name,
                                               decode_hp)
            parts = output_filepath.split(".")
            parts[-1] = "targets"
            target_filepath = ".".join(parts)

            output_file = tf.gfile.Open(output_filepath, "w")
            target_file = tf.gfile.Open(target_filepath, "w")

        problem_hparams = hparams.problems[problem_idx]
        inputs_vocab = problem_hparams.vocabulary.get("inputs", None)
        targets_vocab = problem_hparams.vocabulary["targets"]
        for num_predictions, prediction in enumerate(predictions):
            inputs = prediction["inputs"]
            targets = prediction["targets"]
            outputs = prediction["outputs"]

            # Log predictions
            decoded_outputs = []
            if decode_hp.return_beams:
                output_beams = np.split(outputs, decode_hp.beam_size, axis=0)
                for i, beam in enumerate(output_beams):
                    tf.logging.info("BEAM %d:" % i)
                    decoded = log_decode_results(
                        inputs,
                        beam,
                        problem_name,
                        num_predictions,
                        inputs_vocab,
                        targets_vocab,
                        save_images=decode_hp.save_images,
                        model_dir=estimator.model_dir,
                        identity_output=decode_hp.identity_output,
                        targets=targets)
                    decoded_outputs.append(decoded)
            else:
                decoded = log_decode_results(
                    inputs,
                    outputs,
                    problem_name,
                    num_predictions,
                    inputs_vocab,
                    targets_vocab,
                    save_images=decode_hp.save_images,
                    model_dir=estimator.model_dir,
                    identity_output=decode_hp.identity_output,
                    targets=targets)
                decoded_outputs.append(decoded)

            # Write out predictions if decode_to_file passed
            if decode_to_file:
                for decoded_output, decoded_target in decoded_outputs:
                    output_file.write(str(decoded_output) + "\n")
                    target_file.write(str(decoded_target) + "\n")

            if (decode_hp.num_samples >= 0
                    and num_predictions >= decode_hp.num_samples):
                break

        if decode_to_file:
            output_file.close()
            target_file.close()

        tf.logging.info("Completed inference on %d samples." % num_predictions)  # pylint: disable=undefined-loop-variable
Example No. 22
def create_run_config(master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      save_checkpoints_secs=None,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      shard_to_cpu=False,
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      random_seed=None,
                      sync=False,
                      tpu_infeed_sleep_secs=None,
                      use_tpu=False,
                      inter_op_parallelism_threads=0,
                      log_step_count_steps=100,
                      intra_op_parallelism_threads=0,
                      tpu_config_extra_kwargs=None):
    """Create RunConfig, TPUConfig, and Parallelism object."""
    session_config = create_session_config(
        log_device_placement=log_device_placement,
        enable_graph_rewriter=enable_graph_rewriter,
        gpu_mem_fraction=gpu_mem_fraction,
        use_tpu=use_tpu,
        inter_op_parallelism_threads=inter_op_parallelism_threads,
        intra_op_parallelism_threads=intra_op_parallelism_threads)
    run_config_args = {
        "master": master,
        "evaluation_master": master,
        "model_dir": model_dir,
        "session_config": session_config,
        "save_summary_steps": 100,
        "save_checkpoints_steps": save_checkpoints_steps,
        "save_checkpoints_secs": save_checkpoints_secs,
        "keep_checkpoint_max": keep_checkpoint_max,
        "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
        "tf_random_seed": random_seed,
        "log_step_count_steps": log_step_count_steps
    }
    if save_checkpoints_secs:
        del run_config_args["save_checkpoints_steps"]
    run_config_cls = tf.contrib.learn.RunConfig

    # If using TPU, use TPU RunConfig, add TPUConfig, and add additional args
    if use_tpu:
        tpu_config_kwargs = {
            "iterations_per_loop": iterations_per_loop,
            "num_shards": num_shards,
            "per_host_input_for_training": True,
            "initial_infeed_sleep_secs": tpu_infeed_sleep_secs,
        }
        if tpu_config_extra_kwargs is not None:
            tpu_config_kwargs.update(tpu_config_extra_kwargs)
        run_config_cls = tf.contrib.tpu.RunConfig
        tpu_config = tf.contrib.tpu.TPUConfig(**tpu_config_kwargs)
        run_config_args["tpu_config"] = tpu_config

    config = run_config_cls(**run_config_args)

    # If not using TPU, add device info for data_parallelism
    config.use_tpu = use_tpu
    if not use_tpu:
        config.t2t_device_info = {
            "num_async_replicas": num_async_replicas,
        }
        config.data_parallelism = devices.data_parallelism(
            daisy_chain_variables=daisy_chain_variables,
            ps_replicas=ps_replicas,
            ps_job=ps_job,
            ps_gpu=ps_gpu,
            schedule=schedule,
            sync=sync,
            worker_gpu=num_gpus,
            worker_replicas=num_async_replicas,
            worker_id=worker_id,
            gpu_order=gpu_order,
            locally_shard_to_cpu=shard_to_cpu,
            worker_job=worker_job,
            no_data_parallelism=no_data_parallelism)

    return config
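# Hedged usage sketch for the TPU path, not part of the original example. The
# master address and GCS path are hypothetical.
tpu_run_config = create_run_config(
    master="grpc://10.0.0.2:8470",
    model_dir="gs://my-bucket/t2t_train",
    iterations_per_loop=100,
    num_shards=8,
    use_tpu=True)
# tpu_run_config is a tf.contrib.tpu.RunConfig carrying the TPUConfig built
# above; the data_parallelism attribute is only attached on the non-TPU path.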
Example No. 23
def decode_from_dataset(estimator,
                        problem_names,
                        decode_hp,
                        decode_to_file=None,
                        dataset_split=None):
  tf.logging.info("Performing local inference from dataset for %s.",
                  str(problem_names))
  hparams = estimator.params
  # We assume that worker_id corresponds to shard number.
  shard = decode_hp.shard_id if decode_hp.shards > 1 else None

  for problem_idx, problem_name in enumerate(problem_names):
    # Build the inference input function
    infer_input_fn = input_fn_builder.build_input_fn(
        mode=tf.estimator.ModeKeys.PREDICT,
        hparams=hparams,
        data_dir=hparams.data_dir,
        num_datashards=devices.data_parallelism(hparams).n,
        fixed_problem=problem_idx,
        batch_size=decode_hp.batch_size,
        dataset_split=dataset_split,
        shard=shard)

    # Get the predictions as an iterable
    predictions = estimator.predict(infer_input_fn)

    # Prepare output file writers if decode_to_file passed
    if decode_to_file:
      if decode_hp.shards > 1:
        decode_filename = decode_to_file + ("%.2d" % decode_hp.shard_id)
      else:
        decode_filename = decode_to_file
      output_filepath = _decode_filename(decode_filename, problem_name,
                                         decode_hp)
      parts = output_filepath.split(".")
      parts[-1] = "targets"
      target_filepath = ".".join(parts)

      output_file = tf.gfile.Open(output_filepath, "w")
      target_file = tf.gfile.Open(target_filepath, "w")

    problem_hparams = hparams.problems[problem_idx]
    # Inputs vocabulary is set to targets if there are no inputs in the problem,
    # e.g., for language models where the inputs are just a prefix of targets.
    has_input = "inputs" in problem_hparams.vocabulary
    inputs_vocab_key = "inputs" if has_input else "targets"
    inputs_vocab = problem_hparams.vocabulary[inputs_vocab_key]
    targets_vocab = problem_hparams.vocabulary["targets"]
    for num_predictions, prediction in enumerate(predictions):
      num_predictions += 1
      inputs = prediction["inputs"]
      targets = prediction["targets"]
      outputs = prediction["outputs"]

      # Log predictions
      decoded_outputs = []
      if decode_hp.return_beams:
        output_beams = np.split(outputs, decode_hp.beam_size, axis=0)
        for i, beam in enumerate(output_beams):
          tf.logging.info("BEAM %d:" % i)
          decoded = log_decode_results(
              inputs,
              beam,
              problem_name,
              num_predictions,
              inputs_vocab,
              targets_vocab,
              save_images=decode_hp.save_images,
              model_dir=estimator.model_dir,
              identity_output=decode_hp.identity_output,
              targets=targets)
          decoded_outputs.append(decoded)
      else:
        decoded = log_decode_results(
            inputs,
            outputs,
            problem_name,
            num_predictions,
            inputs_vocab,
            targets_vocab,
            save_images=decode_hp.save_images,
            model_dir=estimator.model_dir,
            identity_output=decode_hp.identity_output,
            targets=targets)
        decoded_outputs.append(decoded)

      # Write out predictions if decode_to_file passed
      if decode_to_file:
        for decoded_output, decoded_target in decoded_outputs:
          output_file.write(str(decoded_output) + decode_hp.delimiter)
          target_file.write(str(decoded_target) + decode_hp.delimiter)

      if (decode_hp.num_samples >= 0 and
          num_predictions >= decode_hp.num_samples):
        break

    if decode_to_file:
      output_file.close()
      target_file.close()

    tf.logging.info("Completed inference on %d samples." % num_predictions)  # pylint: disable=undefined-loop-variable