Example #1
def features_for_problem(problem_instance,
                         p_hparams,
                         hparams,
                         data_dir,
                         num_datashards,
                         mode,
                         batch_size=None,
                         dataset_split=None,
                         shard=None,
                         name="problem_inputs"):
  """Feature map for Problem."""
  with tf.name_scope(name):
    with tf.device("/cpu:0"):  # Input reading on CPU
      capacity = (p_hparams.max_expected_batch_size_per_shard * num_datashards)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=num_datashards,
          drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or
                               hparams.eval_drop_long_sequences),
          length_multiplier=(p_hparams.batch_size_multiplier))
      if batch_size:
        # If batch_size is fixed, use a single input bucket
        batching_scheme["batch_sizes"] = [batch_size]
        batching_scheme["boundaries"] = []
        # Log new batching scheme if updated
        tf.logging.info("Updated batching_scheme = %s", batching_scheme)
      feature_map = data_reader.input_pipeline(
          problem_instance,
          data_dir,
          capacity,
          mode,
          hparams,
          batching_scheme,
          dataset_split=dataset_split,
          shard=shard)

  # Ensure inputs and targets are proper rank.
  if problem_instance.has_inputs:  # always true for translation
    while len(feature_map["inputs"].get_shape()) != 4:
      feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1)
  while len(feature_map["targets"].get_shape()) != 4:
    feature_map["targets"] = tf.expand_dims(feature_map["targets"], axis=-1)

  if problem_instance.has_inputs:
    feature_map["input_space_id"] = tf.constant(p_hparams.input_space_id)
  feature_map["target_space_id"] = tf.constant(p_hparams.target_space_id)
  return feature_map
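
A minimal standalone sketch (not from the example's repository), assuming TF 1.x, of the rank-expansion step above: trailing size-1 dimensions are appended until a feature tensor is rank 4, i.e. [batch, length, 1, 1]. The toy tensor below is hypothetical.

import tensorflow as tf

# Hypothetical 2-D [batch, length] tensor standing in for feature_map["targets"].
targets = tf.zeros([8, 20], dtype=tf.int32)
# Append trailing size-1 dimensions until the tensor is rank 4.
while len(targets.get_shape()) != 4:
  targets = tf.expand_dims(targets, axis=-1)

with tf.Session() as sess:
  print(sess.run(tf.shape(targets)))  # [ 8 20  1  1]
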
Example #2
def features_for_problem(problem_instance,
                         p_hparams,
                         hparams,
                         data_dir,
                         num_datashards,
                         mode,
                         batch_size=None,
                         dataset_split=None,
                         shard=None,
                         name="problem_inputs"):
  """Feature map for Problem."""
  with tf.name_scope(name):
    with tf.device("/cpu:0"):  # Input reading on CPU
      capacity = (p_hparams.max_expected_batch_size_per_shard * num_datashards)
      batching_scheme = data_reader.hparams_to_batching_scheme(
          hparams,
          shard_multiplier=num_datashards,
          drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN or
                               hparams.eval_drop_long_sequences),
          length_multiplier=(p_hparams.batch_size_multiplier))
      if batch_size:
        # If batch_size is fixed, use a single input bucket
        batching_scheme["batch_sizes"] = [batch_size]
        batching_scheme["boundaries"] = []
        # Log new batching scheme if updated
        tf.logging.info("Updated batching_scheme = %s", batching_scheme)
      feature_map = data_reader.input_pipeline(
          problem_instance,
          data_dir,
          capacity,
          mode,
          hparams,
          batching_scheme,
          dataset_split=dataset_split,
          shard=shard)

  # Ensure inputs and targets are proper rank.
  if problem_instance.has_inputs:
    while len(feature_map["inputs"].get_shape()) != 4:
      feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1)
  while len(feature_map["targets"].get_shape()) != 4:
    feature_map["targets"] = tf.expand_dims(feature_map["targets"], axis=-1)

  if problem_instance.has_inputs:
    feature_map["input_space_id"] = tf.constant(p_hparams.input_space_id)
  feature_map["target_space_id"] = tf.constant(p_hparams.target_space_id)
  return feature_map
Example #3
def features_for_problem(problem_instance,
                         p_hparams,
                         hparams,
                         data_filepatterns,
                         num_datashards,
                         mode,
                         name="problem_inputs"):
    """Feature map for Problem."""
    with tf.name_scope(name):
        with tf.device("/cpu:0"):  # Input reading on CPU
            capacity = (p_hparams.max_expected_batch_size_per_shard *
                        num_datashards)
            feature_map = data_reader.input_pipeline(
                problem_instance, data_filepatterns, capacity, mode, hparams,
                data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=num_datashards,
                    drop_long_sequences=(mode == tf.estimator.ModeKeys.TRAIN
                                         or hparams.eval_drop_long_sequences),
                    length_multiplier=(p_hparams.batch_size_multiplier)))

    # Reverse inputs and targets features if the problem was reversed.
    if problem_instance is not None:
        problem_instance.maybe_reverse_features(feature_map)
        problem_instance.maybe_copy_features(feature_map)
    else:
        if p_hparams.was_reversed:
            inputs = feature_map["inputs"]
            targets = feature_map["targets"]
            feature_map["inputs"] = targets
            feature_map["targets"] = inputs
        # Use the inputs as the targets if the problem is a copy problem.
        if p_hparams.was_copy:
            feature_map["targets"] = feature_map["inputs"]

    # Ensure inputs and targets are proper rank.
    while len(feature_map["inputs"].get_shape()) != 4:
        feature_map["inputs"] = tf.expand_dims(feature_map["inputs"], axis=-1)
    while len(feature_map["targets"].get_shape()) != 4:
        feature_map["targets"] = tf.expand_dims(feature_map["targets"],
                                                axis=-1)

    feature_map["input_space_id"] = tf.constant(p_hparams.input_space_id)
    feature_map["target_space_id"] = tf.constant(p_hparams.target_space_id)
    return feature_map
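
A minimal standalone sketch, assuming TF 1.x, of the reverse/copy fallback above applied to a plain feature dict; the boolean flags are hypothetical stand-ins for p_hparams.was_reversed and p_hparams.was_copy.

import tensorflow as tf

feature_map = {"inputs": tf.constant([[1, 2, 3]]),
               "targets": tf.constant([[4, 5, 6]])}
was_reversed, was_copy = True, False  # hypothetical p_hparams flags

if was_reversed:
    # Swap the two features for a reversed problem.
    feature_map["inputs"], feature_map["targets"] = (
        feature_map["targets"], feature_map["inputs"])
if was_copy:
    # A copy problem predicts its own inputs.
    feature_map["targets"] = feature_map["inputs"]

with tf.Session() as sess:
    print(sess.run(feature_map["targets"]))  # [[1 2 3]]
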
Example #4
    def input_fn(self,  # noqa: C901
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
            mode: tf.estimator.ModeKeys
            hparams: HParams, model hparams
            data_dir: str, data directory; if None, will use hparams.data_dir
            params: dict, may include "batch_size"
            config: RunConfig; should have the data_parallelism attribute if not using
                TPU
            dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
                method when called

        Returns:
            (features_dict<str name, Tensor feature>, Tensor targets)
        """
        partition_id, num_partitions = self._dataset_partition(mode, config)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        if config and config.use_tpu:
            num_threads = 64
        else:
            num_threads = 4 if is_training else 1

        max_length = self.max_length(hparams)

        def tpu_valid_size(example):
            return data_reader.example_valid_size(
                example,
                hparams.min_length,
                max_length
            )

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example,
                hparams.min_length,
                max_length
                if drop_long_sequences else 10**9
            )

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or hparams.data_dir

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
                "mode": mode,
                "data_dir": data_dir,
                "num_threads": num_threads,
                "hparams": hparams,
                "partition_id": partition_id,
                "num_partitions": num_partitions,
        })

        dataset = self.dataset(**dataset_kwargs)
        if is_training:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()
            data_files = tf.contrib.slim.parallel_reader.get_data_files(
                    self.filepattern(data_dir, mode))
            #    In continuous_train_and_eval when switching between train and
            #    eval, this input_fn method gets called multiple times and it
            #    would give you the exact same samples from the last call
            #    (because the Graph seed is set). So this skip gives you some
            #    shuffling.
            dataset = skip_random_fraction(dataset, data_files[0])

        dataset = dataset.map(
                data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)

        if self.batch_size_means_tokens:
            batch_size_means_tokens = True
        else:
            if _are_shapes_fully_defined(dataset.output_shapes):
                batch_size_means_tokens = False
            else:
                tf.logging.warning(
                        "Shapes are not fully defined. Assuming batch_size means tokens. "
                        "Override batch_size_means_tokens() "
                        "in your problem subclass if this is undesired behavior.")
                batch_size_means_tokens = True

        # Batching
        if not batch_size_means_tokens:
            # Batch size means examples per datashard.
            if config and config.use_tpu:
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.apply(
                        tf.contrib.data.batch_and_drop_remainder(batch_size))
            else:
                num_shards = (config and config.data_parallelism.n) or 1
                batch_size = hparams.batch_size * num_shards
                dataset = dataset.batch(batch_size)
        else:
            # batch_size means tokens per datashard
            if config and config.use_tpu:
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.apply(
                        tf.contrib.data.padded_batch_and_drop_remainder(
                                batch_size, padded_shapes))
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                        hparams,
                        shard_multiplier=(config and config.data_parallelism.n) or 1,
                        length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # Here batch_size really means examples per datashard.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = data_reader.bucket_by_sequence_length(
                        dataset, data_reader.example_length, batching_scheme["boundaries"],
                        batching_scheme["batch_sizes"])

                if not is_training:

                    def _pad_batch(features):
                        if not config or config.data_parallelism.n <= 1:
                            return features
                        tf.logging.warn(
                                "Padding the batch to ensure that remainder eval batches have "
                                "a batch size divisible by the number of data shards. This may "
                                "lead to incorrect metrics for non-zero-padded features, e.g. "
                                "images. Use a single datashard (i.e. 1 GPU) in that case.")
                        return pad_batch(features, config.data_parallelism.n)

                    dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)
        features = dataset.make_one_shot_iterator().get_next()
        if not config or not config.use_tpu:
            _summarize_features(features, (config and config.data_parallelism.n) or 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            features["infer_targets"] = features["targets"]
            features["targets"] = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS, data_reader.DummyQueueRunner())

        return features, features["targets"]
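
A minimal standalone sketch, assuming TF 1.x, of the length filtering done by tpu_valid_size/gpu_valid_size above: examples whose length falls outside [min_length, max_length] are dropped from the dataset. The toy sequences and bounds are hypothetical.

import tensorflow as tf

min_length, max_length = 2, 4  # hypothetical hparams.min_length / max_length
sequences = [[1], [1, 2], [1, 2, 3, 4], [1, 2, 3, 4, 5, 6]]
dataset = tf.data.Dataset.from_generator(
    lambda: sequences, tf.int32, tf.TensorShape([None]))

def valid_size(seq):
    length = tf.shape(seq)[0]
    return tf.logical_and(length >= min_length, length <= max_length)

dataset = dataset.filter(valid_size)
item = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    try:
        while True:
            print(sess.run(item))  # [1 2], then [1 2 3 4]
    except tf.errors.OutOfRangeError:
        pass
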
Example #5
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 force_repeat=False,
                 prevent_repeat=False,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
          mode: tf.estimator.ModeKeys
          hparams: HParams, model hparams
          data_dir: str, data directory; if None, will use hparams.data_dir
          params: dict, may include "batch_size"
          config: RunConfig; should have the data_parallelism attribute if not using
            TPU
          force_repeat: bool, whether to repeat the data even if not training
          prevent_repeat: bool, whether to not repeat when in training mode.
            Overrides force_repeat.
          dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
            method when called

        Returns:
          (features_dict<str name, Tensor feature>, Tensor targets)
        """
        partition_id, num_partitions = self._dataset_partition(mode, config)

        is_training = mode == tf.estimator.ModeKeys.TRAIN
        if config and config.use_tpu:
            num_threads = 64
        else:
            num_threads = cpu_count() if is_training else 1

        if config and hasattr(config,
                              "data_parallelism") and config.data_parallelism:
            num_shards = config.data_parallelism.n
        else:
            num_shards = 1

        max_length = self.max_length(hparams)
        mlperf_log.transformer_print(key=mlperf_log.INPUT_MAX_LENGTH,
                                     value=max_length)

        def tpu_valid_size(example):
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  max_length)

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or (hasattr(hparams, "data_dir")
                                and hparams.data_dir)

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams,
            "partition_id": partition_id,
            "num_partitions": num_partitions,
        })

        dataset = self.dataset(**dataset_kwargs)
        if (force_repeat or is_training) and not prevent_repeat:
            # Repeat and skip a random number of records
            dataset = dataset.repeat()

        if is_training and self.skip_random_fraction_when_training:
            data_files = tf.contrib.slim.parallel_reader.get_data_files(
                self.filepattern(data_dir, mode))
            #  In continuous_train_and_eval when switching between train and
            #  eval, this input_fn method gets called multiple times and it
            #  would give you the exact same samples from the last call
            #  (because the Graph seed is set). So this skip gives you some
            #  shuffling.
            dataset = skip_random_fraction(dataset, data_files[0])

        dataset = dataset.map(data_reader.cast_ints_to_int32,
                              num_parallel_calls=num_threads)

        if self.batch_size_means_tokens:
            batch_size_means_tokens = True
        else:
            if _are_shapes_fully_defined(dataset.output_shapes):
                batch_size_means_tokens = False
            else:
                tf.logging.warning(
                    "Shapes are not fully defined. Assuming batch_size means tokens."
                )
                batch_size_means_tokens = True

        # Batching
        if not batch_size_means_tokens:
            # Batch size means examples per datashard.
            if config and config.use_tpu:
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                dataset = dataset.batch(batch_size, drop_remainder=True)
            else:
                batch_size = hparams.batch_size * num_shards
                dataset = dataset.batch(batch_size)
        else:
            # batch_size means tokens per datashard
            if config and config.use_tpu:
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = self._pad_for_tpu(dataset.output_shapes,
                                                  hparams)
                # on TPU, we use params["batch_size"], which specifies the number of
                # examples across all datashards
                batch_size = params["batch_size"]
                if hparams.pad_batch:
                    tf.logging.warn(
                        "Padding the batch to ensure that remainder eval batches are "
                        "processed. This may lead to incorrect metrics for "
                        "non-zero-padded features, e.g. images. Use a smaller batch "
                        "size that has no remainder in that case.")
                    dataset = dataset.padded_batch(batch_size,
                                                   padded_shapes,
                                                   drop_remainder=False)
                    dataset = dataset.map(functools.partial(
                        pad_batch, batch_multiple=batch_size),
                                          num_parallel_calls=num_threads)
                else:
                    dataset = dataset.padded_batch(batch_size,
                                                   padded_shapes,
                                                   drop_remainder=True)
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=num_shards,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    # Here batch_size really means examples per datashard.
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = dataset.apply(
                    tf.contrib.data.bucket_by_sequence_length(
                        data_reader.example_length,
                        batching_scheme["boundaries"],
                        batching_scheme["batch_sizes"]))

                if not is_training:
                    batch_multiple = num_shards
                    if hparams.use_fixed_batch_size:
                        # Make sure the last batch has the same fixed size as the rest.
                        batch_multiple *= hparams.batch_size
                    if batch_multiple > 1:
                        tf.logging.warn(
                            "Padding the batch to ensure that remainder eval batches have "
                            "a batch size divisible by the number of data shards. This may "
                            "lead to incorrect metrics for non-zero-padded features, e.g. "
                            "images. Use a single datashard (i.e. 1 GPU) in that case."
                        )
                        dataset = dataset.map(functools.partial(
                            pad_batch, batch_multiple=batch_multiple),
                                              num_parallel_calls=num_threads)

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

        # Add shuffling for training batches. This is necessary along with record
        # level shuffling in the dataset generation. Record shuffling will shuffle
        # the examples. However, in some cases, it's possible that the shuffle
        # buffer size for record shuffling is smaller than the batch size. In such
        # cases, adding batch shuffling ensures that the data is in random order
        # during training
        if (is_training and hasattr(hparams, "batch_shuffle_size")
                and hparams.batch_shuffle_size):
            dataset = dataset.shuffle(hparams.batch_shuffle_size)

        def prepare_for_output(example):
            if not config or not config.use_tpu:
                _summarize_features(example, num_shards)
            if mode == tf.estimator.ModeKeys.PREDICT:
                example["infer_targets"] = example.pop("targets")
                return example
            else:
                return example, example["targets"]

        dataset = dataset.map(prepare_for_output,
                              num_parallel_calls=num_threads)
        dataset = dataset.prefetch(2)

        if mode == tf.estimator.ModeKeys.PREDICT:
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return dataset
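
A minimal standalone sketch, assuming TF 1.x with tf.contrib available, of the GPU bucketing step above: tf.contrib.data.bucket_by_sequence_length groups variable-length sequences into length buckets, each with its own batch size. The toy sequences, boundaries, and batch sizes are hypothetical.

import tensorflow as tf

sequences = [[1, 2], [3, 4, 5, 6], [7], [8, 9, 10, 11, 12]]
dataset = tf.data.Dataset.from_generator(
    lambda: sequences, tf.int32, tf.TensorShape([None]))

dataset = dataset.apply(
    tf.contrib.data.bucket_by_sequence_length(
        element_length_func=lambda seq: tf.shape(seq)[0],
        bucket_boundaries=[3, 5],       # buckets: <3, 3-4, >=5 tokens
        bucket_batch_sizes=[2, 2, 1]))  # one batch size per bucket

batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    try:
        while True:
            print(sess.run(batch))  # padded batches, grouped by length bucket
    except tf.errors.OutOfRangeError:
        pass
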
Example #6
    def input_fn():
        """Supplies input to our model.

        This function supplies input to our model, where this input is a
        function of the mode. For example, we supply different data if
        we're performing training versus evaluation.

        Returns:
          A tuple consisting of 1) a dictionary of tensors whose keys are
          the feature names, and 2) a tensor of target labels if the mode
          is not INFER (and None, otherwise).

        Raises:
          ValueError: if one of the parameters has an unsupported value.
        """
        problem_count, batches = len(hparams.problems), []
        with tf.name_scope("input_reader"):
            for n in xrange(problem_count):
                if fixed_problem is not None and n != fixed_problem:
                    continue
                problem_instance = hparams.problem_instances[n]
                p_hparams = hparams.problems[n]
                with tf.name_scope("problem_%d" % n):
                    with tf.device("/cpu:0"):  # Input reading on CPU
                        capacity = (
                            p_hparams.max_expected_batch_size_per_shard *
                            num_datashards)
                        feature_map = data_reader.input_pipeline(
                            problem_instance, data_file_patterns
                            and data_file_patterns[n], capacity, mode, hparams,
                            data_reader.hparams_to_batching_scheme(
                                hparams,
                                shard_multiplier=num_datashards,
                                drop_long_sequences=(
                                    mode == tf.estimator.ModeKeys.TRAIN
                                    or hparams.eval_drop_long_sequences),
                                length_multiplier=(
                                    p_hparams.batch_size_multiplier)))

                # Reverse inputs and targets features if the problem was reversed.
                if problem_instance is not None:
                    problem_instance.maybe_reverse_features(feature_map)
                    problem_instance.maybe_copy_features(feature_map)
                else:
                    if p_hparams.was_reversed:
                        inputs = feature_map["inputs"]
                        targets = feature_map["targets"]
                        feature_map["inputs"] = targets
                        feature_map["targets"] = inputs
                    # Use the inputs as the targets if the problem is a copy problem.
                    if p_hparams.was_copy:
                        feature_map["targets"] = feature_map["inputs"]

                # Ensure inputs and targets are proper rank.
                while len(feature_map["inputs"].get_shape()) != 4:
                    feature_map["inputs"] = tf.expand_dims(
                        feature_map["inputs"], axis=-1)
                while len(feature_map["targets"].get_shape()) != 4:
                    feature_map["targets"] = tf.expand_dims(
                        feature_map["targets"], axis=-1)

                batches.append(
                    (feature_map["inputs"], feature_map["targets"],
                     tf.constant(n), tf.constant(p_hparams.input_space_id),
                     tf.constant(p_hparams.target_space_id)))

        # We choose which problem to process.
        loss_moving_avgs = []  # Need loss moving averages for that.
        for n in xrange(problem_count):
            with tf.variable_scope("losses_avg"):
                loss_moving_avgs.append(
                    tf.get_variable("problem_%d/total_loss" % n,
                                    initializer=100.0,
                                    trainable=False))
        if fixed_problem is None:
            if (hparams.problem_choice == "uniform"
                    or mode != tf.estimator.ModeKeys.TRAIN):
                problem_choice = tf.random_uniform([],
                                                   maxval=problem_count,
                                                   dtype=tf.int32)
            elif hparams.problem_choice == "adaptive":
                loss_moving_avgs = tf.stack(loss_moving_avgs)
                problem_choice = tf.multinomial(
                    tf.reshape(loss_moving_avgs, [1, -1]), 1)
                problem_choice = tf.to_int32(tf.squeeze(problem_choice))
            elif hparams.problem_choice == "distributed":
                assert worker_replicas >= problem_count
                assert worker_replicas % problem_count == 0
                problem_choice = tf.to_int32(worker_id % problem_count)
            else:
                raise ValueError(
                    "Value of hparams.problem_choice is %s and must be "
                    "one of [uniform, adaptive, distributed]" %
                    hparams.problem_choice)

            # Inputs and targets conditional on problem_choice.
            rand_inputs, rand_target, choice, inp_id, tgt_id = cond_on_index(
                lambda n: batches[n], problem_choice, 0, problem_count - 1)
        else:
            problem_choice = tf.constant(fixed_problem)
            # Take the only constructed batch, which is the fixed_problem.
            rand_inputs, rand_target, choice, inp_id, tgt_id = batches[0]

        # Set shapes so the ranks are clear.
        rand_inputs.set_shape([None, None, None, None])
        rand_target.set_shape([None, None, None, None])
        choice.set_shape([])
        inp_id.set_shape([])
        tgt_id.set_shape([])
        #  Forced shape obfuscation is necessary for inference.
        if mode == tf.estimator.ModeKeys.PREDICT:
            rand_inputs._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access
            rand_target._shape = tf.TensorShape([None, None, None, None])  # pylint: disable=protected-access

        # Final feature map.
        rand_feature_map = {
            "inputs": rand_inputs,
            "problem_choice": choice,
            "input_space_id": inp_id,
            "target_space_id": tgt_id
        }
        if mode == tf.estimator.ModeKeys.PREDICT:
            rand_feature_map["infer_targets"] = rand_target
            rand_target = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner.  DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 DummyQueueRunner())

        return rand_feature_map, rand_target
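
A minimal standalone sketch, assuming TF 1.x, of the "adaptive" problem_choice above: a problem index is sampled with tf.multinomial from the per-problem loss moving averages, so problems with higher average loss are picked more often. The averages below are made up.

import tensorflow as tf

loss_moving_avgs = tf.constant([2.0, 0.5, 4.0])  # hypothetical per-problem losses
problem_choice = tf.multinomial(tf.reshape(loss_moving_avgs, [1, -1]), 1)
problem_choice = tf.to_int32(tf.squeeze(problem_choice))

with tf.Session() as sess:
    # Index 2 (the highest loss) should dominate the samples.
    print([sess.run(problem_choice) for _ in range(10)])
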
Example #7
    def input_fn(self,
                 mode,
                 hparams,
                 data_dir=None,
                 params=None,
                 config=None,
                 dataset_kwargs=None):
        """Builds input pipeline for problem.

        Args:
          mode: tf.estimator.ModeKeys
          hparams: HParams, model hparams
          data_dir: str, data directory; if None, will use hparams.data_dir
          params: dict, may include "batch_size"
          config: RunConfig; should have the data_parallelism attribute if not using
            TPU
          dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
            method when called

        Returns:
          (features_dict<str name, Tensor feature>, Tensor targets)
        """
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        num_threads = 4 if is_training else 1

        def tpu_valid_size(example):
            return data_reader.example_valid_size(example, hparams.min_length,
                                                  hparams.max_length)

        def gpu_valid_size(example):
            drop_long_sequences = is_training or hparams.eval_drop_long_sequences
            return data_reader.example_valid_size(
                example, hparams.min_length,
                hparams.max_length if drop_long_sequences else 10**9)

        def define_shapes(example):
            batch_size = config and config.use_tpu and params["batch_size"]
            return standardize_shapes(example, batch_size=batch_size)

        # Read and preprocess
        data_dir = data_dir or hparams.data_dir

        dataset_kwargs = dataset_kwargs or {}
        dataset_kwargs.update({
            "mode": mode,
            "data_dir": data_dir,
            "num_threads": num_threads,
            "hparams": hparams
        })

        dataset = self.dataset(**dataset_kwargs)
        dataset = dataset.map(data_reader.cast_int64_to_int32,
                              num_parallel_calls=num_threads)
        if is_training:
            dataset = dataset.repeat(None)

        # Batching
        if _are_shapes_fully_defined(dataset.output_shapes):
            # Static shape features (e.g. images)
            if config and config.use_tpu:
                tpu_batch_size = params["batch_size"]
                dataset = dataset.apply(
                    tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
            else:
                num_shards = (config and config.data_parallelism.n) or 1
                dataset = dataset.batch(hparams.batch_size * num_shards)
        else:
            # Variable length features
            if config and config.use_tpu:
                # On TPU, pad to hparams.max_length
                dataset = dataset.filter(tpu_valid_size)
                padded_shapes = _fill_shape_nones(
                    dataset.output_shapes, none_filler=hparams.max_length)
                dataset = dataset.apply(
                    tf.contrib.data.padded_batch_and_drop_remainder(
                        params["batch_size"], padded_shapes))
            else:
                # On GPU, bucket by length
                dataset = dataset.filter(gpu_valid_size)
                batching_scheme = data_reader.hparams_to_batching_scheme(
                    hparams,
                    shard_multiplier=(config and config.data_parallelism.n)
                    or 1,
                    length_multiplier=self.get_hparams().batch_size_multiplier)
                if hparams.use_fixed_batch_size:
                    batching_scheme["batch_sizes"] = [hparams.batch_size]
                    batching_scheme["boundaries"] = []
                dataset = data_reader.bucket_by_sequence_length(
                    dataset, data_reader.example_length,
                    batching_scheme["boundaries"],
                    batching_scheme["batch_sizes"])

        dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
        dataset = dataset.prefetch(1)
        features = dataset.make_one_shot_iterator().get_next()
        if not config or not config.use_tpu:
            _summarize_features(features,
                                (config and config.data_parallelism.n) or 1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            features["infer_targets"] = features["targets"]
            features["targets"] = None
            # This is because of a bug in the Estimator that short-circuits prediction
            # if it doesn't see a QueueRunner. DummyQueueRunner implements the
            # minimal expected interface but does nothing.
            tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                                 data_reader.DummyQueueRunner())

        return features, features["targets"]
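
A minimal standalone sketch, assuming TF 1.x (>= 1.10 for drop_remainder), of the TPU branch above: every example is padded to a fixed length and the final partial batch is dropped so all batches have static shapes. It uses dataset.padded_batch(..., drop_remainder=True), which should be equivalent in effect to the padded_batch_and_drop_remainder call above; the toy data and max_length are hypothetical.

import tensorflow as tf

max_length = 6  # stand-in for hparams.max_length
sequences = [[1, 2], [3, 4, 5], [6], [7, 8, 9, 10], [11]]
dataset = tf.data.Dataset.from_generator(
    lambda: sequences, tf.int32, tf.TensorShape([None]))

# Pad every sequence to max_length and drop the trailing partial batch.
dataset = dataset.padded_batch(
    batch_size=2, padded_shapes=[max_length], drop_remainder=True)

batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print(sess.run(batch))  # shape (2, 6), zero-padded
    print(sess.run(batch))  # second full batch; the fifth example is dropped
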
Example #8
  def input_fn(self,
               mode,
               hparams,
               data_dir=None,
               params=None,
               config=None,
               dataset_kwargs=None):
    """Builds input pipeline for problem.

    Args:
      mode: tf.estimator.ModeKeys
      hparams: HParams, model hparams
      data_dir: str, data directory; if None, will use hparams.data_dir
      params: dict, may include "batch_size"
      config: RunConfig; should have the data_parallelism attribute if not using
        TPU
      dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
        method when called

    Returns:
      (features_dict<str name, Tensor feature>, Tensor targets)
    """
    partition_id, num_partitions = self._dataset_partition(mode, config)

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    if config and config.use_tpu:
      num_threads = 64
    else:
      num_threads = 4 if is_training else 1

    max_length = self.max_length(hparams)

    def tpu_valid_size(example):
      return data_reader.example_valid_size(example, hparams.min_length,
                                            max_length)

    def gpu_valid_size(example):
      drop_long_sequences = is_training or hparams.eval_drop_long_sequences
      return data_reader.example_valid_size(example, hparams.min_length,
                                            max_length
                                            if drop_long_sequences else 10**9)

    def define_shapes(example):
      batch_size = config and config.use_tpu and params["batch_size"]
      return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    data_dir = data_dir or (hasattr(hparams, "data_dir") and hparams.data_dir)

    dataset_kwargs = dataset_kwargs or {}
    dataset_kwargs.update({
        "mode": mode,
        "data_dir": data_dir,
        "num_threads": num_threads,
        "hparams": hparams,
        "partition_id": partition_id,
        "num_partitions": num_partitions,
    })

    dataset = self.dataset(**dataset_kwargs)
    if is_training:
      # Repeat and skip a random number of records
      dataset = dataset.repeat()
      data_files = tf.contrib.slim.parallel_reader.get_data_files(
          self.filepattern(data_dir, mode))
      #  In continuous_train_and_eval when switching between train and
      #  eval, this input_fn method gets called multiple times and it
      #  would give you the exact same samples from the last call
      #  (because the Graph seed is set). So this skip gives you some
      #  shuffling.
      dataset = skip_random_fraction(dataset, data_files[0])

    dataset = dataset.map(
        data_reader.cast_ints_to_int32, num_parallel_calls=num_threads)

    if self.batch_size_means_tokens:
      batch_size_means_tokens = True
    else:
      if _are_shapes_fully_defined(dataset.output_shapes):
        batch_size_means_tokens = False
      else:
        tf.logging.warning(
            "Shapes are not fully defined. Assuming batch_size means tokens.")
        batch_size_means_tokens = True

    # Batching
    if not batch_size_means_tokens:
      # Batch size means examples per datashard.
      if config and config.use_tpu:
        # on TPU, we use params["batch_size"], which specifies the number of
        # examples across all datashards
        batch_size = params["batch_size"]
        dataset = dataset.batch(batch_size, drop_remainder=True)
      else:
        num_shards = config.data_parallelism.n if config else 1
        batch_size = hparams.batch_size * num_shards
        dataset = dataset.batch(batch_size)
    else:
      # batch_size means tokens per datashard
      if config and config.use_tpu:
        dataset = dataset.filter(tpu_valid_size)
        padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams)
        # on TPU, we use params["batch_size"], which specifies the number of
        # examples across all datashards
        batch_size = params["batch_size"]
        dataset = dataset.apply(
            tf.contrib.data.padded_batch_and_drop_remainder(
                batch_size, padded_shapes))
      else:
        # On GPU, bucket by length
        dataset = dataset.filter(gpu_valid_size)
        shard_multiplier = config.data_parallelism.n if config else 1
        batching_scheme = data_reader.hparams_to_batching_scheme(
            hparams,
            shard_multiplier=shard_multiplier,
            length_multiplier=self.get_hparams().batch_size_multiplier)
        if hparams.use_fixed_batch_size:
          # Here batch_size really means examples per datashard.
          batching_scheme["batch_sizes"] = [hparams.batch_size]
          batching_scheme["boundaries"] = []
        dataset = data_reader.bucket_by_sequence_length(
            dataset, data_reader.example_length, batching_scheme["boundaries"],
            batching_scheme["batch_sizes"])

        if not is_training:
          batch_multiple = shard_multiplier
          if hparams.use_fixed_batch_size:
            # Make sure the last batch has the same fixed size as the rest.
            batch_multiple *= hparams.batch_size
          if batch_multiple > 1:
            tf.logging.warn(
                "Padding the batch to ensure that remainder eval batches have "
                "a batch size divisible by the number of data shards. This may "
                "lead to incorrect metrics for non-zero-padded features, e.g. "
                "images. Use a single datashard (i.e. 1 GPU) in that case.")
            dataset = dataset.map(
                functools.partial(pad_batch, batch_multiple=batch_multiple),
                num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)

    def prepare_for_output(example):
      if not config or not config.use_tpu:
        _summarize_features(example,
                            (config and config.data_parallelism.n) or 1)
      if mode == tf.estimator.ModeKeys.PREDICT:
        example["infer_targets"] = example.pop("targets")
        return example
      else:
        return example, example["targets"]

    dataset = dataset.map(prepare_for_output, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(2)

    if mode == tf.estimator.ModeKeys.PREDICT:
      # This is because of a bug in the Estimator that short-circuits prediction
      # if it doesn't see a QueueRunner. DummyQueueRunner implements the
      # minimal expected interface but does nothing.
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           data_reader.DummyQueueRunner())

    return dataset
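
A minimal standalone sketch, assuming TF 1.x, of the prepare_for_output mapping above: outside PREDICT mode each feature dict becomes the (features, targets) pair that tf.estimator expects, while in PREDICT mode "targets" is renamed to "infer_targets". The toy dataset is hypothetical.

import tensorflow as tf

dataset = tf.data.Dataset.from_tensors(
    {"inputs": tf.constant([[1, 2, 3]]), "targets": tf.constant([[4, 5, 6]])})

def prepare_for_output(example, is_predict=False):
  if is_predict:
    example["infer_targets"] = example.pop("targets")
    return example
  return example, example["targets"]

dataset = dataset.map(prepare_for_output)
features, targets = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  print(sess.run(targets))  # [[4 5 6]]
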
Example #9
  def input_fn(self, mode, hparams, data_dir=None, params=None, config=None,
               dataset_kwargs=None):
    """Builds input pipeline for problem.

    Args:
      mode: tf.estimator.ModeKeys
      hparams: HParams, model hparams
      data_dir: str, data directory; if None, will use hparams.data_dir
      params: dict, may include "batch_size"
      config: RunConfig; should have the data_parallelism attribute if not using
        TPU
      dataset_kwargs: dict, if passed, will pass as kwargs to self.dataset
        method when called

    Returns:
      (features_dict<str name, Tensor feature>, Tensor targets)
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    num_threads = 4 if is_training else 1

    def tpu_valid_size(example):
      return data_reader.example_valid_size(example, hparams.min_length,
                                            hparams.max_length)

    def gpu_valid_size(example):
      drop_long_sequences = is_training or hparams.eval_drop_long_sequences
      return data_reader.example_valid_size(
          example,
          hparams.min_length,
          hparams.max_length if drop_long_sequences else 10**9)

    def define_shapes(example):
      batch_size = config and config.use_tpu and params["batch_size"]
      return standardize_shapes(example, batch_size=batch_size)

    # Read and preprocess
    data_dir = data_dir or hparams.data_dir

    dataset_kwargs = dataset_kwargs or {}
    dataset_kwargs.update({
        "mode": mode,
        "data_dir": data_dir,
        "num_threads": num_threads,
        "hparams": hparams})

    dataset = self.dataset(**dataset_kwargs)
    dataset = dataset.map(
        data_reader.cast_int64_to_int32, num_parallel_calls=num_threads)
    if is_training:
      dataset = dataset.repeat(None)

    # Batching
    if _are_shapes_fully_defined(dataset.output_shapes):
      # Static shape features (e.g. images)
      if config and config.use_tpu:
        tpu_batch_size = params["batch_size"]
        dataset = dataset.apply(
            tf.contrib.data.batch_and_drop_remainder(tpu_batch_size))
      else:
        num_shards = (config and config.data_parallelism.n) or 1
        dataset = dataset.batch(hparams.batch_size * num_shards)
    else:
      # Variable length features
      if config and config.use_tpu:
        # On TPU, pad to hparams.max_length
        dataset = dataset.filter(tpu_valid_size)
        padded_shapes = _fill_shape_nones(
            dataset.output_shapes, none_filler=hparams.max_length)
        dataset = dataset.apply(
            tf.contrib.data.padded_batch_and_drop_remainder(
                params["batch_size"], padded_shapes))
      else:
        # On GPU, bucket by length
        dataset = dataset.filter(gpu_valid_size)
        batching_scheme = data_reader.hparams_to_batching_scheme(
            hparams,
            shard_multiplier=(config and config.data_parallelism.n) or 1,
            length_multiplier=self.get_hparams().batch_size_multiplier)
        if hparams.use_fixed_batch_size:
          batching_scheme["batch_sizes"] = [hparams.batch_size]
          batching_scheme["boundaries"] = []
        dataset = data_reader.bucket_by_sequence_length(
            dataset,
            data_reader.example_length,
            batching_scheme["boundaries"],
            batching_scheme["batch_sizes"])

        if not is_training:
          def _pad_batch(features):
            if not config or config.data_parallelism.n <= 1:
              return features
            tf.logging.warn(
                "Padding the batch to ensure that remainder eval batches have "
                "a batch size divisible by the number of data shards. This may "
                "lead to incorrect metrics for non-zero-padded features, e.g. "
                "images. Use a single datashard (i.e. 1 GPU) in that case.")
            return pad_batch(features, config.data_parallelism.n)

          dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads)

    dataset = dataset.map(define_shapes, num_parallel_calls=num_threads)
    dataset = dataset.prefetch(1)
    features = dataset.make_one_shot_iterator().get_next()
    if not config or not config.use_tpu:
      _summarize_features(features, (config and config.data_parallelism.n) or 1)

    if mode == tf.estimator.ModeKeys.PREDICT:
      features["infer_targets"] = features["targets"]
      features["targets"] = None
      # This is because of a bug in the Estimator that short-circuits prediction
      # if it doesn't see a QueueRunner. DummyQueueRunner implements the
      # minimal expected interface but does nothing.
      tf.add_to_collection(tf.GraphKeys.QUEUE_RUNNERS,
                           data_reader.DummyQueueRunner())

    return features, features["targets"]
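
A minimal standalone sketch, assuming TF 1.x with tf.contrib available, of batch_and_drop_remainder as used in the TPU branch above: the final partial batch is discarded so every batch has a static first dimension. The toy dataset and batch size are hypothetical.

import tensorflow as tf

dataset = tf.data.Dataset.range(10)
dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(4))

batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  print(sess.run(batch))  # [0 1 2 3]
  print(sess.run(batch))  # [4 5 6 7]; the trailing [8 9] batch is dropped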