Example #1
  def _input_fn_impl(self,
                     mode,
                     batch_size,
                     prefetch_buffer_size,
                     num_parallel_process_calls,
                     metadata,
                     features_file,
                     labels_file=None,
                     num_buckets=None,
                     sample_buffer_size=None,
                     maximum_features_length=None,
                     maximum_labels_length=None):
    """See ``input_fn``."""
    self._initialize(metadata)

    feat_dataset, feat_process_fn, feat_padded_shapes_fn = self._get_features_builder(features_file)

    if labels_file is None:
      dataset = feat_dataset
      # Parallel inputs must be caught in a single tuple and not treated as multiple arguments.
      process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
      padded_shapes_fn = feat_padded_shapes_fn
    else:
      labels_dataset, labels_process_fn, labels_padded_shapes_fn = (
          self._get_labels_builder(labels_file))

      dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
      process_fn = lambda features, labels: (
          feat_process_fn(features), labels_process_fn(labels))
      padded_shapes_fn = lambda: (
          feat_padded_shapes_fn(), labels_padded_shapes_fn())

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.shuffle(sample_buffer_size, seed=int(time.time()))

    dataset = dataset.map(
        process_fn,
        num_parallel_calls=num_parallel_process_calls).prefetch(prefetch_buffer_size)
    padded_shapes = padded_shapes_fn()

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.filter(lambda features, labels: self._filter_example(
          features,
          labels,
          maximum_features_length=maximum_features_length,
          maximum_labels_length=maximum_labels_length))

    num_buckets = num_buckets or 1

    if mode == tf.estimator.ModeKeys.TRAIN and num_buckets > 1:
      # Bucketize the training dataset to improve efficiency.
      def _key_func(features, labels):
        length = None

        if length is None:
          length = self._get_features_length(features)
          maximum_length = maximum_features_length
          # For multiple inputs, apply bucketing on the target side or not at all.
          if isinstance(length, list):
            length = None
            maximum_length = None
        if length is None:
          length = self._get_labels_length(labels)
          maximum_length = maximum_labels_length
        if length is None:
          return tf.constant(0, dtype=tf.int64)

        if maximum_length is not None:
          bucket_width = (maximum_length + num_buckets - 1) // num_buckets
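          # For example (illustrative values): with maximum_length=70 and
          # num_buckets=5, bucket_width is (70 + 5 - 1) // 5 = 14, so lengths
          # 1-13 map to bucket 0, lengths 14-27 to bucket 1, and so on.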
        else:
          bucket_width = 10

        bucket_id = length // bucket_width
        bucket_id = tf.minimum(bucket_id, num_buckets)
        return tf.to_int64(bucket_id)

      def _reduce_func(unused_key, dataset):
        return dataset.padded_batch(
            batch_size,
            padded_shapes=padded_shapes)

      dataset = dataset.apply(tf.contrib.data.group_by_window(
          _key_func,
          _reduce_func,
          window_size=batch_size))
    else:
      dataset = dataset.padded_batch(
          batch_size,
          padded_shapes=padded_shapes)

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.repeat()

    iterator = dataset.make_initializable_iterator()

    # Add the initializer to a standard collection for it to be initialized.
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)

    return iterator.get_next()
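A minimal usage sketch for this variant: it assumes an inputter object exposing the method above and a prepared metadata dict (both placeholders, as are the file paths and numeric values), wrapped in a zero-argument callable for a TF 1.x Estimator.

def train_input_fn():
  # Illustrative values only; none of these names come from the snippet above.
  return inputter._input_fn_impl(
      tf.estimator.ModeKeys.TRAIN,
      batch_size=64,
      prefetch_buffer_size=1000,
      num_parallel_process_calls=4,
      metadata=metadata,
      features_file="train.src",
      labels_file="train.tgt",
      num_buckets=5,
      sample_buffer_size=500000,
      maximum_features_length=70,
      maximum_labels_length=70)

estimator.train(train_input_fn)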
Example #2
  def _input_fn_impl(self,
                     mode,
                     batch_size,
                     metadata,
                     features_file,
                     labels_file=None,
                     batch_type="examples",
                     batch_multiplier=1,
                     bucket_width=None,
                     single_pass=False,
                     num_threads=None,
                     sample_buffer_size=None,
                     maximum_features_length=None,
                     maximum_labels_length=None):
    """See ``input_fn``."""
    self._initialize(metadata)

    feat_dataset, feat_process_fn, feat_padded_shapes_fn = self._get_features_builder(features_file)

    if labels_file is None:
      dataset = feat_dataset
      # Parallel inputs must be caught in a single tuple and not treated as multiple arguments.
      process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
      padded_shapes_fn = feat_padded_shapes_fn
    else:
      labels_dataset, labels_process_fn, labels_padded_shapes_fn = (
          self._get_labels_builder(labels_file))

      dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
      process_fn = lambda features, labels: (
          feat_process_fn(features), labels_process_fn(labels))
      padded_shapes_fn = lambda: (
          feat_padded_shapes_fn(), labels_padded_shapes_fn())

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = dataset.shuffle(sample_buffer_size)
      dataset = dataset.map(
          process_fn,
          num_parallel_calls=num_threads or 4)
      dataset = dataset.apply(data.filter_examples_by_length(
          maximum_features_length=maximum_features_length,
          maximum_labels_length=maximum_labels_length,
          features_length_fn=self._get_features_length,
          labels_length_fn=self._get_labels_length))
      dataset = dataset.apply(data.batch_train_dataset(
          batch_size,
          batch_type=batch_type,
          batch_multiplier=batch_multiplier,
          bucket_width=bucket_width,
          padded_shapes=padded_shapes_fn(),
          features_length_fn=self._get_features_length,
          labels_length_fn=self._get_labels_length))
      dataset = dataset.apply(data.filter_irregular_batches(batch_multiplier))
      if not single_pass:
        dataset = dataset.repeat()
    else:
      dataset = dataset.map(
          process_fn,
          num_parallel_calls=num_threads or 1)
      dataset = dataset.padded_batch(
          batch_size,
          padded_shapes=padded_shapes_fn())

    dataset = dataset.prefetch(1)

    iterator = dataset.make_initializable_iterator()

    # Add the initializer to a standard collection for it to be initialized.
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)

    return iterator.get_next()
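A hedged usage sketch for this variant, which delegates filtering and batching to the data.* helpers and supports token-based batch sizes; inputter, metadata, and the file names are placeholders.

features, labels = inputter._input_fn_impl(
    tf.estimator.ModeKeys.TRAIN,
    batch_size=4096,
    metadata=metadata,
    features_file="train.src",
    labels_file="train.tgt",
    batch_type="tokens",
    bucket_width=5,
    sample_buffer_size=500000)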
Example #3
  def _input_fn_impl(self,
                     mode,
                     batch_size,
                     metadata,
                     features_file,
                     labels_file=None,
                     batch_type="examples",
                     batch_multiplier=1,
                     bucket_width=None,
                     single_pass=False,
                     num_threads=None,
                     sample_buffer_size=None,
                     prefetch_buffer_size=None,
                     maximum_features_length=None,
                     maximum_labels_length=None):
    """See ``input_fn``."""
    self._initialize(metadata)

    feat_dataset, feat_process_fn = self._get_features_builder(features_file)

    if labels_file is None:
      dataset = feat_dataset
      # Parallel inputs must be caught in a single tuple and not treated as multiple arguments.
      process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
    else:
      labels_dataset, labels_process_fn = self._get_labels_builder(labels_file)

      dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
      process_fn = lambda features, labels: (
          feat_process_fn(features), labels_process_fn(labels))
      dataset, process_fn = self._augment_parallel_dataset(dataset, process_fn, mode=mode)

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset = data.training_pipeline(
          dataset,
          batch_size,
          batch_type=batch_type,
          batch_multiplier=batch_multiplier,
          bucket_width=bucket_width,
          single_pass=single_pass,
          process_fn=process_fn,
          num_threads=num_threads,
          shuffle_buffer_size=sample_buffer_size,
          prefetch_buffer_size=prefetch_buffer_size,
          dataset_size=self._get_dataset_size(features_file),
          maximum_features_length=maximum_features_length,
          maximum_labels_length=maximum_labels_length,
          features_length_fn=self._get_features_length,
          labels_length_fn=self._get_labels_length)
    else:
      dataset = data.inference_pipeline(
          dataset,
          batch_size,
          process_fn=process_fn,
          num_threads=num_threads,
          prefetch_buffer_size=prefetch_buffer_size)

    iterator = dataset.make_initializable_iterator()

    # Add the initializer to a standard collection for it to be initialized.
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer)

    return iterator.get_next()
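A usage sketch for the inference path of this variant (placeholder names): when no labels file is given, the returned element is the features structure only.

features = inputter._input_fn_impl(
    tf.estimator.ModeKeys.PREDICT,
    batch_size=32,
    metadata=metadata,
    features_file="test.src",
    prefetch_buffer_size=1)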
Example #4
    def make_training_dataset(
        self,
        features_file,
        labels_file,
        batch_size,
        batch_type="examples",
        batch_multiplier=1,
        batch_size_multiple=1,
        shuffle_buffer_size=None,
        length_bucket_width=None,
        maximum_features_length=None,
        maximum_labels_length=None,
        single_pass=False,
        num_shards=1,
        shard_index=0,
        num_threads=4,
        prefetch_buffer_size=None,
        cardinality_multiple=1,
        weights=None,
        batch_autotune_mode=False,
    ):
        """Builds a dataset to be used for training. It supports the full training
        pipeline, including:

        * sharding
        * shuffling
        * filtering
        * bucketing
        * prefetching

        Args:
          features_file: The source file or a list of training source files.
          labels_file: The target file or a list of training target files.
          batch_size: The batch size to use.
          batch_type: The training batching strategy to use: can be "examples" or
            "tokens".
          batch_multiplier: The batch size multiplier to prepare splitting across
            replicated graph parts.
          batch_size_multiple: When :obj:`batch_type` is "tokens", ensure that the
            resulting batch size is a multiple of this value.
          shuffle_buffer_size: The number of elements from which to sample.
          length_bucket_width: The width of the length buckets to select batch
            candidates from (for efficiency). Set ``None`` to not constrain batch
            formation.
          maximum_features_length: The maximum length or list of maximum lengths of
            the features sequence(s). ``None`` to not constrain the length.
          maximum_labels_length: The maximum length of the labels sequence.
            ``None`` to not constrain the length.
          single_pass: If ``True``, makes a single pass over the training data.
          num_shards: The number of data shards (usually the number of workers in a
            distributed setting).
          shard_index: The shard index this data pipeline should read from.
          num_threads: The number of elements processed in parallel.
          prefetch_buffer_size: The number of batches to prefetch asynchronously. If
            ``None``, use an automatically tuned value.
          cardinality_multiple: Ensure that the dataset cardinality is a multiple of
            this value when :obj:`single_pass` is ``True``.
          weights: An optional list of weights to create a weighted dataset out of
            multiple training files.
          batch_autotune_mode: When enabled, all batches are padded to the maximum
            sequence length.

        Returns:
          A ``tf.data.Dataset``.

        See Also:
          :func:`opennmt.data.training_pipeline`
        """
        if labels_file is not None:
            data_files = [features_file, labels_file]
            maximum_length = [maximum_features_length, maximum_labels_length]
            features_length_fn = self.features_inputter.get_length
            labels_length_fn = self.labels_inputter.get_length
        else:
            data_files = features_file
            maximum_length = maximum_features_length
            features_length_fn = self.get_length
            labels_length_fn = None

        dataset = self.make_dataset(data_files, training=True)

        map_fn = lambda *arg: self.make_features(
            element=misc.item_or_tuple(arg), training=True)
        filter_fn = lambda *arg: (self.keep_for_training(
            misc.item_or_tuple(arg), maximum_length=maximum_length))
        transform_fns = [
            lambda dataset: dataset.map(map_fn,
                                        num_parallel_calls=num_threads or 4),
            lambda dataset: dataset.filter(filter_fn),
        ]

        if batch_autotune_mode:
            # In this mode we want to return batches where all sequences are padded
            # to the maximum possible length in order to maximize the memory usage.
            # Shuffling, sharding, prefetching, etc. are not applied since correctness and
            # performance are not important.

            if isinstance(dataset, list):  # Ignore weighted dataset.
                dataset = dataset[0]

            # We repeat the dataset now to ensure full batches are always returned.
            dataset = dataset.repeat()
            for transform_fn in transform_fns:
                dataset = dataset.apply(transform_fn)

            # length_fn returns the maximum length instead of the actual example length so
            # that batches are built as if each example has the maximum length.
            if labels_file is not None:
                constant_length_fn = [
                    lambda x: maximum_features_length,
                    lambda x: maximum_labels_length,
                ]
            else:
                constant_length_fn = lambda x: maximum_features_length

            # The length dimension is set to the maximum length in the padded shapes.
            padded_shapes = self.get_padded_shapes(
                dataset.element_spec, maximum_length=maximum_length)
            dataset = dataset.apply(
                dataset_util.batch_sequence_dataset(
                    batch_size,
                    batch_type=batch_type,
                    batch_multiplier=batch_multiplier,
                    length_bucket_width=1,
                    length_fn=constant_length_fn,
                    padded_shapes=padded_shapes,
                ))
            return dataset

        if weights is not None:
            dataset = (dataset, weights)
        dataset = dataset_util.training_pipeline(
            batch_size,
            batch_type=batch_type,
            batch_multiplier=batch_multiplier,
            batch_size_multiple=batch_size_multiple,
            transform_fns=transform_fns,
            length_bucket_width=length_bucket_width,
            features_length_fn=features_length_fn,
            labels_length_fn=labels_length_fn,
            single_pass=single_pass,
            num_shards=num_shards,
            shard_index=shard_index,
            num_threads=num_threads,
            dataset_size=self.get_dataset_size(data_files),
            shuffle_buffer_size=shuffle_buffer_size,
            prefetch_buffer_size=prefetch_buffer_size,
            cardinality_multiple=cardinality_multiple,
        )(dataset)
        return dataset
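A hedged usage sketch for make_training_dataset: inputter stands for an object exposing this method (e.g. an OpenNMT-tf example inputter), the file names and numeric values are placeholders, and the element structure of the resulting batches depends on the inputter.

dataset = inputter.make_training_dataset(
    "train.src",
    "train.tgt",
    batch_size=3072,
    batch_type="tokens",
    shuffle_buffer_size=500000,
    length_bucket_width=1,
    maximum_features_length=100,
    maximum_labels_length=100)
print(dataset.element_spec)
for batch in dataset.take(1):
    print(batch)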
Example #5
  def _input_fn_impl(self,
                     mode,
                     batch_size,
                     metadata,
                     features_file,
                     labels_file=None,
                     batch_type="examples",
                     batch_multiplier=1,
                     bucket_width=None,
                     single_pass=False,
                     num_threads=None,
                     sample_buffer_size=None,
                     prefetch_buffer_size=None,
                     maximum_features_length=None,
                     maximum_labels_length=None):
    """See ``input_fn``."""
    self._initialize(metadata)

    feat_dataset, feat_process_fn = self._get_features_builder(features_file)

    if labels_file is None:
      dataset = feat_dataset
      # Parallel inputs must be caught in a single tuple and not treated as multiple arguments.
      process_fn = lambda *arg: feat_process_fn(item_or_tuple(arg))
    else:
      labels_dataset, labels_process_fn = self._get_labels_builder(labels_file)

      dataset = tf.data.Dataset.zip((feat_dataset, labels_dataset))
      process_fn = lambda features, labels: (
          feat_process_fn(features), labels_process_fn(labels))

    if mode == tf.estimator.ModeKeys.TRAIN:
      dataset_size = self._get_dataset_size(features_file)
      if sample_buffer_size < dataset_size:
        # When the sample buffer size is smaller than the dataset size, shard
        # the dataset in a random order. This ensures that all parts of the
        # dataset can be seen when the evaluation frequency is high.
        dataset = dataset.apply(data.random_shard(sample_buffer_size, dataset_size))
      dataset = dataset.shuffle(sample_buffer_size)
      dataset = dataset.map(
          process_fn,
          num_parallel_calls=num_threads or 4)
      dataset = dataset.apply(data.filter_examples_by_length(
          maximum_features_length=maximum_features_length,
          maximum_labels_length=maximum_labels_length,
          features_length_fn=self._get_features_length,
          labels_length_fn=self._get_labels_length))
      dataset = dataset.apply(data.batch_parallel_dataset(
          batch_size,
          batch_type=batch_type,
          batch_multiplier=batch_multiplier,
          bucket_width=bucket_width,
          features_length_fn=self._get_features_length,
          labels_length_fn=self._get_labels_length))
      dataset = dataset.apply(data.filter_irregular_batches(batch_multiplier))
      if not single_pass:
        dataset = dataset.repeat()
    else:
      dataset = dataset.map(
          process_fn,
          num_parallel_calls=num_threads or 1)
      dataset = dataset.apply(data.batch_parallel_dataset(batch_size))

    if prefetch_buffer_size:
      dataset = dataset.prefetch(prefetch_buffer_size)

    return dataset
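A usage sketch for this variant (placeholder names), which returns the tf.data.Dataset directly; a sample buffer smaller than the dataset size triggers the random sharding branch above.

dataset = inputter._input_fn_impl(
    tf.estimator.ModeKeys.TRAIN,
    batch_size=64,
    metadata=metadata,
    features_file="train.src",
    labels_file="train.tgt",
    sample_buffer_size=100000,  # assumed to be smaller than the dataset size
    prefetch_buffer_size=1)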
Example #6
    def make_training_dataset(self,
                              features_file,
                              labels_file,
                              batch_size,
                              batch_type="examples",
                              batch_multiplier=1,
                              batch_size_multiple=1,
                              shuffle_buffer_size=None,
                              length_bucket_width=None,
                              maximum_features_length=None,
                              maximum_labels_length=None,
                              single_pass=False,
                              num_shards=1,
                              shard_index=0,
                              num_threads=4,
                              prefetch_buffer_size=None,
                              cardinality_multiple=1,
                              weights=None):
        """Builds a dataset to be used for training. It supports the full training
    pipeline, including:

    * sharding
    * shuffling
    * filtering
    * bucketing
    * prefetching

    Args:
      features_file: The source file or a list of training source files.
      labels_file: The target file or a list of training target files.
      batch_size: The batch size to use.
      batch_type: The training batching strategy to use: can be "examples" or
        "tokens".
      batch_multiplier: The batch size multiplier to prepare splitting across
        replicated graph parts.
      batch_size_multiple: When :obj:`batch_type` is "tokens", ensure that the
        resulting batch size is a multiple of this value.
      shuffle_buffer_size: The number of elements from which to sample.
      length_bucket_width: The width of the length buckets to select batch
        candidates from (for efficiency). Set ``None`` to not constrain batch
        formation.
      maximum_features_length: The maximum length or list of maximum lengths of
        the features sequence(s). ``None`` to not constrain the length.
      maximum_labels_length: The maximum length of the labels sequence.
        ``None`` to not constrain the length.
      single_pass: If ``True``, makes a single pass over the training data.
      num_shards: The number of data shards (usually the number of workers in a
        distributed setting).
      shard_index: The shard index this data pipeline should read from.
      num_threads: The number of elements processed in parallel.
      prefetch_buffer_size: The number of batches to prefetch asynchronously. If
        ``None``, use an automatically tuned value.
      cardinality_multiple: Ensure that the dataset cardinality is a multiple of
        this value when :obj:`single_pass` is ``True``.
      weights: An optional list of weights to create a weighted dataset out of
        multiple training files.

    Returns:
      A ``tf.data.Dataset``.

    See Also:
      :func:`opennmt.data.training_pipeline`
    """
        if labels_file is not None:
            data_files = [features_file, labels_file]
            maximum_length = [maximum_features_length, maximum_labels_length]
            features_length_fn = self.features_inputter.get_length
            labels_length_fn = self.labels_inputter.get_length
        else:
            data_files = features_file
            maximum_length = maximum_features_length
            features_length_fn = self.get_length
            labels_length_fn = None

        map_fn = lambda *arg: self.make_features(
            element=misc.item_or_tuple(arg), training=True)
        filter_fn = lambda *arg: (self.keep_for_training(
            misc.item_or_tuple(arg), maximum_length=maximum_length))
        transform_fns = [
            lambda dataset: dataset.map(map_fn,
                                        num_parallel_calls=num_threads or 4),
            lambda dataset: dataset.filter(filter_fn)
        ]

        dataset = self.make_dataset(data_files, training=True)
        if weights is not None:
            dataset = (dataset, weights)
        dataset = dataset_util.training_pipeline(
            batch_size,
            batch_type=batch_type,
            batch_multiplier=batch_multiplier,
            batch_size_multiple=batch_size_multiple,
            transform_fns=transform_fns,
            length_bucket_width=length_bucket_width,
            features_length_fn=features_length_fn,
            labels_length_fn=labels_length_fn,
            single_pass=single_pass,
            num_shards=num_shards,
            shard_index=shard_index,
            num_threads=num_threads,
            dataset_size=self.get_dataset_size(data_files),
            shuffle_buffer_size=shuffle_buffer_size,
            prefetch_buffer_size=prefetch_buffer_size,
            cardinality_multiple=cardinality_multiple)(dataset)
        return dataset
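A sketch of the weighted multi-corpus option documented above (file names and weights are placeholders): passing lists of files together with weights builds a weighted dataset.

dataset = inputter.make_training_dataset(
    ["corpus_a.src", "corpus_b.src"],
    ["corpus_a.tgt", "corpus_b.tgt"],
    batch_size=3072,
    batch_type="tokens",
    shuffle_buffer_size=500000,
    weights=[0.7, 0.3])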
Example #7
 def _map_fn(*arg):
   features = self.make_features(element=misc.item_or_tuple(arg), training=False)
   if isinstance(features, (list, tuple)):
     # Special case for unsupervised inputters that always return a tuple (features, labels).
     return features[0]
   return features
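A small follow-up sketch (the dataset variable is a placeholder): mapping an inference dataset through _map_fn so that only the features are kept.

 dataset = dataset.map(_map_fn, num_parallel_calls=4)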