Example #1
  def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
      dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
      # Shuffle the input files.
      dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
      dataset = dataset.repeat()

    # Read the data from disk in parallel
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
      dataset = dataset.cache()

    if self.is_training:
      dataset = dataset.shuffle(self._shuffle_buffer_size)
      dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
      # The batch size of the dataset will be multiplied by the number of
      # replicas automatically when strategy.distribute_datasets_from_function
      # is called, so we use local batch size here.
      dataset = dataset.batch(self.local_batch_size,
                              drop_remainder=self.is_training)
    else:
      dataset = dataset.batch(self.global_batch_size,
                              drop_remainder=self.is_training)

    # Apply Mixup
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
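The comment about the local batch size above assumes the returned dataset is later wrapped by a distribution strategy. Below is a minimal, hypothetical sketch (the range dataset and LOCAL_BATCH_SIZE constant are stand-ins, not part of the original class) of how `strategy.distribute_datasets_from_function` consumes such a per-replica-batched pipeline:

import tensorflow as tf

LOCAL_BATCH_SIZE = 32  # assumed per-replica batch size

def dataset_fn(input_context: tf.distribute.InputContext) -> tf.data.Dataset:
  # Stand-in for `pipeline`: shard per input pipeline and batch with the
  # per-replica ("local") batch size; tf.distribute multiplies it by the
  # number of replicas to obtain the global batch size.
  ds = tf.data.Dataset.range(1024)
  ds = ds.shard(input_context.num_input_pipelines,
                input_context.input_pipeline_id)
  return ds.batch(LOCAL_BATCH_SIZE, drop_remainder=True)

strategy = tf.distribute.MirroredStrategy()
dist_ds = strategy.distribute_datasets_from_function(dataset_fn)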
Example #2
 def _maybe_apply_data_service(
     self,
     dataset: tf.data.Dataset,
     input_context: Optional[tf.distribute.InputContext] = None
 ) -> tf.data.Dataset:
     """Potentially distributes a dataset."""
     if self._enable_tf_data_service and input_context:
         if self._enable_round_robin_tf_data_service:
             replicas_per_input_pipeline = input_context.num_replicas_in_sync // (
                 input_context.num_input_pipelines)
             base_consumer_index = input_context.input_pipeline_id * (
                 replicas_per_input_pipeline)
             num_consumers = input_context.num_input_pipelines * (
                 replicas_per_input_pipeline)
             range_dataset = tf.data.Dataset.range(
                 replicas_per_input_pipeline)
             tfds_kwargs = {
                 'processing_mode': 'parallel_epochs',
                 'service': self._tf_data_service_address,
                 'job_name': self._tf_data_service_job_name,
                 'num_consumers': num_consumers
             }
             if self._enable_shared_tf_data_service_between_parallel_trainers:
                 raise ValueError(
                     'Shared tf.data service does not support round-robin'
                     ' tf.data service.')
             dataset = range_dataset.map(lambda i: dataset.apply(  # pylint: disable=g-long-lambda
                 tf.data.experimental.service.
                 distribute(consumer_index=base_consumer_index + i,
                            **tfds_kwargs)))
             # Use parallel interleave to read multiple batches from a tf.data
             # service worker in parallel.
             dataset = dataset.interleave(
                 lambda x: x,
                 cycle_length=replicas_per_input_pipeline,
                 num_parallel_calls=replicas_per_input_pipeline,
                 deterministic=True)
         else:
             tfds_kwargs = {
                 'processing_mode': 'parallel_epochs',
                 'service': self._tf_data_service_address,
                 'job_name': self._tf_data_service_job_name,
             }
             if self._enable_shared_tf_data_service_between_parallel_trainers:
                 tfds_kwargs.update({
                     'processing_mode':
                     tf.data.experimental.service.ShardingPolicy.OFF,
                     'cross_trainer_cache':
                     tf.data.experimental.service.CrossTrainerCache(
                         trainer_id=self._trainer_id)
                 })
             dataset = dataset.apply(
                 tf.data.experimental.service.distribute(**tfds_kwargs))
     return dataset
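For context, here is a minimal, self-contained sketch of the `tf.data.experimental.service.distribute` call used above, with an in-process dispatcher and worker standing in for the real service address (the job name is made up for illustration):

import tensorflow as tf

# Spin up an in-process dispatcher and worker purely for illustration; in the
# trainer above, `self._tf_data_service_address` points at a real cluster.
dispatcher = tf.data.experimental.service.DispatchServer()
worker = tf.data.experimental.service.WorkerServer(
    tf.data.experimental.service.WorkerConfig(
        dispatcher_address=dispatcher.target.split('://')[1]))

dataset = tf.data.Dataset.range(8)
dataset = dataset.apply(
    tf.data.experimental.service.distribute(
        processing_mode='parallel_epochs',
        service=dispatcher.target,
        job_name='example_job'))
print(sorted(dataset.as_numpy_iterator()))  # [0, 1, ..., 7]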
Example #3
File: dataset.py  Project: deepmind/acme
def episodes_to_timestep_batched_transitions(
        episode_dataset: tf.data.Dataset,
        return_horizon: int = 10,
        drop_return_horizon: bool = False,
        min_return_filter: Optional[float] = None) -> tf.data.Dataset:
    """Process an existing dataset converting it to episode to 3-transitions.

  A 3-transition is a Transition with each attribute having an extra dimension
  of size 3, representing 3 consecutive timesteps. Each 3-step object will be
  in random order relative to each other.  See `episode_to_timestep_batch` for
  more information.

  Args:
    episode_dataset: An RLDS dataset to process.
    return_horizon: The horizon over which to calculate Monte-Carlo returns.
    drop_return_horizon: Whether we should drop the last `return_horizon` steps.
    min_return_filter: Minimum episode return below which we drop an episode.

  Returns:
    A tf.data.Dataset of 3-transitions.
  """
    dataset = episode_dataset.interleave(
        functools.partial(episode_to_timestep_batch,
                          return_horizon=return_horizon,
                          drop_return_horizon=drop_return_horizon,
                          calculate_episode_return=min_return_filter
                          is not None),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        deterministic=False)

    if min_return_filter is not None:

        def filter_on_return(step):
            return step[EPISODE_RETURN][0][0] > min_return_filter

        dataset = dataset.filter(filter_on_return)

    dataset = dataset.map(_step_to_transition,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return dataset
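For orientation, a hypothetical usage sketch of the helper above; the dataset name is illustrative only and assumes an RLDS-format episode dataset available through TFDS:

import tensorflow_datasets as tfds

# Illustrative only: any RLDS-format dataset of episodes would work here.
episodes = tfds.load('d4rl_mujoco_halfcheetah/v0-medium', split='train')
transitions = episodes_to_timestep_batched_transitions(
    episodes, return_horizon=10, min_return_filter=0.0)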
Example #4
 def _maybe_apply_data_service(
     self,
     dataset: tf.data.Dataset,
     input_context: Optional[tf.distribute.InputContext] = None
 ) -> tf.data.Dataset:
     """Potentially distributes a dataset."""
     if self._enable_tf_data_service and input_context:
         if self._enable_round_robin_tf_data_service:
             replicas_per_input_pipeline = input_context.num_replicas_in_sync // (
                 input_context.num_input_pipelines)
             base_consumer_index = input_context.input_pipeline_id * (
                 replicas_per_input_pipeline)
             num_consumers = input_context.num_input_pipelines * (
                 replicas_per_input_pipeline)
             range_dataset = tf.data.Dataset.range(
                 replicas_per_input_pipeline)
             dataset = range_dataset.map(lambda i: dataset.apply(  # pylint: disable=g-long-lambda
                 tf.data.experimental.service.distribute(
                     processing_mode='parallel_epochs',
                     service=self._tf_data_service_address,
                     job_name=self._tf_data_service_job_name,
                     consumer_index=base_consumer_index + i,
                     num_consumers=num_consumers)))
             # Use parallel interleave to read multiple batches from a tf.data
             # service worker in parallel.
             dataset = dataset.interleave(
                 lambda x: x,
                 cycle_length=replicas_per_input_pipeline,
                 num_parallel_calls=replicas_per_input_pipeline,
                 deterministic=True)
         else:
             dataset = dataset.apply(
                 tf.data.experimental.service.distribute(
                     processing_mode='parallel_epochs',
                     service=self._tf_data_service_address,
                     job_name=self._tf_data_service_job_name))
     return dataset
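To make the consumer-index arithmetic in the round-robin branch concrete, here is a small worked example (the replica and pipeline counts are assumptions chosen only for illustration):

num_replicas_in_sync = 8   # assumed total replicas across all hosts
num_input_pipelines = 2    # assumed number of input pipelines (hosts)

replicas_per_input_pipeline = num_replicas_in_sync // num_input_pipelines  # 4
num_consumers = num_input_pipelines * replicas_per_input_pipeline          # 8

for input_pipeline_id in range(num_input_pipelines):
    base = input_pipeline_id * replicas_per_input_pipeline
    consumers = [base + i for i in range(replicas_per_input_pipeline)]
    print(input_pipeline_id, consumers)  # 0 -> [0, 1, 2, 3], 1 -> [4, 5, 6, 7]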
Example #5
    def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
        if (self.config.builder != 'tfds' and self.input_context
                and self.input_context.num_input_pipelines > 1):
            dataset = dataset.shard(self.input_context.num_input_pipelines,
                                    self.input_context.input_pipeline_id)
            logging.info(
                'Sharding the dataset: input_pipeline_id=%d '
                'num_input_pipelines=%d',
                self.input_context.input_pipeline_id,
                self.input_context.num_input_pipelines)

        if self.is_training and self.config.builder == 'records':
            # Shuffle the input files.
            dataset = dataset.shuffle(
                buffer_size=self.config.file_shuffle_buffer_size)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            dataset = dataset.interleave(
                tf.data.TFRecordDataset,
                cycle_length=10,
                block_length=1,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self.input_context and self.config.num_devices > 1:
            if not self.config.use_per_replica_batch_size:
                raise ValueError(
                    'The builder does not support a global batch size with more than '
                    'one replica. Got {} replicas. Please set a '
                    '`per_replica_batch_size` and enable '
                    '`use_per_replica_batch_size=True`.'.format(
                        self.config.num_devices))

            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        if self.config.tf_data_service:
            if not hasattr(tf.data.experimental, 'service'):
                raise ValueError(
                    'The tf_data_service flag requires Tensorflow version '
                    '>= 2.3.0, but the version is {}'.format(tf.__version__))
            dataset = dataset.apply(
                tf.data.experimental.service.distribute(
                    processing_mode='parallel_epochs',
                    service=self.config.tf_data_service,
                    job_name='resnet_train'))
            dataset = dataset.prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE)

        return dataset
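The sharding branch above splits the file dataset across input pipelines. A tiny standalone sketch of what `Dataset.shard` does (the file names are invented for illustration):

import tensorflow as tf

files = tf.data.Dataset.from_tensor_slices(
    ['file-0', 'file-1', 'file-2', 'file-3', 'file-4', 'file-5'])
# Input pipeline 1 of 2 keeps every second element, starting at index 1.
print(list(files.shard(num_shards=2, index=1).as_numpy_iterator()))
# [b'file-1', b'file-3', b'file-5']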
Example #6
    def pipeline(
            self,
            dataset: tf.data.Dataset,
            input_context: tf.distribute.InputContext = None
    ) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

        Args:
          dataset: A `tf.data.Dataset` that loads raw files.
          input_context: An optional context provided by `tf.distribute` for
            cross-replica training. This isn't necessary if using Keras
            compile/fit.

        Returns:
          A TensorFlow dataset outputting batched images and labels.
        """
        if input_context and input_context.num_input_pipelines > 1:
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
            dataset = dataset.interleave(
                lambda name: tf.data.TFRecordDataset(name,
                                                     buffer_size=buffer_size),
                cycle_length=16,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(self.global_batch_size)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.batch(self.batch_size,
                                drop_remainder=self.is_training)

        # Note: we could do image normalization here, but we defer it to the model
        # which can perform it much faster on a GPU/TPU
        # TODO(dankondratyuk): if we fix prefetching, we can do it here

        if self.is_training and self.config.deterministic_train is not None:
            options = tf.data.Options()
            options.experimental_deterministic = self.config.deterministic_train
            options.experimental_slack = self.config.use_slack
            options.experimental_optimization.parallel_batch = True
            options.experimental_optimization.map_fusion = True
            options.experimental_optimization.map_vectorization.enabled = True
            options.experimental_optimization.map_parallelization = True
            dataset = dataset.with_options(options)

        # Prefetch overlaps in-feed with training
        # Note: autotune here is not recommended, as this can lead to memory leaks.
        # Instead, use a constant prefetch size like the number of devices.
        dataset = dataset.prefetch(self.config.num_devices)

        return dataset
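The deterministic and slack settings above are plain `tf.data.Options`; a minimal sketch of setting and reading them on a toy dataset (the values here are chosen only for illustration):

import tensorflow as tf

options = tf.data.Options()
options.experimental_deterministic = False  # allow out-of-order elements
options.experimental_slack = True           # add slack to the final prefetch

ds = tf.data.Dataset.range(8).map(lambda x: x * 2).with_options(options)
print(ds.options().experimental_deterministic)  # False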
Example #7
    def pipeline(
            self,
            dataset: tf.data.Dataset,
            input_context: tf.distribute.InputContext = None
    ) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.
      input_context: An optional context provided by `tf.distribute` for
        cross-replica training. If set with more than one replica, this
        function assumes `use_per_replica_batch_size=True`.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
        if input_context and input_context.num_input_pipelines > 1:
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
            dataset = dataset.interleave(
                lambda name: tf.data.TFRecordDataset(name,
                                                     buffer_size=buffer_size),
                cycle_length=16,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if input_context and self.config.num_devices > 1:
            if not self.config.use_per_replica_batch_size:
                raise ValueError(
                    'The builder does not support a global batch size with more than '
                    'one replica. Got {} replicas. Please set a '
                    '`per_replica_batch_size` and enable '
                    '`use_per_replica_batch_size=True`.'.format(
                        self.config.num_devices))

            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        if self.is_training:
            options = tf.data.Options()
            options.experimental_deterministic = self.config.deterministic_train
            options.experimental_slack = self.config.use_slack
            options.experimental_optimization.parallel_batch = True
            options.experimental_optimization.map_fusion = True
            options.experimental_optimization.map_vectorization.enabled = True
            options.experimental_optimization.map_parallelization = True
            dataset = dataset.with_options(options)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset
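`drop_remainder=self.is_training` is what gives the training pipeline a statically known batch dimension; a small sketch of the difference:

import tensorflow as tf

ds = tf.data.Dataset.range(10)
print(ds.batch(4, drop_remainder=True).element_spec)
# TensorSpec(shape=(4,), dtype=tf.int64, name=None)  -- static batch dimension
print(ds.batch(4, drop_remainder=False).element_spec)
# TensorSpec(shape=(None,), dtype=tf.int64, name=None)  -- last batch may be partial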
Example #8
 def transform(self, ds: tf.data.Dataset) -> tf.data.Dataset:
     ds = ds.interleave(self.item_to_dataset,
                        cycle_length=self.cycle_length,
                        block_length=self.block_length,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
     return ds
    def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
        # This can help resolve OOM issues when using only 1 GPU for training
        options = tf.data.Options()
        options.experimental_optimization.map_parallelization = (
            not self.disable_map_parallelization)
        dataset = dataset.with_options(options)

        if self._num_gpus > 1:
            # For multi-host training, we want each host to always process the same
            # subset of files.  Each host only sees a subset of the entire dataset,
            # allowing us to cache larger datasets in memory.
            dataset = dataset.shard(self._num_gpus, hvd.rank())

        if self.is_training:
            # Shuffle the input files.
            dataset = dataset.shuffle(
                buffer_size=self._file_shuffle_buffer_size)

        if self.is_training and not self._cache:
            dataset = dataset.repeat()

        # Read the data from disk in parallel
        dataset = dataset.interleave(
            tf.data.TFRecordDataset,
            cycle_length=10,
            block_length=1,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self._cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self._shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        preprocess = self.parse_record
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        if self._num_gpus > 1:
            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        # Apply Mixup/CutMix only during training if requested in the data
        # pipeline; otherwise they will be applied in the model module on device.
        mixup_alpha = self.mixup_alpha if self.is_training else 0.0
        cutmix_alpha = self.cutmix_alpha if self.is_training else 0.0
        dataset = dataset.map(functools.partial(mixing, self.local_batch_size,
                                                mixup_alpha, cutmix_alpha,
                                                self.defer_img_mixing),
                              num_parallel_calls=64)

        # Assign static batch size dimension
        # dataset = dataset.map(
        #     functools.partial(self.set_shapes, batch_size),
        #     num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset
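The `mixing` map above relies on helpers that are not shown in this snippet. As a rough idea of what a mixup-style transform does, here is a minimal sketch (`mixup_batch` is an illustrative name, not the repository's `mixing` function; it assumes one-hot float labels and `alpha > 0`):

import tensorflow as tf

def mixup_batch(images: tf.Tensor, labels: tf.Tensor, alpha: float = 0.2):
  """Minimal mixup sketch (not the repository's `mixing` helper)."""
  batch_size = tf.shape(images)[0]
  # Sample lambda ~ Beta(alpha, alpha) from two Gamma draws.
  g1 = tf.random.gamma([batch_size], alpha)
  g2 = tf.random.gamma([batch_size], alpha)
  lam = g1 / (g1 + g2)
  lam_img = tf.reshape(lam, [-1, 1, 1, 1])
  lam_lab = tf.reshape(lam, [-1, 1])
  # Blend each example with the batch in reverse order.
  mixed_images = lam_img * images + (1.0 - lam_img) * tf.reverse(images, axis=[0])
  mixed_labels = lam_lab * labels + (1.0 - lam_lab) * tf.reverse(labels, axis=[0])
  return mixed_images, mixed_labels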