Example #1
 def apply(self, dataset: tf.data.Dataset, mode: str = None):
     # pylint: disable=unused-argument
     if mode is not None and self.modes is not None and mode not in self.modes:
         LOGGER.info(f"Not applying {self} (mode={mode})")
         return dataset
     if self.filename:
         return dataset.cache(self.filename)
     else:
         return dataset.cache()
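A minimal usage sketch for the transform above; the CacheTransform class name, its constructor, and the logger-free body are assumptions, only the mode check and the cache call mirror the snippet.

import tensorflow as tf

class CacheTransform:  # hypothetical stand-in for the snippet's class
    def __init__(self, filename=None, modes=None):
        self.filename = filename
        self.modes = modes

    def apply(self, dataset, mode=None):
        if mode is not None and self.modes is not None and mode not in self.modes:
            return dataset  # skip caching for modes this transform was not configured for
        return dataset.cache(self.filename) if self.filename else dataset.cache()

ds = CacheTransform(modes=["train"]).apply(tf.data.Dataset.range(10), mode="train")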
Example #2
 def transform(self, ds: tf.data.Dataset) -> tf.data.Dataset:
     if self.cache_dir is None:
         return ds
     elif self.cache_dir == "":
         log.info("Using memory cache for %s", ds)
         ds = ds.cache()
     else:
         cache_path = os.path.join(self.cache_dir, "cache")
         log.info("Using cache path:[%s] for %s", self.cache_dir, ds)
         tf.io.gfile.makedirs(os.path.dirname(cache_path))
         ds = ds.cache(cache_path)
     return ds
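The three branches above map directly onto tf.data semantics: no caching, cache() in memory, and cache(path) on disk. A small self-contained sketch of the same behavior, with a temporary directory standing in for cache_dir:

import os
import tempfile
import tensorflow as tf

ds = tf.data.Dataset.range(5)
ds_memory = ds.cache()  # equivalent to the cache_dir == "" branch

cache_dir = tempfile.mkdtemp()                 # illustrative location
cache_path = os.path.join(cache_dir, "cache")
tf.io.gfile.makedirs(os.path.dirname(cache_path))
ds_file = ds.cache(cache_path)                 # writes cache files alongside `cache_path`

_ = list(ds_file.as_numpy_iterator())          # the cache is populated on the first full pass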
Example #3
    def process(self, dataset: tf.data.Dataset, batch_size: int):
        dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)

        if self.cache:
            dataset = dataset.cache()

        if self.shuffle:
            dataset = dataset.shuffle(self.buffer_size,
                                      reshuffle_each_iteration=True)

        # PADDED BATCH the dataset
        dataset = dataset.padded_batch(
            batch_size=batch_size,
            padded_shapes=(
                tf.TensorShape([]),
                tf.TensorShape(self.speech_featurizer.shape),
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([]),
            ),
            padding_values=("", 0., 0, self.text_featurizer.blank, 0,
                            self.text_featurizer.blank, 0),
            drop_remainder=self.drop_remainder)

        # PREFETCH to improve the speed of the input pipeline
        dataset = dataset.prefetch(AUTOTUNE)
        self.total_steps = get_num_batches(self.total_steps, batch_size)
        return dataset
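The padded_batch call above pads each tuple component to a per-batch shape and fills it with a matching padding value. A toy, self-contained sketch of the same pattern (the shapes and values here are illustrative, not the featurizer outputs from the snippet):

import tensorflow as tf

# Elements are (length, tokens) pairs with variable-length token vectors.
ds = tf.data.Dataset.range(1, 4).map(lambda n: (n, tf.range(n)))
ds = ds.padded_batch(
    batch_size=2,
    padded_shapes=(tf.TensorShape([]), tf.TensorShape([None])),
    padding_values=(tf.constant(0, tf.int64), tf.constant(-1, tf.int64)),
    drop_remainder=False)
for lengths, tokens in ds:
    print(lengths.numpy(), tokens.numpy())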
Example #4
  def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
      dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
      # Shuffle the input files.
      dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
      dataset = dataset.repeat()

    # Read the data from disk in parallel
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
      dataset = dataset.cache()

    if self.is_training:
      dataset = dataset.shuffle(self._shuffle_buffer_size)
      dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
      # The batch size of the dataset will be multiplied by the number of
      # replicas automatically when strategy.distribute_datasets_from_function
      # is called, so we use local batch size here.
      dataset = dataset.batch(self.local_batch_size,
                              drop_remainder=self.is_training)
    else:
      dataset = dataset.batch(self.global_batch_size,
                              drop_remainder=self.is_training)

    # Apply Mixup
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
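When cache() is enabled in a pipeline like the one above, the ordering matters: shuffle and repeat come after the cache so it is written exactly once on the first epoch. A minimal sketch of that ordering with a toy in-memory dataset (sizes are illustrative):

import tensorflow as tf

ds = tf.data.Dataset.range(100)     # toy stand-in for the parsed record stream
ds = ds.cache()                     # epoch 1 fills the cache; later epochs read from it
ds = ds.shuffle(64)                 # shuffling after cache gives a fresh order each epoch
ds = ds.repeat()
ds = ds.batch(8, drop_remainder=True)
ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
for batch in ds.take(2):
    print(batch.shape)              # (8,)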
Example #5
 def prepare_ds(dataset: tf.data.Dataset,
                config: HyperparameterDict) -> tf.data.Dataset:
     # Cast to float
     dataset = dataset.map(lambda x: tf.cast(x, tf.float32),
                           num_parallel_calls=tf.data.experimental.AUTOTUNE)
     dataset = dataset.map(lambda x: config['rescaling'](x),
                           num_parallel_calls=tf.data.experimental.AUTOTUNE)
     dataset = dataset.map(config['resizing'],
                           num_parallel_calls=tf.data.experimental.AUTOTUNE)
     if config['cache_data']:
         # As the dataset fits in memory, cache before shuffling for better performance.
         dataset = dataset.cache()
     # For true randomness, set the shuffle buffer to the full dataset size.
     dataset = dataset.shuffle(1000)
     dataset = dataset.batch(config['batch_size'])
     dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
     return dataset
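A usage sketch for prepare_ds; the config keys are inferred from how the snippet reads them, and the values below are stand-ins rather than the project's real HyperparameterDict.

import tensorflow as tf

config = {
    'rescaling': lambda x: x / 255.0,
    'resizing': lambda x: tf.image.resize(x, (224, 224)),
    'cache_data': True,
    'batch_size': 32,
}
images = tf.data.Dataset.from_tensor_slices(tf.zeros([8, 256, 256, 3], dtype=tf.uint8))
ds = prepare_ds(images, config)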
Example #6
def prepare_for_testing(data_set: tf.data.Dataset, batch_size, cache_path=''):
    if cache_path != '':
        cache_filename = 'dataset_test.tfcache'
        data_set = data_set.cache(''.join([cache_path, '/', cache_filename]))

    data_set = data_set.repeat()
    data_set = data_set.batch(batch_size=batch_size)

    return data_set
Example #7
 def create_dataset(self, dataset: tf.data.Dataset,
                    input_columns, output_columns,
                    batch_size: int, use_cache: bool):
     dataset = self.add_feature_columns_to_dataset(dataset, input_columns, output_columns)
     if use_cache:
         dataset = dataset.cache("cache").repeat()
     dataset = dataset.shuffle(1000, reshuffle_each_iteration=True)
     dataset = dataset.batch(batch_size, drop_remainder=True)
     return dataset
Example #8
def _prepare_test_dataset(dataset: tf.data.Dataset, batch_size, cache_path=''):
    if cache_path != '':
        cache_filename = 'dataset_test.tfcache'
        dataset = dataset.cache(
            os.path.join(opt.data_path, cache_path, cache_filename))
        # dataset = dataset.cache(''.join([cache_path, '/', cache_filename]))

    dataset = dataset.repeat()
    dataset = dataset.batch(batch_size=batch_size)

    return dataset
Example #9
def prepare_for_training(data_set: tf.data.Dataset,
                         batch_size,
                         cache_path=None,
                         shuffle_buffer_size=1000):
    if cache_path:  # skip caching when cache_path is None or empty
        cache_filename = 'dataset_train.tfcache'
        data_set = data_set.cache(''.join([cache_path, '/', cache_filename]))

    data_set = data_set.shuffle(buffer_size=shuffle_buffer_size)
    # repeat forever
    data_set = data_set.repeat()
    data_set = data_set.batch(batch_size=batch_size)
    # `prefetch` lets the dataset fetch batches in the background
    # while the model is training.
    data_set = data_set.prefetch(buffer_size=AUTOTUNE)

    return data_set
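A usage sketch for prepare_for_training; AUTOTUNE is a module-level constant the snippet assumes, and the dataset here is a toy stand-in.

import tensorflow as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE   # assumed module-level constant in the snippet

ds = tf.data.Dataset.range(1000)
train_ds = prepare_for_training(ds, batch_size=32, cache_path='', shuffle_buffer_size=256)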
Example #10
    def cache(self, data: tf.data.Dataset) -> tf.data.Dataset:
        """
        Cache the dataset

        Parameters
        ----------
        data
            tensorflow dataset to cache

        Returns
        -------
        data_cached
            cached data
        """
        self._cache_file = _get_cache_fname(
            self.cache_dir,
            "-".join([self.__class__.__name__, self._subtype,
                      str(self.mode)]))
        data = data.cache(self._cache_file)
        return data
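The _get_cache_fname helper above is not shown in the snippet; a plausible stand-in that just joins the cache directory with a per-transform tag, shown here only to make the caching call concrete:

import os
import tempfile
import tensorflow as tf

def _illustrative_cache_fname(cache_dir, tag):
    # Hypothetical stand-in for `_get_cache_fname`; the real helper is not shown.
    tf.io.gfile.makedirs(cache_dir)
    return os.path.join(cache_dir, tag)

cache_file = _illustrative_cache_fname(tempfile.mkdtemp(), "MyTransform-images-train")
ds = tf.data.Dataset.range(10).cache(cache_file)
_ = list(ds.as_numpy_iterator())   # the first full pass writes the cache files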
Example #11
    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset that generates preloaded elements.

        Args:
            ds_input: Any `tf.data.Dataset` that generates examples as a dictionary of
                tensors. Should not be repeating infinitely.

        Returns:
            A dataset that generates the same examples.

            This is similar to prefetching, except that examples are yielded through a
            generator and loaded when this method is called rather than during pipeline
            iteration.
        """
        ds = ds_input.cache()

        # Preload examples from the input dataset and populate cache.
        self.examples = list(iter(ds))

        return ds
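The transform above relies on the fact that fully iterating a cached dataset once populates its cache, so later passes replay the stored elements. A minimal demonstration:

import tensorflow as tf

ds = tf.data.Dataset.range(3).map(lambda x: {"value": x})
cached = ds.cache()
examples = list(iter(cached))   # this first pass both yields and caches the elements
print(len(examples))            # 3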
Example #12
 def _optimize_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset:
     """Return a dataset with caching and prefetching enabled."""
     return dataset.cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
Example #13
    def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

        Args:
          dataset: A `tf.data.Dataset` that loads raw files.

        Returns:
          A TensorFlow dataset outputting batched images and labels.
        """
        if (self.config.builder != 'tfds' and self.input_context
                and self.input_context.num_input_pipelines > 1):
            dataset = dataset.shard(self.input_context.num_input_pipelines,
                                    self.input_context.input_pipeline_id)
            logging.info(
                'Sharding the dataset: input_pipeline_id=%d '
                'num_input_pipelines=%d',
                self.input_context.input_pipeline_id,
                self.input_context.num_input_pipelines)

        if self.is_training and self.config.builder == 'records':
            # Shuffle the input files.
            dataset = dataset.shuffle(
                buffer_size=self.config.file_shuffle_buffer_size)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            dataset = dataset.interleave(
                tf.data.TFRecordDataset,
                cycle_length=10,
                block_length=1,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self.input_context and self.config.num_devices > 1:
            if not self.config.use_per_replica_batch_size:
                raise ValueError(
                    'The builder does not support a global batch size with more than '
                    'one replica. Got {} replicas. Please set a '
                    '`per_replica_batch_size` and enable '
                    '`use_per_replica_batch_size=True`.'.format(
                        self.config.num_devices))

            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        if self.config.tf_data_service:
            if not hasattr(tf.data.experimental, 'service'):
                raise ValueError(
                    'The tf_data_service flag requires Tensorflow version '
                    '>= 2.3.0, but the version is {}'.format(tf.__version__))
            dataset = dataset.apply(
                tf.data.experimental.service.distribute(
                    processing_mode='parallel_epochs',
                    service=self.config.tf_data_service,
                    job_name='resnet_train'))
            dataset = dataset.prefetch(
                buffer_size=tf.data.experimental.AUTOTUNE)

        return dataset
Example #14
    def pipeline(
            self,
            dataset: tf.data.Dataset,
            input_context: tf.distribute.InputContext = None
    ) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

        Args:
          dataset: A `tf.data.Dataset` that loads raw files.
          input_context: An optional context provided by `tf.distribute` for
            cross-replica training. This isn't necessary if using Keras
            compile/fit.

        Returns:
          A TensorFlow dataset outputting batched images and labels.
        """
        if input_context and input_context.num_input_pipelines > 1:
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
            dataset = dataset.interleave(
                lambda name: tf.data.TFRecordDataset(name,
                                                     buffer_size=buffer_size),
                cycle_length=16,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(self.global_batch_size)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.batch(self.batch_size,
                                drop_remainder=self.is_training)

        # Note: we could do image normalization here, but we defer it to the model
        # which can perform it much faster on a GPU/TPU
        # TODO(dankondratyuk): if we fix prefetching, we can do it here

        if self.is_training and self.config.deterministic_train is not None:
            options = tf.data.Options()
            options.experimental_deterministic = self.config.deterministic_train
            options.experimental_slack = self.config.use_slack
            options.experimental_optimization.parallel_batch = True
            options.experimental_optimization.map_fusion = True
            options.experimental_optimization.map_vectorization.enabled = True
            options.experimental_optimization.map_parallelization = True
            dataset = dataset.with_options(options)

        # Prefetch overlaps in-feed with training
        # Note: autotune here is not recommended, as this can lead to memory leaks.
        # Instead, use a constant prefetch size like the number of devices.
        dataset = dataset.prefetch(self.config.num_devices)

        return dataset
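A small sketch of the input_context sharding used above; tf.distribute.InputContext can be constructed directly for illustration, though in practice tf.distribute supplies it.

import tensorflow as tf

ctx = tf.distribute.InputContext(num_input_pipelines=2, input_pipeline_id=0)
ds = tf.data.Dataset.range(10)
if ctx.num_input_pipelines > 1:
    ds = ds.shard(ctx.num_input_pipelines, ctx.input_pipeline_id)
print(list(ds.as_numpy_iterator()))   # [0, 2, 4, 6, 8]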
Example #15
def _optimize_dataset(ds: tf.data.Dataset):
    return ds.cache().prefetch(buffer_size=PREFETCH_BUFFER_SIZE)
Example #16
    def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

        Args:
          dataset: A `tf.data.Dataset` that loads raw files.

        Returns:
          A TensorFlow dataset outputting batched images and labels.
        """
        # This can help resolve OOM issues when using only 1 GPU for training
        options = tf.data.Options()
        options.experimental_optimization.map_parallelization = (
            not self.disable_map_parallelization)
        dataset = dataset.with_options(options)

        if self._num_gpus > 1:
            # For multi-host training, we want each host to always process the same
            # subset of files.  Each host only sees a subset of the entire dataset,
            # allowing us to cache larger datasets in memory.
            dataset = dataset.shard(self._num_gpus, hvd.rank())

        if self.is_training:
            # Shuffle the input files.
            dataset = dataset.shuffle(
                buffer_size=self._file_shuffle_buffer_size)

        if self.is_training and not self._cache:
            dataset = dataset.repeat()

        # Read the data from disk in parallel
        dataset = dataset.interleave(
            tf.data.TFRecordDataset,
            cycle_length=10,
            block_length=1,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self._cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self._shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        preprocess = self.parse_record
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        if self._num_gpus > 1:
            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        # apply Mixup/CutMix only during training, if requested in the data pipeline,
        # otherwise they will be applied in the model module on device
        mixup_alpha = self.mixup_alpha if self.is_training else 0.0
        cutmix_alpha = self.cutmix_alpha if self.is_training else 0.0
        dataset = dataset.map(functools.partial(mixing, self.local_batch_size,
                                                mixup_alpha, cutmix_alpha,
                                                self.defer_img_mixing),
                              num_parallel_calls=64)

        # Assign static batch size dimension
        # dataset = dataset.map(
        #     functools.partial(self.set_shapes, batch_size),
        #     num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset
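The Mixup/CutMix map above binds extra arguments with functools.partial before handing the callable to Dataset.map; a toy illustration of that pattern (the scale function is a stand-in, not the snippet's mixing):

import functools
import tensorflow as tf

def scale(factor, x):
    # Stand-in for `mixing`: the leading args are bound by partial, `x` comes from the dataset.
    return x * factor

ds = tf.data.Dataset.range(4)
ds = ds.map(functools.partial(scale, 10),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)
print(list(ds.as_numpy_iterator()))   # [0, 10, 20, 30]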
Example #17
    def pipeline(
            self,
            dataset: tf.data.Dataset,
            input_context: tf.distribute.InputContext = None
    ) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

        Args:
          dataset: A `tf.data.Dataset` that loads raw files.
          input_context: An optional context provided by `tf.distribute` for
            cross-replica training. If set with more than one replica, this
            function assumes `use_per_replica_batch_size=True`.

        Returns:
          A TensorFlow dataset outputting batched images and labels.
        """
        if input_context and input_context.num_input_pipelines > 1:
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
            dataset = dataset.interleave(
                lambda name: tf.data.TFRecordDataset(name,
                                                     buffer_size=buffer_size),
                cycle_length=16,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if input_context and self.config.num_devices > 1:
            if not self.config.use_per_replica_batch_size:
                raise ValueError(
                    'The builder does not support a global batch size with more than '
                    'one replica. Got {} replicas. Please set a '
                    '`per_replica_batch_size` and enable '
                    '`use_per_replica_batch_size=True`.'.format(
                        self.config.num_devices))

            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        if self.is_training:
            options = tf.data.Options()
            options.experimental_deterministic = self.config.deterministic_train
            options.experimental_slack = self.config.use_slack
            options.experimental_optimization.parallel_batch = True
            options.experimental_optimization.map_fusion = True
            options.experimental_optimization.map_vectorization.enabled = True
            options.experimental_optimization.map_parallelization = True
            dataset = dataset.with_options(options)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset
Example #18
File: core.py Project: jackd/kblocks
 def transform(dataset: tf.data.Dataset):
     return dataset.cache(filename)
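The snippet above is a closure over filename; a minimal sketch of the factory pattern it implies (the cache_transform name and its default are assumptions):

import tensorflow as tf

def cache_transform(filename: str = ""):
    # Hypothetical factory returning a transform like the snippet's `transform`.
    def transform(dataset: tf.data.Dataset):
        return dataset.cache(filename)
    return transform

ds = tf.data.Dataset.range(10).apply(cache_transform())   # "" caches in memory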