def pipeline(
    self,
    dataset: tf.data.Dataset,
    input_context: tf.distribute.InputContext = None
) -> tf.data.Dataset:
    """Assemble the input pipeline: shard, read, shuffle, preprocess, batch.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.
      input_context: An optional context provided by `tf.distribute` for
        cross-replica training. If set with more than one replica, this
        function assumes `use_per_replica_batch_size=True`.

    Returns:
      A TensorFlow dataset outputting batched images and labels.

    Raises:
      ValueError: If `input_context` is set, more than one device is
        configured, and `use_per_replica_batch_size` is disabled.
    """
    autotune = tf.data.experimental.AUTOTUNE
    cfg = self.config
    training = self.is_training
    from_records = cfg.builder == 'records'

    # Give every input pipeline its own shard of the raw files.
    if input_context and input_context.num_input_pipelines > 1:
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    # Without caching, repeat at the file level before reading.
    if training and not cfg.cache:
        dataset = dataset.repeat()

    if from_records:
        read_buffer_bytes = 8 * 1024 * 1024  # 8 MiB read buffer per file

        def _open_records(name):
            return tf.data.TFRecordDataset(
                name, buffer_size=read_buffer_bytes)

        # Read several record files from disk concurrently.
        dataset = dataset.interleave(
            _open_records,
            cycle_length=16,
            num_parallel_calls=autotune)

    if cfg.cache:
        dataset = dataset.cache()

    if training:
        dataset = dataset.shuffle(cfg.shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse / pre-process the examples in parallel.
    preprocess_fn = self.parse_record if from_records else self.preprocess
    dataset = dataset.map(preprocess_fn, num_parallel_calls=autotune)

    if input_context and cfg.num_devices > 1:
        if not cfg.use_per_replica_batch_size:
            raise ValueError(
                'The builder does not support a global batch size with more '
                'than one replica. Got {} replicas. Please set a '
                '`per_replica_batch_size` and enable '
                '`use_per_replica_batch_size=True`.'.format(cfg.num_devices))
        # strategy.distribute_datasets_from_function multiplies the batch
        # size by the number of replicas automatically, so batch with the
        # local (per-replica) size here.
        examples_per_batch = self.local_batch_size
    else:
        examples_per_batch = self.global_batch_size
    dataset = dataset.batch(examples_per_batch, drop_remainder=training)

    if training:
        tuning = tf.data.Options()
        tuning.experimental_deterministic = cfg.deterministic_train
        tuning.experimental_slack = cfg.use_slack
        optimization = tuning.experimental_optimization
        optimization.parallel_batch = True
        optimization.map_fusion = True
        optimization.map_vectorization.enabled = True
        optimization.map_parallelization = True
        dataset = dataset.with_options(tuning)

    # Prefetching overlaps in-feed with training.
    return dataset.prefetch(autotune)
def pipeline(
    self,
    dataset: tf.data.Dataset,
    input_context: tf.distribute.InputContext = None
) -> tf.data.Dataset:
    """Assemble the input pipeline: shard, read, shuffle, preprocess, batch.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.
      input_context: An optional context provided by `tf.distribute` for
        cross-replica training. This isn't necessary if using Keras
        compile/fit.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    autotune = tf.data.experimental.AUTOTUNE
    cfg = self.config
    training = self.is_training
    from_records = cfg.builder == 'records'

    # Give every input pipeline its own shard of the raw files.
    if input_context and input_context.num_input_pipelines > 1:
        dataset = dataset.shard(input_context.num_input_pipelines,
                                input_context.input_pipeline_id)

    # Without caching, repeat at the file level before reading.
    if training and not cfg.cache:
        dataset = dataset.repeat()

    if from_records:
        read_buffer_bytes = 8 * 1024 * 1024  # 8 MiB read buffer per file

        def _open_records(name):
            return tf.data.TFRecordDataset(
                name, buffer_size=read_buffer_bytes)

        # Read several record files from disk concurrently.
        dataset = dataset.interleave(
            _open_records,
            cycle_length=16,
            num_parallel_calls=autotune)

    # Buffer one global batch worth of elements ahead of cache/shuffle.
    dataset = dataset.prefetch(self.global_batch_size)

    if cfg.cache:
        dataset = dataset.cache()

    if training:
        dataset = dataset.shuffle(cfg.shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse / pre-process the examples in parallel, then batch them.
    preprocess_fn = self.parse_record if from_records else self.preprocess
    dataset = dataset.map(preprocess_fn, num_parallel_calls=autotune)
    dataset = dataset.batch(self.batch_size, drop_remainder=training)

    # Note: we could do image normalization here, but we defer it to the
    # model which can perform it much faster on a GPU/TPU
    # TODO(dankondratyuk): if we fix prefetching, we can do it here

    if training and cfg.deterministic_train is not None:
        tuning = tf.data.Options()
        tuning.experimental_deterministic = cfg.deterministic_train
        tuning.experimental_slack = cfg.use_slack
        optimization = tuning.experimental_optimization
        optimization.parallel_batch = True
        optimization.map_fusion = True
        optimization.map_vectorization.enabled = True
        optimization.map_parallelization = True
        dataset = dataset.with_options(tuning)

    # Prefetching overlaps in-feed with training.
    # Note: autotune here is not recommended, as this can lead to memory
    # leaks. Instead, use a constant prefetch size like the number of
    # devices.
    return dataset.prefetch(cfg.num_devices)
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Stages, in order: optional sharding across GPUs, file-level shuffle
    (training only), parallel TFRecord reads, optional caching,
    example-level shuffle and repeat (training only), record parsing,
    batching, Mixup/CutMix mixing, and prefetching.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    # This can help resolve OOM issues when using only 1 GPU for training
    options = tf.data.Options()
    options.experimental_optimization.map_parallelization = (
        not self.disable_map_parallelization)
    dataset = dataset.with_options(options)

    if self._num_gpus > 1:
        # For multi-host training, we want each host to always process the
        # same subset of files. Each host only sees a subset of the entire
        # dataset, allowing us to cache larger datasets in memory.
        dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
        # Shuffle the input files. tf.data transformations return a new
        # dataset instead of mutating in place, so the result has to be
        # reassigned; otherwise the file-level shuffle is silently dropped.
        dataset = dataset.shuffle(
            buffer_size=self._file_shuffle_buffer_size)

    # Without caching, repeat at the file level before reading.
    if self.is_training and not self._cache:
        dataset = dataset.repeat()

    # Read the data from disk in parallel
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self._shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    preprocess = self.parse_record
    dataset = dataset.map(
        preprocess,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when
        # strategy.distribute_datasets_from_function is called, so we use
        # local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    # Apply Mixup/CutMix only during training, if requested in the data
    # pipeline; otherwise they will be applied in the model module on
    # device. Outside training both alphas are forced to 0.0.
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    cutmix_alpha = self.cutmix_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(mixing,
                          self.local_batch_size,
                          mixup_alpha,
                          cutmix_alpha,
                          self.defer_img_mixing),
        num_parallel_calls=64)

    # NOTE: a `self.set_shapes` map step that would assign a static batch
    # size dimension is not applied in this pipeline.

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset