def gen_dataset(self,
                batch_size=1,
                is_training=False,
                shuffle=False,
                input_pipeline_context=None,
                preprocess=None,
                drop_remainder=False):
  """Generate a sharded and batched tf.data.Dataset for training/evaluation.

  Args:
    batch_size: An integer, the returned dataset will be batched by this size.
    is_training: A boolean, when True, the returned dataset will be optionally
      shuffled and repeated as an endless dataset.
    shuffle: A boolean, when True, the returned dataset will be shuffled to
      create randomness during model training.
    input_pipeline_context: An InputContext instance, used to shard the
      dataset among multiple workers when a distribution strategy is used.
    preprocess: A function taking three arguments in order: feature, label and
      a boolean is_training.
    drop_remainder: A boolean, whether the final batch drops the remainder.

  Returns:
    A TF dataset ready to be consumed by a Keras model.
  """
  ds = self._dataset
  ds = dataloader.shard(ds, input_pipeline_context)

  if preprocess:
    preprocess = functools.partial(preprocess, is_training=is_training)
    ds = ds.map(preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  if is_training:
    if shuffle:
      # Shuffle size should be bigger than the batch_size. Otherwise it's only
      # shuffling within the batch, which is equivalent to not shuffling at
      # all.
      buffer_size = 3 * batch_size
      # But since we are doing shuffle before repeat, it doesn't make sense to
      # shuffle more than the total number of available entries.
      # TODO(wangtz): Do we want to do shuffle before / after repeat?
      # Shuffle after repeat will give a more randomized dataset and mix the
      # epoch boundary: https://www.tensorflow.org/guide/data
      ds = ds.shuffle(buffer_size=min(self._size, buffer_size))

  ds = ds.batch(batch_size, drop_remainder=drop_remainder)
  ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
  # TODO(b/171449557): Consider converting ds to distributed ds here.
  return ds

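# Illustrative sketch, not part of the original module: one possible
# `preprocess` callable matching the documented (feature, label, is_training)
# signature above. The float32 cast and the training-only branch are
# assumptions chosen purely for demonstration, and `example_preprocess` is a
# hypothetical name.
def example_preprocess(feature, label, is_training):
  # Normalize the feature dtype so downstream layers receive float32 input.
  feature = tf.cast(feature, tf.float32)
  if is_training:
    # Training-only steps (e.g. augmentation) would go here.
    pass
  return feature, label
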
def gen_dataset(self,
                batch_size=1,
                is_training=False,
                shuffle=False,
                input_pipeline_context=None,
                preprocess=None,
                drop_remainder=False):
  """Generate a sharded and batched tf.data.Dataset for training/evaluation.

  Args:
    batch_size: An integer, the returned dataset will be batched by this size.
    is_training: A boolean, when True, the returned dataset will be optionally
      shuffled and repeated as an endless dataset.
    shuffle: A boolean, when True, the returned dataset will be shuffled to
      create randomness during model training.
    input_pipeline_context: An InputContext instance, used to shard the
      dataset among multiple workers when a distribution strategy is used.
    preprocess: A function taking three arguments in order: feature, label and
      a boolean is_training.
    drop_remainder: A boolean, whether the final batch drops the remainder.

  Returns:
    A TF dataset ready to be consumed by a Keras model.
  """
  # This argument is only used for the image dataset for now. Audio
  # preprocessing is defined in the spec.
  _ = preprocess

  ds = self._dataset
  spec = self._spec
  autotune = tf.data.AUTOTUNE

  ds = dataloader.shard(ds, input_pipeline_context)

  @tf.function
  def _load_wav(filepath, label):
    file_contents = tf.io.read_file(filepath)
    # shape: (target_sample_rate, 1)
    wav, _ = tf.audio.decode_wav(file_contents, desired_channels=1)
    # shape: (target_sample_rate,)
    wav = tf.squeeze(wav, axis=-1)
    # shape: (1, target_sample_rate)
    wav = tf.expand_dims(wav, 0)
    return wav, label

  @tf.function
  def _crop(waveform, label):
    # shape: (1, expected_waveform_len)
    cropped = tf.slice(
        waveform, begin=[0, 0], size=[1, spec.expected_waveform_len])
    return cropped, label

  @tf.function
  def _elements_finite(preprocess_data, unused_label):
    # Make sure that the data sent to the model does not contain nan or inf
    # values. This should be the last filter applied to the dataset.
    # Arguably we could possibly apply this filter to all tasks.
    return tf.math.reduce_all(tf.math.is_finite(preprocess_data))

  ds = ds.map(_load_wav, num_parallel_calls=autotune)
  ds = ds.map(_crop, num_parallel_calls=autotune)
  ds = ds.map(spec.preprocess, num_parallel_calls=autotune)
  if is_training:
    ds = ds.map(spec.data_augmentation, num_parallel_calls=autotune)
  ds = ds.filter(_elements_finite)

  if is_training:
    if shuffle:
      # Shuffle size should be bigger than the batch_size. Otherwise it's only
      # shuffling within the batch, which is equivalent to not shuffling at
      # all.
      buffer_size = 3 * batch_size
      # But since we are doing shuffle before repeat, it doesn't make sense to
      # shuffle more than the total number of available entries.
      # TODO(wangtz): Do we want to do shuffle before / after repeat?
      # Shuffle after repeat will give a more randomized dataset and mix the
      # epoch boundary: https://www.tensorflow.org/guide/data
      ds = ds.shuffle(buffer_size=min(self._size, buffer_size))

  ds = ds.batch(batch_size, drop_remainder=drop_remainder)
  ds = ds.prefetch(autotune)
  # TODO(b/171449557): Consider converting ds to distributed ds here.
  return ds

def gen_dataset(self,
                batch_size=1,
                is_training=False,
                shuffle=False,
                input_pipeline_context=None,
                preprocess=None,
                drop_remainder=False):
  """Generate a sharded and batched tf.data.Dataset for training/evaluation.

  Args:
    batch_size: An integer, the returned dataset will be batched by this size.
    is_training: A boolean, when True, the returned dataset will be optionally
      shuffled and repeated as an endless dataset.
    shuffle: A boolean, when True, the returned dataset will be shuffled to
      create randomness during model training.
    input_pipeline_context: An InputContext instance, used to shard the
      dataset among multiple workers when a distribution strategy is used.
    preprocess: A function taking three arguments in order: feature, label and
      a boolean is_training.
    drop_remainder: A boolean, whether the final batch drops the remainder.

  Returns:
    A TF dataset ready to be consumed by a Keras model.
  """
  # This argument is only used for the image dataset for now. Audio
  # preprocessing is defined in the spec.
  del preprocess

  ds = self._dataset
  spec = self._spec
  autotune = tf.data.AUTOTUNE

  options = tf.data.Options()
  options.experimental_deterministic = False
  ds = ds.with_options(options)

  ds = dataloader.shard(ds, input_pipeline_context)

  @tf.function
  def _load_wav(filepath, label):
    file_contents = tf.io.read_file(filepath)
    # shape: (audio_samples, 1), dtype: float32
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # shape: (audio_samples,)
    wav = tf.squeeze(wav, axis=-1)
    return wav, sample_rate, label

  # This is an eager-mode numpy_function. It can be converted to a tf.function
  # using https://www.tensorflow.org/io/api_docs/python/tfio/audio/resample
  def _resample_numpy(waveform, sample_rate, label):
    waveform = librosa.resample(
        waveform, orig_sr=sample_rate, target_sr=spec.target_sample_rate)
    return waveform, label

  @tf.function
  def _resample(waveform, sample_rate, label):
    # Short circuit resampling if possible.
    if sample_rate == spec.target_sample_rate:
      return [waveform, label]
    return tf.numpy_function(
        _resample_numpy,
        inp=(waveform, sample_rate, label),
        Tout=[tf.float32, tf.int32])

  @tf.function
  def _elements_finite(preprocess_data, unused_label):
    # Make sure that the data sent to the model does not contain nan or inf
    # values. This should be the last filter applied to the dataset.
    # Arguably we could possibly apply this filter to all tasks.
    return tf.size(preprocess_data) > 0 and tf.math.reduce_all(
        tf.math.is_finite(preprocess_data))

  ds = ds.map(_load_wav, num_parallel_calls=autotune)
  ds = ds.map(_resample, num_parallel_calls=autotune)
  ds = spec.preprocess_ds(ds, is_training=is_training)
  ds = ds.filter(_elements_finite)

  if is_training:
    if shuffle:
      # Shuffle size should be bigger than the batch_size. Otherwise it's only
      # shuffling within the batch, which is equivalent to not shuffling at
      # all.
      buffer_size = 3 * batch_size
      # But since we are doing shuffle before repeat, it doesn't make sense to
      # shuffle more than the total number of available entries.
      # TODO(wangtz): Do we want to do shuffle before / after repeat?
      # Shuffle after repeat will give a more randomized dataset and mix the
      # epoch boundary: https://www.tensorflow.org/guide/data
      ds = ds.shuffle(buffer_size=min(self._size, buffer_size))

  ds = ds.batch(batch_size, drop_remainder=drop_remainder)
  ds = ds.prefetch(autotune)
  # TODO(b/171449557): Consider converting ds to distributed ds here.
  return ds

def gen_dataset(self,
                batch_size=1,
                is_training=False,
                shuffle=False,
                input_pipeline_context=None,
                preprocess=None,
                drop_remainder=False):
  """Generate a sharded and batched tf.data.Dataset for training/evaluation.

  Args:
    batch_size: An integer, the returned dataset will be batched by this size.
    is_training: A boolean, when True, the returned dataset will be optionally
      shuffled. Data augmentation, if it exists, will also be applied to the
      returned dataset.
    shuffle: A boolean, when True, the returned dataset will be shuffled to
      create randomness during model training. Only applies when `is_training`
      is set to True.
    input_pipeline_context: An InputContext instance, used to shard the
      dataset among multiple workers when a distribution strategy is used.
    preprocess: Not in use.
    drop_remainder: A boolean, whether the final batch drops the remainder.

  Returns:
    A TF dataset ready to be consumed by a Keras model.
  """
  # This argument is only used for the image dataset for now. Audio
  # preprocessing is defined in the spec.
  del preprocess

  ds = self._dataset
  spec = self._spec
  autotune = tf.data.AUTOTUNE

  if is_training and shuffle:
    options = tf.data.Options()
    options.experimental_deterministic = False
    ds = ds.with_options(options)

  ds = dataloader.shard(ds, input_pipeline_context)

  @tf.function
  def _load_wav(filepath, label):
    file_contents = tf.io.read_file(filepath)
    # shape: (audio_samples, 1), dtype: float32
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # shape: (audio_samples,)
    wav = tf.squeeze(wav, axis=-1)
    return wav, sample_rate, label

  # This is an eager-mode numpy_function. It can be converted to a tf.function
  # using https://www.tensorflow.org/io/api_docs/python/tfio/audio/resample
  def _resample_numpy(waveform, sample_rate, label):
    if ENABLE_RESAMPLE:
      waveform = librosa.resample(
          waveform, orig_sr=sample_rate, target_sr=spec.target_sample_rate)
    else:
      error_message = (
          'Failed to import librosa. You might be missing sndfile, which '
          'can be installed via `sudo apt-get install libsndfile1` on '
          'Ubuntu/Debian.')
      raise RuntimeError(error_message) from error_import_librosa
    return waveform, label

  @tf.function
  def _resample(waveform, sample_rate, label):
    # Short circuit resampling if possible.
    if sample_rate == spec.target_sample_rate:
      return [waveform, label]
    return tf.numpy_function(
        _resample_numpy,
        inp=(waveform, sample_rate, label),
        Tout=[tf.float32, tf.int32])

  @tf.function
  def _elements_finite(preprocess_data, unused_label):
    # Make sure that the data sent to the model does not contain nan or inf
    # values. This should be the last filter applied to the dataset.
    # Arguably we could possibly apply this filter to all tasks.
    return tf.size(preprocess_data) > 0 and tf.math.reduce_all(
        tf.math.is_finite(preprocess_data))

  ds = ds.map(_load_wav, num_parallel_calls=autotune)
  ds = ds.map(_resample, num_parallel_calls=autotune)

  def _cache_fn(dataset):
    if self._cache:
      if isinstance(self._cache, str):
        # Cache to a file.
        dataset = dataset.cache(self._cache)
      else:
        # In-RAM cache.
        dataset = dataset.cache()
    return dataset

  # `preprocess_ds` contains data augmentation, so it knows the best time to
  # do caching.
  ds = spec.preprocess_ds(ds, is_training=is_training, cache_fn=_cache_fn)
  ds = ds.filter(_elements_finite)

  # Apply one-hot encoding after caching to reduce the cache size.
  @tf.function
  def _one_hot_encoding_label(wav, label):
    return wav, tf.one_hot(label, len(self.index_to_label))

  ds = ds.map(_one_hot_encoding_label, num_parallel_calls=autotune)

  # Shuffle needs to be done after caching to create randomness across epochs.
  if is_training:
    if shuffle:
      # Shuffle size should be bigger than the batch_size. Otherwise it's only
      # shuffling within the batch, which is equivalent to not shuffling at
      # all.
      buffer_size = 3 * batch_size
      # But since we are doing shuffle before repeat, it doesn't make sense to
      # shuffle more than the total number of available entries.
      # TODO(wangtz): Do we want to do shuffle before / after repeat?
      # Shuffle after repeat will give a more randomized dataset and mix the
      # epoch boundary: https://www.tensorflow.org/guide/data
      ds = ds.shuffle(buffer_size=min(self._size, buffer_size))

  ds = ds.batch(batch_size, drop_remainder=drop_remainder)
  ds = ds.prefetch(autotune)
  # TODO(b/171449557): Consider converting ds to distributed ds here.
  return ds
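
# Illustrative usage sketch, not part of the original module. `loader` and
# `model` are hypothetical names standing for an instance of this data loader
# and a compiled tf.keras.Model, respectively.
#
#   train_ds = loader.gen_dataset(batch_size=32, is_training=True, shuffle=True)
#   eval_ds = loader.gen_dataset(batch_size=32, is_training=False)
#   model.fit(train_ds, validation_data=eval_ds, epochs=10)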