def preprocess_example(self, example, mode, hparams):
    """Build `example["inputs"]` from `example["waveforms"]`, with masking.

    If `hparams.audio_preproc_in_bottom` is set, feature extraction is left
    to a later stage and the waveform only gains two trailing unit axes.
    Otherwise mel filterbank features are computed for the single utterance,
    normalized with their own mean and variance along axis 1, masked with
    `freq_mask` / `time_mask` when training, and followed by
    `hparams.num_zeropad_frames` all-zero frames.

    Args:
      example: dict holding at least the key "waveforms"; it is modified in
        place ("inputs" is written, "waveforms" may be removed).
      mode: estimator mode key; masking is applied only when it equals
        `tf.estimator.ModeKeys.TRAIN`.
      hparams: hyperparameters supplying the `audio_*` settings and
        `num_zeropad_frames`.

    Returns:
      Whatever the parent class's `preprocess_example` returns for the
      updated example.
    """
    p = hparams
    if p.audio_preproc_in_bottom:
        # Only append two unit axes; no features are computed here.
        with_one_axis = tf.expand_dims(example["waveforms"], -1)
        example["inputs"] = tf.expand_dims(with_one_axis, -1)
    else:
        # The feature extractor is fed a batch containing this one utterance.
        batched_waveform = tf.expand_dims(example["waveforms"], 0)
        fbank_options = dict(
            sample_rate=p.audio_sample_rate,
            dither=p.audio_dither,
            preemphasis=p.audio_preemphasis,
            frame_length=p.audio_frame_length,
            frame_step=p.audio_frame_step,
            lower_edge_hertz=p.audio_lower_edge_hertz,
            upper_edge_hertz=p.audio_upper_edge_hertz,
            num_mel_bins=p.audio_num_mel_bins,
            apply_mask=False,
        )
        features = common_audio.compute_mel_filterbank_features(
            batched_waveform, **fbank_options)
        if p.audio_add_delta_deltas:
            features = common_audio.add_delta_deltas(features)

        dims = common_layers.shape_list(features)
        assert dims[0] == 1

        # Per-utterance normalization along axis 1; this replaces CMVN
        # estimation on data.
        epsilon = 1e-09
        mean = tf.reduce_mean(features, axis=1, keepdims=True)
        squared_deviation = tf.square(features - mean)
        variance = tf.reduce_mean(squared_deviation, axis=1, keepdims=True)
        centered = features - mean
        features = centered * tf.rsqrt(variance + epsilon)

        # SpecAugment (added by kyubyong): masking is applied to training
        # examples only. NOTE: time warping (`time_warp`) is currently
        # disabled.
        if mode == tf.estimator.ModeKeys.TRAIN:
            features = freq_mask(features)
            features = time_mask(features)

        # Later models like to flatten the two spatial dims. Here the unit
        # batch axis is dropped and zero frames are appended along axis 0.
        frames = tf.reshape(features, [dims[1], dims[2], dims[3]])
        zero_frames = tf.zeros((p.num_zeropad_frames, dims[2], dims[3]))
        example["inputs"] = tf.concat([frames, zero_frames], 0)

    if not p.audio_keep_example_waveforms:
        del example["waveforms"]
    return super(SpeechRecognitionProblem, self).preprocess_example(
        example, mode, hparams)
def preprocess_example(self, example, mode, hparams):
    """Build `example["inputs"]` from `example["waveforms"]`.

    If `hparams.audio_preproc_in_bottom` is set, feature extraction is left
    to a later stage and the waveform only gains two trailing unit axes.
    Otherwise mel filterbank features are computed for the single utterance,
    normalized with their own mean and variance along axis 1, and followed
    by `hparams.num_zeropad_frames` all-zero frames.

    Args:
      example: dict holding at least the key "waveforms"; it is modified in
        place ("inputs" is written, "waveforms" may be removed).
      mode: estimator mode key; only forwarded to the parent implementation.
      hparams: hyperparameters supplying the `audio_*` settings and
        `num_zeropad_frames`.

    Returns:
      Whatever the parent class's `preprocess_example` returns for the
      updated example.
    """
    p = hparams
    if p.audio_preproc_in_bottom:
        # Only append two unit axes; no features are computed here.
        with_one_axis = tf.expand_dims(example["waveforms"], -1)
        example["inputs"] = tf.expand_dims(with_one_axis, -1)
    else:
        # The feature extractor is fed a batch containing this one utterance.
        batched_waveform = tf.expand_dims(example["waveforms"], 0)
        fbank_options = dict(
            sample_rate=p.audio_sample_rate,
            dither=p.audio_dither,
            preemphasis=p.audio_preemphasis,
            frame_length=p.audio_frame_length,
            frame_step=p.audio_frame_step,
            lower_edge_hertz=p.audio_lower_edge_hertz,
            upper_edge_hertz=p.audio_upper_edge_hertz,
            num_mel_bins=p.audio_num_mel_bins,
            apply_mask=False,
        )
        features = common_audio.compute_mel_filterbank_features(
            batched_waveform, **fbank_options)
        if p.audio_add_delta_deltas:
            features = common_audio.add_delta_deltas(features)

        dims = common_layers.shape_list(features)
        assert dims[0] == 1

        # Per-utterance normalization along axis 1; this replaces CMVN
        # estimation on data.
        epsilon = 1e-09
        mean = tf.reduce_mean(features, axis=1, keepdims=True)
        squared_deviation = tf.square(features - mean)
        variance = tf.reduce_mean(squared_deviation, axis=1, keepdims=True)
        centered = features - mean
        features = centered * tf.rsqrt(variance + epsilon)

        # Later models like to flatten the two spatial dims. Here the unit
        # batch axis is dropped and zero frames are appended along axis 0.
        frames = tf.reshape(features, [dims[1], dims[2], dims[3]])
        zero_frames = tf.zeros((p.num_zeropad_frames, dims[2], dims[3]))
        example["inputs"] = tf.concat([frames, zero_frames], 0)

    if not p.audio_keep_example_waveforms:
        del example["waveforms"]
    return super(SpeechRecognitionProblem, self).preprocess_example(
        example, mode, hparams)
def bottom(self, x):
    """Normalize speech features and shorten them with strided convolutions.

    When `audio_preproc_in_bottom` is set, `x` is squeezed on axes 2 and 3,
    mel filterbank features are computed from it here, and a masked
    mean/variance normalization is applied in place of CMVN estimated on
    data. Otherwise `x` is used as given. Two stride-2 3x3 convolutions and
    a final convolution spanning all of axis 2 then produce the output.

    Args:
      x: float32 tensor. With `audio_preproc_in_bottom` it must have unit
        axes 2 and 3; otherwise it is constrained via `set_shape` to
        [batch_size, len, num_mel_bins, num_channels].

    Returns:
      float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
    """
    inputs = x
    hp = self._model_hparams
    num_mel_bins = hp.audio_num_mel_bins
    if hp.audio_add_delta_deltas:
        num_channels = 3
    else:
        num_channels = 1

    with tf.variable_scope(self.name):
        if hp.audio_preproc_in_bottom:
            # Compute filterbanks
            with tf.variable_scope("fbanks"):
                waveforms = tf.squeeze(inputs, [2, 3])
                fbank_options = dict(
                    sample_rate=hp.audio_sample_rate,
                    dither=hp.audio_dither,
                    preemphasis=hp.audio_preemphasis,
                    frame_length=hp.audio_frame_length,
                    frame_step=hp.audio_frame_step,
                    lower_edge_hertz=hp.audio_lower_edge_hertz,
                    upper_edge_hertz=hp.audio_upper_edge_hertz,
                    num_mel_bins=hp.audio_num_mel_bins,
                    apply_mask=True,
                )
                features = common_audio.compute_mel_filterbank_features(
                    waveforms, **fbank_options)
                if hp.audio_add_delta_deltas:
                    features = common_audio.add_delta_deltas(features)
                leading_dims = common_layers.shape_list(features)[:2]
                x = tf.reshape(
                    features, leading_dims + [num_mel_bins, num_channels])

                padding = common_attention.embedding_to_padding(x)
                nonpadding_mask = 1. - padding
                # NOTE(review): this count is summed over the whole mask,
                # while the sums below reduce only axis 1 -- confirm this is
                # the intended normalizer.
                valid_count = (
                    tf.reduce_sum(nonpadding_mask) * num_mel_bins *
                    num_channels)

                # This replaces CMVN estimation on data
                epsilon = 1e-09
                mean = tf.reduce_sum(x, axis=[1], keepdims=True) / valid_count
                # Expanded form of sum((x - mean)^2) using valid_count as the
                # number of terms.
                mean_sq_term = valid_count * mean**2.
                cross_term = 2. * mean * tf.reduce_sum(
                    x, axis=[1], keepdims=True)
                sq_sum = tf.reduce_sum(x**2, axis=[1], keepdims=True)
                variance = (mean_sq_term - cross_term + sq_sum) / valid_count
                centered = x - mean
                scaled = centered * tf.rsqrt(variance + epsilon)
                # Multiply by the mask so masked-out positions become zero.
                x = scaled * tf.expand_dims(nonpadding_mask, -1)
        else:
            x = inputs

        # The convention is that the models are flattened along the spatial
        # dimensions, thus the speech preprocessor treats frequencies and
        # channels as image colors (last axis)
        x.set_shape([None, None, num_mel_bins, num_channels])

        # TODO(chorowski): how to specify bottom's hparams and avoid
        # hardcoding?
        x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
        for _ in range(2):
            x = tf.layers.conv2d(
                x, filters=128, kernel_size=(3, 3), strides=(2, 2),
                use_bias=False)
            x = common_layers.layer_norm(x)
            x = tf.nn.relu(x)

        conv_out_dims = common_layers.shape_list(x)
        # Apply a conv that will remove all frequencies and at the same time
        # project the output into the desired hidden_size.
        x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
        x = tf.layers.conv2d(
            x, filters=hp.hidden_size, kernel_size=(3, conv_out_dims[2]),
            use_bias=False)

        assert common_layers.shape_list(x)[2] == 1
        x = common_layers.layer_norm(x)
        x = tf.nn.relu(x)
    return x