import librosa
import numpy as np
import tensorflow as tf

from tensor2tensor.layers import common_attention
from tensor2tensor.layers import common_audio
from tensor2tensor.layers import common_layers


def preprocess_example(self, example, mode, hparams):
  p = hparams
  if p.audio_preproc_in_bottom:
    example["inputs"] = tf.expand_dims(
        tf.expand_dims(example["waveforms"], -1), -1)
  else:
    waveforms = tf.expand_dims(example["waveforms"], 0)
    mel_fbanks = common_audio.compute_mel_filterbank_features(
        waveforms,
        sample_rate=p.audio_sample_rate,
        dither=p.audio_dither,
        preemphasis=p.audio_preemphasis,
        frame_length=p.audio_frame_length,
        frame_step=p.audio_frame_step,
        lower_edge_hertz=p.audio_lower_edge_hertz,
        upper_edge_hertz=p.audio_upper_edge_hertz,
        num_mel_bins=p.audio_num_mel_bins,
        apply_mask=False)
    if p.audio_add_delta_deltas:
      mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
    fbank_size = common_layers.shape_list(mel_fbanks)
    assert fbank_size[0] == 1

    # This replaces CMVN estimation on data: normalize each utterance to
    # zero mean and unit variance along the time axis.
    var_epsilon = 1e-09
    mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
    variance = tf.reduce_mean(
        tf.square(mel_fbanks - mean), keepdims=True, axis=1)
    mel_fbanks = (mel_fbanks - mean) * tf.rsqrt(variance + var_epsilon)

    ######### specaugment added by kyubyong #########
    if mode == tf.estimator.ModeKeys.TRAIN:
      # mel_fbanks = time_warp(mel_fbanks)
      mel_fbanks = freq_mask(mel_fbanks)
      mel_fbanks = time_mask(mel_fbanks)
    ######### /specaugment added by kyubyong #########

    # Later models like to flatten the two spatial dims. Instead, we add a
    # unit spatial dim and flatten the frequencies and channels.
    example["inputs"] = tf.concat([
        tf.reshape(mel_fbanks,
                   [fbank_size[1], fbank_size[2], fbank_size[3]]),
        tf.zeros((p.num_zeropad_frames, fbank_size[2], fbank_size[3]))
    ], 0)

  if not p.audio_keep_example_waveforms:
    del example["waveforms"]
  return super(SpeechRecognitionProblem, self).preprocess_example(
      example, mode, hparams)
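# Note: freq_mask and time_mask above are SpecAugment masking ops (Park et
# al., 2019) assumed to be defined elsewhere in this file; they are not part
# of the original tensor2tensor code. A minimal sketch of what they might
# look like, assuming mel_fbanks has shape [1, time, freq, channels], that
# the utterance is longer than the maximum mask widths, and hypothetical
# defaults F and T (not necessarily the author's values):


def freq_mask(mel_fbanks, F=27):
  """Zeroes out a random band of f consecutive mel bins, f ~ U[0, F)."""
  n = common_layers.shape_list(mel_fbanks)[2]
  f = tf.random_uniform([], 0, F, dtype=tf.int32)
  f0 = tf.random_uniform([], 0, n - f, dtype=tf.int32)
  mask = tf.concat([
      tf.ones([1, 1, f0, 1]),
      tf.zeros([1, 1, f, 1]),
      tf.ones([1, 1, n - f0 - f, 1])
  ], axis=2)
  return mel_fbanks * mask


def time_mask(mel_fbanks, T=100):
  """Zeroes out a random span of t consecutive frames, t ~ U[0, T)."""
  n = common_layers.shape_list(mel_fbanks)[1]
  t = tf.random_uniform([], 0, T, dtype=tf.int32)
  t0 = tf.random_uniform([], 0, n - t, dtype=tf.int32)
  mask = tf.concat([
      tf.ones([1, t0, 1, 1]),
      tf.zeros([1, t, 1, 1]),
      tf.ones([1, n - t0 - t, 1, 1])
  ], axis=1)
  return mel_fbanks * mask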
def bottom(self, x):
  """Use batchnorm instead of CMVN and shorten the stft with strided convs.

  Args:
    x: float32 tensor with shape [batch_size, len, 1, freqs * channels]

  Returns:
    float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
  """
  inputs = x
  p = self._model_hparams

  num_mel_bins = p.audio_num_mel_bins
  num_channels = 3 if p.audio_add_delta_deltas else 1

  with tf.variable_scope(self.name):
    if p.audio_preproc_in_bottom:
      # Compute filterbanks
      with tf.variable_scope("fbanks"):
        waveforms = tf.squeeze(inputs, [2, 3])
        mel_fbanks = common_audio.compute_mel_filterbank_features(
            waveforms,
            sample_rate=p.audio_sample_rate,
            dither=p.audio_dither,
            preemphasis=p.audio_preemphasis,
            frame_length=p.audio_frame_length,
            frame_step=p.audio_frame_step,
            lower_edge_hertz=p.audio_lower_edge_hertz,
            upper_edge_hertz=p.audio_upper_edge_hertz,
            num_mel_bins=p.audio_num_mel_bins,
            apply_mask=True)
        if p.audio_add_delta_deltas:
          mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
        x = tf.reshape(
            mel_fbanks,
            common_layers.shape_list(mel_fbanks)[:2] +
            [num_mel_bins, num_channels])

        nonpadding_mask = 1. - common_attention.embedding_to_padding(x)
        num_of_nonpadding_elements = tf.reduce_sum(
            nonpadding_mask) * num_mel_bins * num_channels

        # This replaces CMVN estimation on data: masked mean and variance
        # computed over the nonpadding frames only.
        var_epsilon = 1e-09
        mean = tf.reduce_sum(
            x, axis=[1], keepdims=True) / num_of_nonpadding_elements
        variance = (
            num_of_nonpadding_elements * mean**2. -
            2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
            tf.reduce_sum(x**2, axis=[1], keepdims=True)
        ) / num_of_nonpadding_elements
        x = (x - mean) * tf.rsqrt(variance + var_epsilon) * tf.expand_dims(
            nonpadding_mask, -1)
    else:
      x = inputs

    # The convention is that the models are flattened along the spatial
    # dimensions; thus the speech preprocessor treats frequencies and
    # channels as image colors (last axis).
    x.set_shape([None, None, num_mel_bins, num_channels])

    # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
    x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
    for _ in range(2):
      x = tf.layers.conv2d(x, 128, (3, 3), (2, 2), use_bias=False)
      x = common_layers.layer_norm(x)
      x = tf.nn.relu(x)

    xshape = common_layers.shape_list(x)
    # Apply a conv that will remove all frequencies and at the same time
    # project the output into the desired hidden_size.
    x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
    x = tf.layers.conv2d(x, p.hidden_size, (3, xshape[2]), use_bias=False)

    assert common_layers.shape_list(x)[2] == 1
    x = common_layers.layer_norm(x)
    x = tf.nn.relu(x)
  return x
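# Shape check: the two stride-2 VALID convolutions above shorten the time
# axis roughly 4x before the final projection conv removes the frequency
# axis. A quick sketch of the length arithmetic (not part of the model):


def bottom_output_length(length):
  """Time dimension produced by bottom() for an input of `length` frames."""
  length += 8                    # tf.pad before the conv stack
  for _ in range(2):             # two (3, 3) convs, stride 2, VALID padding
    length = (length - 3) // 2 + 1
  length += 2                    # tf.pad before the projection conv
  return length - 3 + 1          # (3, freqs) conv, stride 1, VALID padding


assert bottom_output_length(100) == 26  # ~4x shorter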
def process_audio(audio_path,
                  sess,
                  prepro_batch=128,
                  sample_rate=22050,
                  frame_step=10,
                  frame_length=25,
                  feat_dim=40,
                  feat_type='fbank'):
  """GPU-accelerated audio feature extraction in tensorflow.

  Args:
    audio_path: List of paths of audio files.
    sess: Tf session to execute the graph for feature extraction.
    prepro_batch: Batch size for preprocessing audio features.
    sample_rate: Sampling rate in Hz.
    frame_step: Step size in ms.
    frame_length: Frame length in ms.
    feat_dim: Feature dimension.
    feat_type: Type of features to extract.

  Returns:
    feats: List of N features of variable length L, each element in the
      shape of (L, feat_dim), where N is the number of samples.
    featlen: List of feature lengths.
  """
  # Build the extraction graph.
  input_audio = tf.placeholder(dtype=tf.float32, shape=[None, None])
  if feat_type == 'fbank':
    mel_fbanks = common_audio.compute_mel_filterbank_features(
        input_audio,
        sample_rate=sample_rate,
        frame_step=frame_step,
        frame_length=frame_length,
        num_mel_bins=feat_dim,
        apply_mask=True)
    # Drop the trailing channel axis:
    # [batch, frames, feat_dim, 1] -> [batch, frames, feat_dim].
    mel_fbanks = tf.reduce_sum(mel_fbanks, -1)
  else:
    raise NotImplementedError(
        "feat_type '{}' is not supported.".format(feat_type))

  def extract_feat(audio_batch, len_batch, fs):
    max_len = max(len_batch)
    audio_padded = np.zeros([prepro_batch, max_len], dtype=np.float32)
    for i in range(len(audio_batch)):
      audio_padded[i][:len(audio_batch[i])] = audio_batch[i]
    feat = sess.run(mel_fbanks, feed_dict={input_audio: audio_padded})
    # Compute the feature length from the waveform length and frame step.
    feat_len = np.array(len_batch) // int(fs * frame_step / 1e3) + 1
    feat_len = feat_len.astype(np.int32)
    return feat, feat_len

  audio_batch = []
  len_batch = []
  feats = []
  featlen = []

  # Extract audio features batch by batch.
  for p in audio_path:
    audio, fs = librosa.load(p)
    audio_batch.append(audio)
    len_batch.append(len(audio))
    if len(audio_batch) == prepro_batch:
      feat, feat_len = extract_feat(audio_batch, len_batch, fs)
      # Remove the padding of each audio in the batch.
      for index, l in enumerate(feat_len):
        feats.append(feat[index][:l])
      featlen = np.concatenate([featlen, feat_len])
      audio_batch = []
      len_batch = []
      print("Processed samples: {}/{}".format(len(feats), len(audio_path)))

  # Flush the final, possibly smaller batch.
  if audio_batch:
    feat, feat_len = extract_feat(audio_batch, len_batch, fs)
    # Remove paddings.
    for index, l in enumerate(feat_len):
      feats.append(feat[index][:l])
    featlen = np.concatenate([featlen, feat_len])
    print("Processed samples: {}/{}".format(len(feats), len(audio_path)))

  return np.array(feats), featlen.astype(np.int32)
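# Example usage (a sketch, assuming a TF1-style session and a directory of
# wav files; the "data/*.wav" glob is hypothetical):
if __name__ == "__main__":
  import glob
  wav_files = sorted(glob.glob("data/*.wav"))
  with tf.Session() as sess:
    feats, featlen = process_audio(
        wav_files, sess, prepro_batch=128, sample_rate=22050, feat_dim=40)
  print(feats.shape, featlen[:5])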