def call(self, audio_data, sample_rate=None):
    """
    Calculate fbank features of audio data.

    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [optional] the sample rate of the signal we are
                        working with. When omitted, the sample rate stored
                        in ``self.config`` is used. A graph-time assertion
                        checks that it equals the configured sample rate.
    :return: A float tensor of size (num_channels, num_frames,
             num_frequencies) containing fbank features of every frame in
             speech.
    """
    p = self.config
    with tf.name_scope('fbank'):

        # Identity test, not `== None`: `sample_rate` may be a tensor, and
        # tensor `==` is an overloaded operator rather than a None check.
        if sample_rate is None:
            sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

        # Resolve the upper band edge against the Nyquist frequency.
        # A non-positive value is treated as an offset from Nyquist; a value
        # at or below the lower limit, or above Nyquist, is replaced by
        # Nyquist. The resolved value is written back to the config, so
        # subsequent calls see the already-resolved limit.
        nyquist = p.sample_rate / 2.0
        if p.upper_frequency_limit <= 0:
            p.upper_frequency_limit = nyquist + p.upper_frequency_limit
        elif (p.upper_frequency_limit <= p.lower_frequency_limit) or (
                p.upper_frequency_limit > nyquist):
            p.upper_frequency_limit = nyquist

        # Fail at run time if the supplied sample rate differs from the one
        # the feature extractor was configured with.
        assert_op = tf.assert_equal(
            tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
        with tf.control_dependencies([assert_op]):

            spectrum = self.spect(audio_data, sample_rate)
            # Add a leading axis before handing the spectrum to the op.
            spectrum = tf.expand_dims(spectrum, 0)

            fbank = py_x_ops.fbank(
                spectrum,
                sample_rate,
                upper_frequency_limit=p.upper_frequency_limit,
                lower_frequency_limit=p.lower_frequency_limit,
                filterbank_channel_count=p.filterbank_channel_count)

            return fbank
def test_fbank(self):
    """Run the fbank op on a ramp spectrogram and compare with reference values."""
    # Reference output for 20 filterbank channels on the 0..512 ramp input.
    expected = np.array([
        1.887894, 2.2693727, 2.576507, 2.8156495, 3.036504,
        3.2296343, 3.4274294, 3.5987632, 3.771217, 3.937401,
        4.0988584, 4.2570987, 4.4110703, 4.563661, 4.7140336,
        4.8626432, 5.009346, 5.1539173, 5.2992935, 5.442024,
    ])

    with self.session():
        # Single channel, single frame, 513 frequency bins.
        ramp = np.arange(513).reshape(1, 1, 513)
        spectrogram = tf.constant(ramp, dtype=tf.float32)
        rate = tf.constant(22050, tf.int32)

        result = py_x_ops.fbank(
            spectrogram, rate, filterbank_channel_count=20)

        self.assertEqual(tf.rank(result).eval(), 3)
        self.assertEqual(result.shape, (1, 1, 20))
        self.assertAllClose(result.eval(), expected.reshape(1, 1, -1))
def fbank_feat(powspec,
               sr=8000,
               feature_size=40,
               nfft=512,
               lowfreq=0,
               highfreq=None):
    """
    Compute filterbank features from a power spectrogram.

    powspec: [audio_channels, spectrogram_length, spectrogram_feat_dim];
             a rank-2 input gets a leading axis added before the op runs.
    sr: sample rate passed to the fbank op.
    feature_size: number of filterbank channels.
    nfft: accepted for interface compatibility; not used.
    lowfreq / highfreq: lower / upper frequency limits passed to the op.
    return : [audio_channels, nframe, nfbank]
    """
    del nfft  # unused

    # Promote a rank-2 spectrogram to rank 3 by prepending an axis;
    # any other rank is passed through unchanged.
    is_rank_two = tf.equal(tf.rank(powspec), 2)
    spectrogram = tf.cond(
        is_rank_two,
        lambda: tf.expand_dims(powspec, 0),
        lambda: powspec)

    return py_x_ops.fbank(
        spectrogram,
        sr,
        filterbank_channel_count=feature_size,
        lower_frequency_limit=lowfreq,
        upper_frequency_limit=highfreq,
    )