def cross_entropy(logits,
                  labels,
                  input_length=None,
                  label_length=None,
                  smoothing=0.0,
                  reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
    '''Softmax cross entropy for classification and sequence classification.

    params:
      logits: unnormalized scores; the last axis is the class axis.
      labels: either class ids (rank one less than `logits`), which are
        one-hot encoded here, or labels that are passed through unchanged.
      input_length: accepted for interface compatibility, unused.
      label_length: for sequence tasks, the target sequence length
        (e.g. `a b c </s>` -> 4); converted to a mask used as loss weights.
        When None every position has weight 1.0.
      smoothing: label smoothing factor.
      reduction: reduction applied by `tf.losses.softmax_cross_entropy`.
    return:
      the loss tensor produced by `tf.losses.softmax_cross_entropy`.
    '''
    del input_length  # not needed by this loss

    def _ids_to_onehot():
        return tf.one_hot(labels, tf.shape(logits)[-1], dtype=tf.int32)

    def _keep_labels():
        return labels

    # Labels with exactly one axis fewer than logits are treated as class ids.
    rank_gap = tf.rank(logits) - tf.rank(labels)
    onehot_labels = tf.cond(pred=tf.equal(rank_gap, 1),
                            true_fn=_ids_to_onehot,
                            false_fn=_keep_labels)

    weights = 1.0 if label_length is None else utils.len_to_mask(label_length)

    return tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels,
                                           logits=logits,
                                           weights=weights,
                                           label_smoothing=smoothing,
                                           reduction=reduction)
def test_generate_data(self):
    '''Check dtypes and ranks of `generate_data` items with and without batching.'''
    for batch_mode in (True, False):
        task_name = self.config['data']['task']['name']
        self.config['data']['task']['batch_mode'] = batch_mode
        self.config['data']['task']['dummy'] = False
        task = registers.task[task_name](self.config, self.mode)

        with self.cached_session(use_gpu=False, force_gpu=False):
            for uttid, feats, src_lens, targets, tgt_lens in task.generate_data():
                logging.debug('uttid : {}'.format(uttid))
                logging.debug("feats : {}, shape : {}".format(feats, feats.shape))
                logging.debug("targets : {}, shape : {}".format(
                    targets, targets.shape))
                logging.debug('src_len : {}'.format(src_lens))
                logging.debug('tgt_len : {}'.format(tgt_lens))

                expected_dtypes = (
                    (feats, np.float32),
                    (src_lens, np.int64),
                    (targets, np.int64),
                    (tgt_lens, np.int64),
                )
                for value, dtype in expected_dtypes:
                    self.assertDTypeEqual(value, dtype)

                if not batch_mode:
                    # Single example: scalar id/lengths, no batch axis.
                    self.assertEqual(tf.rank(uttid).numpy(), 0)
                    self.assertEqual(len(feats.shape), 2)
                    self.assertEqual(len(targets.shape), 1)
                    self.assertEqual(tf.rank(src_lens).numpy(), 0)
                    self.assertEqual(tf.rank(tgt_lens).numpy(), 0)
                    continue

                # Batched: every item gains a leading batch axis.
                self.assertEqual(len(uttid.shape), 1)
                self.assertEqual(len(feats.shape), 3)
                self.assertEqual(len(targets.shape), 2)
                self.assertEqual(len(src_lens.shape), 1)
                self.assertEqual(len(tgt_lens.shape), 1)
def test_detla_delta(self):
    '''Check rank, static shape and values of the delta_delta op output.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        # Prepend a leading axis of size 1 to the fixture data.
        batched_input = tf.constant(self.data[None, :], dtype=tf.float32)
        deltas = py_x_ops.delta_delta(batched_input,
                                      order=self.order,
                                      window=self.window)

        self.assertEqual(tf.rank(deltas).eval(), tf.rank(batched_input).eval())

        expected_shape = (1, self.feat_dim * (self.order + 1))
        self.assertEqual(deltas.shape, expected_shape)

        expected_values = self.output_true[None, :]
        self.assertAllClose(deltas.eval(), expected_values)
def test_framepow(self):
    '''Check Framepow output rank and its first 50 values on sm1_cln.wav.'''
    wav_file = Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav')
    wav_path = str(wav_file)

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)
        samples = samples / 32768

        framepow_conf = {'window_length': 0.025, 'frame_length': 0.010}
        framepow = Framepow.params(framepow_conf).instantiate()
        framepow_test = framepow(samples, sample_rate)

        expected = np.array([
            0.000018, 0.000011, 0.000010, 0.000010, 0.000010,
            0.000010, 0.000008, 0.000009, 0.000009, 0.000009,
            0.000009, 0.000011, 0.090164, 0.133028, 0.156547,
            0.053551, 0.056670, 0.097706, 0.405659, 2.119505,
            4.296845, 6.139090, 6.623638, 6.136467, 7.595072,
            7.904415, 7.655983, 6.771016, 5.706427, 4.220942,
            3.259599, 2.218259, 1.911394, 2.234246, 3.056905,
            2.534153, 0.464354, 0.013493, 0.021231, 0.148362,
            0.364829, 0.627266, 0.494912, 0.366029, 0.315408,
            0.312441, 0.323796, 0.267505, 0.152856, 0.045305
        ])

        self.assertEqual(tf.rank(framepow_test).eval(), 1)
        leading = framepow_test.eval().flatten()[:50]
        self.assertAllClose(leading, expected)
def test_sfb(self):
    '''Round-trip a waveform through analyfiltbank and synthfiltbank.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)

        power_spc, phase_spc = py_x_ops.analyfiltbank(waveform, sample_rate)
        logging.info('Shape of power_spc: {}'.format(power_spc.eval().shape))
        logging.info('Shape of phase_spc: {}'.format(phase_spc.eval().shape))

        power_values = power_spc.eval()
        phase_values = phase_spc.eval()
        recovered = py_x_ops.synthfiltbank(power_values, phase_values,
                                           sample_rate)

        self.assertEqual(tf.rank(recovered).eval(), 1)
        logging.info('Shape of recovered signal: {}'.format(
            recovered.eval().shape))

        # The first ~400 samples differ because of overlap-and-add, so the
        # comparison is made on a window past that region.
        window = slice(500, 550)
        self.assertAllClose(recovered.eval().flatten()[window],
                            waveform[window],
                            rtol=1e-4,
                            atol=1e-4)
def test_fbank(self):
    '''Check Fbank output rank and a 2x6 slice of values on sm1_cln.wav.'''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        fbank_conf = {
            'window_length': 0.025,
            'output_type': 1,
            'frame_length': 0.010,
            'snip_edges': True,
        }
        fbank = Fbank.params(fbank_conf).instantiate()
        fbank_test = fbank(samples, sample_rate)

        self.assertEqual(tf.rank(fbank_test).eval(), 3)

        expected = np.array(
            [[3.768338, 4.946218, 6.289874, 6.330853, 6.761764, 6.884573],
             [3.803553, 5.450971, 6.547878, 5.796172, 6.397846, 7.242926]])
        # Compare index 0 of the last axis for the first two rows / six columns.
        observed = np.squeeze(fbank_test.eval()[0:2, 0:6, 0])
        self.assertAllClose(observed, expected, rtol=1e-05, atol=1e-05)
def accuracy(logits, labels):
    '''Mean accuracy of argmax predictions.

    params:
      logits: [B, ..., D]
      labels: [B, ...]
    return:
      scalar float32 tensor: fraction of positions where the argmax of
      `logits` over the last axis equals `labels`.
    '''
    with tf.name_scope('accuracy'):
        # logits must have exactly one more axis than labels ...
        rank_check = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        # ... and agree with labels on every axis except the last.
        shape_check = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))

        with tf.control_dependencies([rank_check, shape_check]):
            predicted_ids = tf.argmax(logits, axis=-1, output_type=tf.int64)
            target_ids = tf.cast(labels, tf.int64)
            hits = tf.cast(tf.equal(predicted_ids, target_ids),
                           dtype=tf.float32)
            return tf.reduce_mean(hits)
def test_spectrum(self):
    '''Check the rank of the Pitch output on sm1_cln.wav.

    Despite its name, this test exercises `Pitch`.
    '''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        pitch_conf = {'window_length': 0.025, 'soft_min_f0': 10.0}
        pitch = Pitch.params(pitch_conf).instantiate()
        pitch_test = pitch(samples, sample_rate)

        self.assertEqual(tf.rank(pitch_test).eval(), 2)

        # NOTE(review): these reference values are built but never compared
        # against `pitch_test`; the slice they correspond to is not visible
        # here -- confirm and add the missing assertAllClose.
        output_true = np.array([
            [0.03881124, 0.3000031, -0.02324523],
            [0.006756478, 0.3000097, 0.01047742],
            [0.02455365, 0.3000154, 0.00695902],
            [0.02453586, 0.3000221, 0.008448198],
            [0.03455311, 0.3000307, -0.07547269],
            [0.04293294, 0.3000422, -0.04193667],
        ])
def test_plp(self):
    '''Check Plp output rank and a 5x5 slice of values on sm1_cln.wav.'''
    wav_file = Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav')
    wav_path = str(wav_file)

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)
        samples = samples / 32768

        plp_conf = {
            'window_length': 0.025,
            'frame_length': 0.010,
            'plp_order': 12,
        }
        plp = Plp.params(plp_conf).instantiate()
        plp_test = plp(samples, sample_rate)

        expected = np.array(
            [[-0.209490, -0.326126, 0.010536, -0.027167, -0.117118],
             [-0.020293, -0.454695, -0.104243, 0.001560, -0.234854],
             [-0.015118, -0.444044, -0.156695, -0.086221, -0.319310],
             [-0.031856, -0.130708, 0.047435, -0.089916, -0.160247],
             [0.052763, -0.271487, 0.011329, 0.025320, 0.012851]])

        self.assertEqual(tf.rank(plp_test).eval(), 2)

        # The spectrum uses the povey window instead of the hamming window,
        # hence the loose 1e-02 tolerances.
        observed = plp_test.eval()[50:55, 5:10]
        self.assertAllClose(observed, expected, rtol=1e-02, atol=1e-02)
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
    '''CTC loss function.

    param: logits, (B, T, D)
    param: input_length, (B,) or (B, 1), input length of encoder
    param: labels, (B, T)
    param: label_length, (B,) or (B, 1), label length; it is only
      rank-checked here
    param: blank_index, forwarded to `ctc_data_transform`
    returns: the tensor returned by `tf.nn.ctc_loss`
      (per-example losses according to the TensorFlow documentation)
    '''
    # Lengths that are not already rank 1 are flattened with reshape. A bare
    # tf.squeeze would also drop the batch axis when B == 1, yielding a scalar
    # that fails the rank-1 assertions below.
    ilen = tf.cond(
        pred=tf.equal(tf.rank(input_length), 1),
        true_fn=lambda: input_length,
        false_fn=lambda: tf.reshape(input_length, [-1]),
    )
    ilen = tf.cast(ilen, tf.int32)

    olen = tf.cond(
        pred=tf.equal(tf.rank(label_length), 1),
        true_fn=lambda: label_length,
        false_fn=lambda: tf.reshape(label_length, [-1]))
    olen = tf.cast(olen, tf.int32)

    deps = [
        tf.assert_rank(labels, 2, name='label_rank_check'),
        tf.assert_rank(logits, 3, name='logits_rank_check'),
        tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
        tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
    ]

    labels, logits = ctc_data_transform(labels, logits, blank_index)

    with tf.control_dependencies(deps):
        # blank index is consistent with Espnet, zero
        batch_loss = tf.nn.ctc_loss(
            labels=labels,
            inputs=logits,
            sequence_length=ilen,
            time_major=False,
            preprocess_collapse_repeated=False,
            ctc_merge_repeated=True,
            ignore_longer_outputs_than_inputs=False)
    return batch_loss
def test_FbankPitch(self):
    '''Check the FbankPitch output rank and print its first two rows.'''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        fbank_pitch_conf = {
            'window_length': 0.025,
            'output_type': 1,
            'frame_length': 0.010,
        }
        fbank_pitch = FbankPitch.params(fbank_pitch_conf).instantiate()
        fbank_pitch_test = fbank_pitch(samples, sample_rate)

        self.assertEqual(tf.rank(fbank_pitch_test).eval(), 2)

        # Values are only printed for inspection, not asserted.
        head = fbank_pitch_test.eval()[0:2]
        print(head)
def test_fbank(self):
    '''Check that the Fbank output on sm1_cln.wav has rank 3.'''
    wav_file = Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav')
    wav_path = str(wav_file)

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        fbank_conf = {
            'window_length': 0.025,
            'output_type': 1,
            'frame_length': 0.010,
        }
        fbank = Fbank.params(fbank_conf).instantiate()
        fbank_test = fbank(samples, sample_rate)

        self.assertEqual(tf.rank(fbank_test).eval(), 3)
def compute_mel_filterbank_features(waveforms,
                                    sample_rate=16000,
                                    preemphasis=0.97,
                                    frame_length=0.025,
                                    frame_step=0.010,
                                    fft_length=None,
                                    lower_edge_hertz=80.0,
                                    upper_edge_hertz=7600.0,
                                    num_mel_bins=80,
                                    log_noise_floor=1e-3,
                                    apply_mask=True):
    """Extract mel-filterbank features via `powspec_feat` and `fbank_feat`.

    Args:
      waveforms: float32 tensor with shape [max_len, nchannels]
      sample_rate: sampling rate of the waveform
      preemphasis: waveform high-pass filtering constant
      frame_length: frame length, forwarded as `winlen`
        (NOTE(review): the 0.025 default suggests seconds rather than ms;
        confirm against `powspec_feat`)
      frame_step: frame step, forwarded as `winstep` (same unit caveat)
      fft_length: number of fft bins; 512 is used when this is falsy
      lower_edge_hertz: lowest frequency of the filterbank
      upper_edge_hertz: highest frequency of the filterbank
      num_mel_bins: filterbank size
      log_noise_floor: accepted for interface compatibility; ignored
      apply_mask: accepted for interface compatibility; ignored

    Returns:
      The filterbank tensor. When `fbank_feat` yields a rank-3 tensor only
      its first entry along axis 0 is returned; otherwise the tensor is
      returned unchanged.
    """
    del log_noise_floor, apply_mask

    nfft = fft_length or 512

    spectrogram = powspec_feat(waveforms,
                               sr=sample_rate,
                               nfft=nfft,
                               winlen=frame_length,
                               winstep=frame_step,
                               lowfreq=lower_edge_hertz,
                               highfreq=upper_edge_hertz,
                               preemph=preemphasis)

    # [channels, time, feat_dim]
    mel_feats = fbank_feat(spectrogram,
                           sr=sample_rate,
                           feature_size=num_mel_bins,
                           nfft=nfft,
                           lowfreq=lower_edge_hertz,
                           highfreq=upper_edge_hertz)

    def _first_channel():
        return mel_feats[0, :, :]

    def _unchanged():
        return mel_feats

    # [time, feat_dim]
    return tf.cond(tf.equal(tf.rank(mel_feats), 3),
                   true_fn=_first_channel,
                   false_fn=_unchanged)
def delta_delta(feat, order=2):
    '''Append delta features and reshape them onto a trailing axis.

    params:
      feat: a tensor of shape [nframe, nfbank] or [nframe, nfbank, 1]
      order: delta order passed to the op; the output has `order + 1`
        entries on its last axis
    return: [nframe, nfbank, order + 1]
    '''

    def _drop_last_axis():
        return feat[:, :, 0]

    def _as_is():
        return feat

    # Reduce a rank-3 input to rank 2 by taking index 0 of the last axis.
    frames = tf.cond(tf.equal(tf.rank(feat), 3),
                     true_fn=_drop_last_axis,
                     false_fn=_as_is)

    dims = tf.shape(frames)
    nframe, nfbank = dims[0], dims[1]

    # The op output is reshaped from flat per-frame features to 3-D.
    stacked = py_x_ops.delta_delta(frames, order=order)
    return tf.reshape(stacked, (nframe, nfbank, (order + 1)))
def test_FbankPitch(self):
    '''Check that FbankPitch on sm1_cln.wav yields a rank-2 output.'''
    main_root = Path(os.environ['MAIN_ROOT'])
    wav_path = str(main_root.joinpath('delta/layers/ops/data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        # The sample rate is read but not forwarded to the op in this test.
        samples, _sample_rate = reader(wav_path)

        fbank_pitch_conf = {
            'window_length': 0.025,
            'output_type': 1,
            'frame_length': 0.010,
            'thres_autoc': 0.4,
        }
        fbank_pitch = FbankPitch.params(fbank_pitch_conf).instantiate()
        fbank_pitch_test = fbank_pitch(samples)

        self.assertEqual(tf.rank(fbank_pitch_test).eval(), 2)
def test_framepow(self):
    '''Check Framepow output rank and its first five values on sm1_cln.wav.'''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        framepow_conf = {'window_length': 0.025, 'frame_length': 0.010}
        framepow = Framepow.params(framepow_conf).instantiate()
        framepow_test = framepow(samples, sample_rate)

        expected = np.array([9.819611, 9.328745, 9.247337, 9.26451, 9.266059])

        self.assertEqual(tf.rank(framepow_test).eval(), 1)
        observed = framepow_test.eval()[0:5]
        self.assertAllClose(observed, expected)
def test_cepstrum(self):
    '''Check the cepstrum op output rank and a 5x5 slice of values.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        cepstrum = py_x_ops.cepstrum(waveform, sample_rate)

        #pylint: disable=bad-whitespace
        expected = np.array(
            [[0.525808, 0.579537, 0.159656, 0.014726, -0.1866810],
             [0.225988, 1.557304, 3.381828, 0.132935, 0.7128600],
             [-1.832759, -1.045178, 0.753158, 0.116107, -0.9307780],
             [-0.696277, 1.333355, 1.590942, 2.041829, -0.0805630],
             [-0.377375, 2.984320, 0.036302, 3.676640, 1.1709290]])
        #pylint: enable=bad-whitespace

        self.assertEqual(tf.rank(cepstrum).eval(), 2)
        logging.info('Shape of cepstrum: {}'.format(cepstrum.shape))

        observed = cepstrum.eval()[15:20, 7:12]
        self.assertAllClose(observed, expected)
def test_fbank(self):
    '''Check the fbank op rank, shape and values on a synthetic ramp.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        # Synthetic spectrogram: values 0..512 with two leading unit axes.
        ramp = np.arange(513)
        spectrogram = tf.constant(ramp[None, None, :], dtype=tf.float32)
        sample_rate = tf.constant(22050, tf.int32)

        fbank_out = py_x_ops.fbank(spectrogram,
                                   sample_rate,
                                   filterbank_channel_count=20)

        expected = np.array([
            1.887894, 2.2693727, 2.576507, 2.8156495, 3.036504,
            3.2296343, 3.4274294, 3.5987632, 3.771217, 3.937401,
            4.0988584, 4.2570987, 4.4110703, 4.563661, 4.7140336,
            4.8626432, 5.009346, 5.1539173, 5.2992935, 5.442024
        ])

        self.assertEqual(tf.rank(fbank_out).eval(), 3)
        self.assertEqual(fbank_out.shape, (1, 1, 20))
        self.assertAllClose(fbank_out.eval(), expected[None, None, :])
def test_frmpow(self):
    '''Check the frame_pow op output rank and its first 50 values.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        frame_power = py_x_ops.frame_pow(waveform, sample_rate)

        expected = np.array([
            0.000018, 0.000011, 0.000010, 0.000010, 0.000010,
            0.000010, 0.000008, 0.000009, 0.000009, 0.000009,
            0.000009, 0.000011, 0.090164, 0.133028, 0.156547,
            0.053551, 0.056670, 0.097706, 0.405659, 2.119505,
            4.296845, 6.139090, 6.623638, 6.136467, 7.595072,
            7.904415, 7.655983, 6.771016, 5.706427, 4.220942,
            3.259599, 2.218259, 1.911394, 2.234246, 3.056905,
            2.534153, 0.464354, 0.013493, 0.021231, 0.148362,
            0.364829, 0.627266, 0.494912, 0.366029, 0.315408,
            0.312441, 0.323796, 0.267505, 0.152856, 0.045305
        ])

        self.assertEqual(tf.rank(frame_power).eval(), 1)
        logging.info('Shape of frame_power: {}'.format(
            frame_power.eval().shape))

        leading = frame_power.eval().flatten()[:50]
        self.assertAllClose(leading, expected)
def test_spectrum(self):
    '''Check Spectrum output rank and a 5x5 slice of values on sm1_cln.wav.'''
    wav_file = Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav')
    wav_path = str(wav_file)

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        spectrum_conf = {'window_length': 0.025}
        spectrum = Spectrum.params(spectrum_conf).instantiate()
        spectrum_test = spectrum(samples, sample_rate)

        expected = np.array(
            [[-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
             [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
             [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
             [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
             [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730]])

        self.assertEqual(tf.rank(spectrum_test).eval(), 2)
        observed = spectrum_test.eval()[4:9, 4:9]
        self.assertAllClose(observed, expected)
def test_plp(self):
    '''Check the plp op output rank and a 5x5 slice of values.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        plp_out = py_x_ops.plp(waveform, sample_rate)

        #pylint: disable=bad-whitespace
        expected = np.array(
            [[-0.209490, -0.326126, 0.010536, -0.027167, -0.117118],
             [-0.020293, -0.454695, -0.104243, 0.001560, -0.234854],
             [-0.015118, -0.444044, -0.156695, -0.086221, -0.319310],
             [-0.031856, -0.130708, 0.047435, -0.089916, -0.160247],
             [0.052763, -0.271487, 0.011329, 0.025320, 0.012851]])
        #pylint: enable=bad-whitespace

        self.assertEqual(tf.rank(plp_out).eval(), 2)
        logging.info('Shape of PLP: {}'.format(plp_out.shape))

        observed = plp_out.eval()[50:55, 5:10]
        self.assertAllClose(observed, expected, rtol=1e-05, atol=1e-05)
def test_mfcc(self):
    '''Check default-config Mfcc output rank and a 2x5 slice of values.'''
    wav_file = Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav')
    wav_path = str(wav_file)

    with self.session():
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        # Default parameters: no config overrides are supplied.
        mfcc = Mfcc.params().instantiate()
        mfcc_test = mfcc(samples, sample_rate)

        self.assertEqual(tf.rank(mfcc_test).eval(), 3)

        expected = np.array(
            [[-30.58736, -7.088838, -10.67966, -1.646479, -4.36086],
             [-30.73371, -6.128432, -7.930599, 3.208357, -1.086456]])

        # Coefficients 1..5 of the first two frames of the first entry.
        observed = np.squeeze(mfcc_test.eval()[0, 0:2, 1:6])
        self.assertAllClose(observed, expected, rtol=1e-05, atol=1e-05)
def test_fbank(self):
    '''Check the Fbank output rank and print a 2x6 slice of its values.'''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        fbank_conf = {
            'window_length': 0.025,
            'output_type': 1,
            'frame_length': 0.010,
            'snip_edges': True,
        }
        fbank = Fbank.params(fbank_conf).instantiate()
        fbank_test = fbank(samples, sample_rate)

        self.assertEqual(tf.rank(fbank_test).eval(), 3)

        # Materialize the tensor the way the current execution mode allows.
        values = (fbank_test.numpy()
                  if tf.executing_eagerly() else fbank_test.eval())
        print(values[0:2, 0:6, 0])
def test_spectrum(self):
    '''Check Spectrum (snip_edges=1) output rank and a 2x5 slice of values.'''
    wav_file = Path(PACKAGE_ROOT_DIR).joinpath('layers/ops/data/sm1_cln.wav')
    wav_path = str(wav_file)

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        spectrum_conf = {'window_length': 0.025, 'snip_edges': 1}
        spectrum = Spectrum.params(spectrum_conf).instantiate()
        spectrum_test = spectrum(samples, sample_rate)

        expected = np.array(
            [[9.819611, 2.84503, 3.660894, 2.7779, 1.212233],
             [9.328745, 2.553949, 3.276319, 3.000918, 2.499342]])

        self.assertEqual(tf.rank(spectrum_test).eval(), 2)
        observed = spectrum_test.eval()[0:2, 0:5]
        self.assertAllClose(observed, expected, rtol=1e-05, atol=1e-05)
def test_spectrum(self):
    '''Check Pitch output rank and its first five rows on sm1_cln.wav.

    Despite its name, this test exercises `Pitch`.
    '''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        pitch_conf = {'window_length': 0.025, 'soft_min_f0': 10.0}
        pitch = Pitch.params(pitch_conf).instantiate()
        pitch_test = pitch(samples, sample_rate)

        self.assertEqual(tf.rank(pitch_test).eval(), 2)

        expected = [
            [-0.1366025, 143.8855],
            [-0.0226383, 143.8855],
            [-0.08464742, 143.8855],
            [-0.08458386, 143.8855],
            [-0.1208689, 143.8855],
        ]
        observed = pitch_test.eval()[0:5, :]
        self.assertAllClose(observed, expected, rtol=1e-05, atol=1e-05)
def test_pitch(self):
    '''Check the pitch op output rank and its first 50 values.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        # Load the reference waveform.
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        pitch_out = py_x_ops.pitch(waveform, sample_rate)

        expected = np.array([
            0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
            0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
            0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
            0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
            122.823532, 117.647057, 116.788322, 116.788322, 119.402985,
            119.402985, 119.402985, 119.402985, 119.402985, 123.076920,
            124.031006, 125.000000, 132.065216, 139.130432, 139.130432,
            137.931030, 126.108368, 114.285713, 115.107910, 122.070084,
            129.032257, 130.081299, 130.081299, 129.032257, 130.081299,
            131.147537, 129.032257, 125.000000, 120.300751, 115.107910
        ])

        self.assertEqual(tf.rank(pitch_out).eval(), 1)
        logging.info('Shape of pitch: {}'.format(pitch_out.eval().shape))

        leading = pitch_out.eval().flatten()[:50]
        self.assertAllClose(leading, expected)
def test_zcr(self):
    '''Check the zcr op output rank and its first 50 values.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        zcr_out = py_x_ops.zcr(waveform, sample_rate)

        expected = np.array([
            0.406250, 0.418750, 0.425000, 0.407500, 0.393750,
            0.392500, 0.388750, 0.417500, 0.427500, 0.456250,
            0.447500, 0.386250, 0.357500, 0.282500, 0.232500,
            0.262500, 0.282500, 0.295000, 0.220000, 0.157500,
            0.125000, 0.107500, 0.100000, 0.092500, 0.092500,
            0.095000, 0.097500, 0.105000, 0.100000, 0.112500,
            0.120000, 0.132500, 0.130000, 0.135000, 0.112500,
            0.120000, 0.090000, 0.080000, 0.070000, 0.080000,
            0.087500, 0.092500, 0.097500, 0.097500, 0.112500,
            0.090000, 0.065000, 0.087500, 0.175000, 0.240000
        ])

        self.assertEqual(tf.rank(zcr_out).eval(), 1)
        logging.info('Shape of zero-cross-rate: {}'.format(
            zcr_out.eval().shape))

        leading = zcr_out.eval().flatten()[:50]
        self.assertAllClose(leading, expected)
def test_mfcc(self):
    '''Check Mfcc (use_energy=True) output rank and a 2x6 slice of values.'''
    wav_path = str(Path(PACKAGE_OPS_DIR).joinpath('data/sm1_cln.wav'))

    with self.cached_session(use_gpu=False, force_gpu=False):
        reader = ReadWav.params().instantiate()
        samples, sample_rate = reader(wav_path)

        mfcc_conf = {'use_energy': True}
        mfcc = Mfcc.params(mfcc_conf).instantiate()
        mfcc_test = mfcc(samples, sample_rate)

        self.assertEqual(tf.rank(mfcc_test).eval(), 3)

        expected = np.array([
            [9.819611, -30.58736, -7.088838, -10.67966, -1.646479, -4.36086],
            [9.328745, -30.73371, -6.128432, -7.930599, 3.208357, -1.086456],
        ])

        # First six coefficients of the first two frames of the first entry.
        observed = np.squeeze(mfcc_test.eval()[0, 0:2, 0:6])
        self.assertAllClose(observed, expected, rtol=1e-05, atol=1e-05)
def test_spectrum(self):
    '''Check the spectrum op output rank and a 5x5 slice of values.'''
    with self.cached_session(use_gpu=False, force_gpu=False):
        sample_rate, waveform = feat_lib.load_wav(self.wavpath, sr=16000)
        logging.info(
            f"input shape: {waveform.shape}, sample rate dtype: {sample_rate.dtype}"
        )
        self.assertEqual(sample_rate, 16000)

        spectrum_out = py_x_ops.spectrum(waveform, sample_rate)

        #pylint: disable=bad-whitespace
        expected = np.array(
            [[-16.863441, -16.910473, -17.077059, -16.371634, -16.845686],
             [-17.922068, -20.396345, -19.396944, -17.331493, -16.118851],
             [-17.017776, -17.551350, -20.332376, -17.403994, -16.617926],
             [-19.873854, -17.644503, -20.679525, -17.093716, -16.535091],
             [-17.074402, -17.295971, -16.896650, -15.995432, -16.560730]])
        #pylint: enable=bad-whitespace

        self.assertEqual(tf.rank(spectrum_out).eval(), 2)
        logging.info('Shape of spectrum: {}'.format(spectrum_out.shape))

        observed = spectrum_out.eval()[4:9, 4:9]
        self.assertAllClose(observed, expected)
def fbank_feat(powspec,
               sr=8000,
               feature_size=40,
               nfft=512,
               lowfreq=0,
               highfreq=None):
    '''Compute filterbank features from a power spectrogram.

    powspec: [audio_channels, spectrogram_length, spectrogram_feat_dim];
      a rank-2 input gets a leading axis of size 1 added first
    sr: sample rate passed to the fbank op
    feature_size: number of filterbank channels
    nfft: accepted for interface compatibility; ignored
    lowfreq / highfreq: lower / upper frequency limits passed to the op
    return : [audio_channels, nframe, nfbank]
    '''
    del nfft

    def _add_channel_axis():
        return tf.expand_dims(powspec, 0)

    def _keep():
        return powspec

    spec_3d = tf.cond(tf.equal(tf.rank(powspec), 2), _add_channel_axis, _keep)

    return py_x_ops.fbank(
        spec_3d,
        sr,
        filterbank_channel_count=feature_size,
        lower_frequency_limit=lowfreq,
        upper_frequency_limit=highfreq,
    )