def test_mfcc_correctness(data_format, n_mfccs):
    """Check that `LogmelToMFCC` agrees with `librosa.feature.mfcc`.

    A decibel-scaled mel spectrogram computed by librosa is fed both to
    librosa's MFCC routine and to a one-layer Keras model holding
    `LogmelToMFCC`; the two sets of coefficients are then compared.

    Args:
        data_format: channel layout passed to `LogmelToMFCC`.
        n_mfccs: number of MFCC coefficients to compute.
    """
    src_mono, _, _ = get_audio(data_format='channels_last', n_ch=1)
    # The signal is passed by keyword (`y=`): recent librosa releases make the
    # arguments of `melspectrogram` keyword-only, and the keyword form also
    # works on older releases.
    melgram = librosa.power_to_db(librosa.feature.melspectrogram(y=src_mono))  # mel, time

    mfcc_ref = librosa.feature.mfcc(
        S=melgram, n_mfcc=n_mfccs, norm='ortho'
    )  # 'ortho' -> 5% mismatch but..

    # Add a batch axis in front and a channel axis where `data_format` expects it.
    expand_dim = (0, 3) if data_format in (_CH_LAST_STR, _CH_DEFAULT_STR) else (0, 1)
    melgram_batch = np.expand_dims(melgram.T, expand_dim)

    model = tf.keras.Sequential()
    model.add(
        LogmelToMFCC(n_mfccs=n_mfccs, data_format=data_format, input_shape=melgram_batch.shape[1:])
    )
    mfcc_kapre = model.predict(melgram_batch)

    # Drop the channel axis, take the only batch item and transpose so the
    # layout matches `mfcc_ref`.
    ch_axis = 1 if data_format == _CH_FIRST_STR else 3
    mfcc_kapre = np.squeeze(mfcc_kapre, axis=ch_axis)
    mfcc_kapre = mfcc_kapre[0].T

    if n_mfccs > 1:
        np.testing.assert_allclose(mfcc_ref[1:], mfcc_kapre[1:], atol=1e-4)

    # The 0-th coefficient is compared after dividing the layer output by
    # sqrt(2) -- presumably a DCT normalisation difference; TODO confirm.
    np.testing.assert_allclose(mfcc_ref[0], mfcc_kapre[0] / np.sqrt(2.0), atol=1e-4)
def test_frame_correctness(frame_length, data_format):
    """Check the `Frame` layer against `librosa.util.frame` on mono audio.

    Args:
        frame_length: number of samples per frame; the hop is half of it.
        data_format: channel layout passed to `get_audio` and `Frame`.
    """
    hop_length = frame_length // 2
    n_ch = 1
    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch, length=1000)

    model = tf.keras.Sequential()
    model.add(
        Frame(
            frame_length=frame_length,
            hop_length=hop_length,
            pad_end=False,
            data_format=data_format,
            input_shape=input_shape,
        )
    )

    # `frame_length` / `hop_length` go by keyword: recent librosa releases make
    # them keyword-only, and the keyword form also works on older releases.
    frames_ref = librosa.util.frame(
        src_mono, frame_length=frame_length, hop_length=hop_length
    ).T  # (time, frame_length)

    # Insert the single channel axis where the layer output has it.
    if data_format in (_CH_DEFAULT_STR, _CH_LAST_STR):
        frames_ref = np.expand_dims(frames_ref, axis=2)
    else:
        frames_ref = np.expand_dims(frames_ref, axis=0)

    frames_kapre = model.predict(batch_src)[0]

    np.testing.assert_equal(frames_kapre, frames_ref)
def test_energy_correctness(data_format):
    """Check the `Energy` layer against a NumPy/librosa reference.

    The reference is the per-frame sum of squared samples, scaled by
    ``ref_duration / (frame_length / sr)``.

    Args:
        data_format: channel layout passed to `get_audio` and `Energy`.
    """
    frame_length = 4
    hop_length = frame_length // 2
    n_ch = 1
    src_mono, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, length=frame_length * 2
    )

    sr = 22050
    ref_duration = 0.1

    model = tf.keras.Sequential()
    model.add(
        Energy(
            sample_rate=sr,
            ref_duration=ref_duration,
            frame_length=frame_length,
            hop_length=hop_length,
            pad_end=False,
            data_format=data_format,
            input_shape=input_shape,
        )
    )
    energies_kapre = model.predict(batch_src)[0]

    # `frame_length` / `hop_length` go by keyword: recent librosa releases make
    # them keyword-only, and the keyword form also works on older releases.
    frames_ref = librosa.util.frame(
        src_mono, frame_length=frame_length, hop_length=hop_length
    ).T  # (time, frame_length)
    nor_coeff = ref_duration / (frame_length / sr)
    energies_ref = nor_coeff * np.sum(frames_ref ** 2, axis=1)  # (time, )

    # Insert the single channel axis where the layer output has it.
    if data_format in (_CH_DEFAULT_STR, _CH_LAST_STR):
        energies_ref = np.expand_dims(energies_ref, axis=1)
    else:
        energies_ref = np.expand_dims(energies_ref, axis=0)

    np.testing.assert_allclose(energies_kapre, energies_ref, atol=1e-5)
def test_save_load():
    """Test saving/loading of models with STFT, melspectrogram and log-frequency layers.

    Each case builds a layer and passes it, the audio batch and a comparison
    function to `save_load_compare`.
    """
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    allclose = np.testing.assert_allclose

    # (layer factory, comparison function) pairs. Factories are used so each
    # layer is only created right before it is checked.
    cases = (
        # STFT
        (lambda: STFT(input_shape=input_shape, pad_begin=True), allclose_complex_numbers),
        # melspectrogram
        (lambda: get_melspectrogram_layer(input_shape=input_shape, return_decibel=True), allclose),
        # log frequency spectrogram
        (
            lambda: get_log_frequency_spectrogram_layer(
                input_shape=input_shape, return_decibel=True
            ),
            allclose,
        ),
        # stft_mag_phase
        (lambda: get_stft_mag_phase(input_shape=input_shape, return_decibel=True), allclose),
        # stft magnitude
        (lambda: get_stft_magnitude_layer(input_shape=input_shape), allclose),
    )

    for build_layer, compare in cases:
        save_load_compare(build_layer(), batch_src, compare)
def test_save_load(data_format):
    """Test saving/loading of Frame, Energy, mu-law and MFCC layers.

    Each layer is passed to `save_load_compare` together with an input batch
    and `np.testing.assert_allclose` as the comparison function.

    Args:
        data_format: only used to decide where the channel axis is inserted
            into the mel-spectrogram input of the `LogmelToMFCC` case.
    """
    src_mono, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)

    # test Frame save/load
    save_load_compare(
        Frame(frame_length=128, hop_length=64, input_shape=input_shape),
        batch_src,
        np.testing.assert_allclose,
    )
    # test Energy save/load
    save_load_compare(
        Energy(frame_length=128, hop_length=64, input_shape=input_shape),
        batch_src,
        np.testing.assert_allclose,
    )
    # test mu law layers
    save_load_compare(
        MuLawEncoding(quantization_channels=128),
        batch_src,
        np.testing.assert_allclose,
    )
    save_load_compare(
        MuLawDecoding(quantization_channels=128),
        np.arange(0, 256, 1).reshape((1, 256, 1)),
        np.testing.assert_allclose,
    )
    # test mfcc layer
    expand_dim = (0, 3) if data_format in (_CH_LAST_STR, _CH_DEFAULT_STR) else (0, 1)
    # The signal is passed by keyword (`y=`): recent librosa releases make the
    # arguments of `melspectrogram` keyword-only, and the keyword form also
    # works on older releases.
    logmel = librosa.power_to_db(librosa.feature.melspectrogram(y=src_mono).T)
    save_load_compare(
        LogmelToMFCC(n_mfccs=10),
        np.expand_dims(logmel, expand_dim),
        np.testing.assert_allclose,
    )
def test_log_spectrogram_fail():
    """Build a log-frequency spectrogram layer with ``log_n_bins=200``.

    NOTE(review): judging by the test name this construction is presumably
    expected to fail, but no expected-failure marker is visible in this
    block -- confirm against the decorators in the full file.
    """
    _, _, audio_shape = get_audio(data_format='channels_last', n_ch=1)
    get_log_frequency_spectrogram_layer(audio_shape, return_decibel=True, log_n_bins=200)
def test_mag_phase(data_format):
    """Compare `get_stft_mag_phase` output with librosa's STFT magnitude.

    The output shape is checked against a stacked librosa
    ``magphase(stft(...))`` reference, and the values at index 0 of the
    channel axis (the magnitude) are compared. The phase values are not
    compared here.

    Args:
        data_format: channel layout used for both input and output.
    """
    n_ch = 1
    n_fft, hop_length, win_length = 512, 256, 512

    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)

    layer = get_stft_mag_phase(
        input_shape=input_shape,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        input_data_format=data_format,
        output_data_format=data_format,
    )
    model = tensorflow.keras.models.Sequential()
    model.add(layer)
    mag_phase_kapre = model(batch_src)[0]  # a 2d image shape

    ch_axis = 0 if data_format == 'channels_first' else 2  # non-batch

    # Reference: librosa STFT transposed to (time, freq), split into
    # magnitude and phase and stacked along the channel axis.
    stft_ref = librosa.stft(
        src_mono,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
    ).T
    mag_phase_ref = np.stack(librosa.magphase(stft_ref), axis=ch_axis)

    np.testing.assert_equal(mag_phase_kapre.shape, mag_phase_ref.shape)

    # magnitude test
    mag_kapre = np.take(mag_phase_kapre, [0], axis=ch_axis)
    mag_ref = np.take(mag_phase_ref, [0], axis=ch_axis)
    np.testing.assert_allclose(mag_kapre, mag_ref, atol=2e-4)
def placeholder(*args, **kwargs):
    """Prompt for search words, fetch matching audio and add a results submenu.

    Words are read from standard input until an empty line, Ctrl-C or
    end-of-input. Every ``(text, id)`` pair yielded by ``get_audio(words)``
    becomes a submenu offering "Play" and "Save" actions, and the collected
    menu is appended to the module-level ``root`` menu.

    Args:
        *args: ignored.
        **kwargs: ignored.
    """
    words = []
    while True:
        # Only the prompt itself sits inside the try; Ctrl-C and a closed
        # stdin (EOF) both simply end the word entry.
        try:
            word = input('type text to search:\n')
        except (KeyboardInterrupt, EOFError):
            break
        if not word:
            break
        words.append(word)

    print('fetching...')
    voices = SelectionMenu([])
    count = 0  # stays 0 when nothing is found
    for count, (text, audio_id) in enumerate(get_audio(words), start=1):
        sel = SelectionMenu([], 'Choose phrase')
        si = SubmenuItem(text[:60], sel, voices)
        sel.append_item(FunctionItem(f"Play \"{text[:80]}\"", play_wrapper(audio_id)))
        sel.append_item(FunctionItem(f"Save \"{text[:80]}\"", save_wrapper(audio_id)))
        voices.append_item(si)
    # The title does not depend on the loop, so it is assigned once.
    voices.title = 'Tip for long outputs: to exit press `1` and arrow up'

    string = '+'.join(words)
    submenu_item = SubmenuItem(f'Found {string} voices: {count}', voices, root)
    root.append_item(submenu_item)
def test_perfectly_reconstructing_stft_istft(waveform_data_format, stft_data_format, hop_ratio):
    """Round-trip test for `get_perfectly_reconstructing_stft_istft`.

    Two chains are checked: waveform -> STFT -> ISTFT against the original
    waveform, and (librosa) STFT -> ISTFT -> STFT against the original STFT.

    Args:
        waveform_data_format: channel layout of the waveform side.
        stft_data_format: channel layout of the STFT side.
        hop_ratio: hop length expressed as a fraction of ``n_fft``.
    """
    n_ch = 1
    src_mono, batch_src, input_shape = get_audio(data_format=waveform_data_format, n_ch=n_ch)
    time_axis = 1 if waveform_data_format == 'channels_first' else 0  # non-batch!
    len_src = input_shape[time_axis]

    n_fft = 2048
    hop_length = int(n_fft * hop_ratio)
    n_added_frames = int(1 / hop_ratio) - 1

    stft, istft = get_perfectly_reconstructing_stft_istft(
        stft_input_shape=input_shape,
        n_fft=n_fft,
        hop_length=hop_length,
        waveform_data_format=waveform_data_format,
        stft_data_format=stft_data_format,
    )

    # Test - [STFT -> ISTFT]
    model = tf.keras.models.Sequential([stft, istft])
    recon_waveform = model(batch_src)

    # trim off the pad_begin part
    len_pad_begin = n_fft - hop_length
    if waveform_data_format == 'channels_first':
        recon_waveform = recon_waveform[:, :, len_pad_begin:len_pad_begin + len_src]
    else:
        recon_waveform = recon_waveform[:, len_pad_begin:len_pad_begin + len_src, :]

    np.testing.assert_allclose(batch_src, recon_waveform, atol=1e-5)

    # Test - [ISTFT -> STFT]
    S = librosa.stft(src_mono, n_fft=n_fft, hop_length=hop_length).T.astype(
        np.complex64
    )  # (time, freq)

    ch_axis = 1 if stft_data_format == 'channels_first' else 3  # batch shape
    S = np.expand_dims(S, (0, ch_axis))
    model = tf.keras.models.Sequential([istft, stft])
    recon_S = model(S)

    # trim off the frames coming from zero-pad result: `n` frames per side of
    # the reference, twice as many per side of the reconstruction.
    # A slice ``k:-k`` is EMPTY for k == 0, so when no frames were added
    # (hop_ratio == 1) both arrays are left untouched instead.
    n = n_added_frames
    ref_trim = slice(n, -n) if n else slice(None)
    recon_trim = slice(2 * n, -2 * n) if n else slice(None)
    if stft_data_format == 'channels_first':
        S = S[:, :, ref_trim, :]
        recon_S = recon_S[:, :, recon_trim, :]
    else:
        S = S[:, ref_trim, :, :]
        recon_S = recon_S[:, recon_trim, :, :]

    np.testing.assert_equal(S.shape, recon_S.shape)
    allclose_complex_numbers(S, recon_S)
def test_spectrogram_correctness_more(data_format, window_name):
    """Check STFT, Magnitude and Phase layers on two-channel audio.

    The complex STFT and its magnitude are compared with a librosa
    reference tiled over the channels; the Phase layer output is compared
    with ``np.angle`` of the STFT layer's own complex output.

    Args:
        data_format: channel layout used for both input and output.
        window_name: window name passed to `STFT`; a falsy value makes the
            librosa reference use ``'hann'``.
    """
    n_fft = 512
    hop_length = 256
    win_length = n_fft  # test with x2
    n_ch = 2
    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)

    def _build(extra_layer=None):
        """Return a Sequential model: the STFT layer, optionally followed by `extra_layer`."""
        net = tensorflow.keras.models.Sequential()
        net.add(
            STFT(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                window_name=window_name,
                pad_end=False,
                input_data_format=data_format,
                output_data_format=data_format,
                input_shape=input_shape,
                name='stft',
            )
        )
        if extra_layer is not None:
            net.add(extra_layer)
        return net

    # compute with librosa
    librosa_window = window_name.replace('_window', '') if window_name else 'hann'
    S_ref = librosa.core.stft(
        src_mono,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        window=librosa_window,
    ).T  # (time, freq)

    S_ref = np.expand_dims(S_ref, axis=2)  # time, freq, ch=1
    S_ref = np.tile(S_ref, [1, 1, n_ch])  # time, freq, ch=n_ch
    if data_format == 'channels_first':
        S_ref = np.transpose(S_ref, (2, 0, 1))  # ch, time, freq

    # complex STFT
    S_complex = _build().predict(batch_src)[0]  # 3d representation
    allclose_complex_numbers(S_ref, S_complex)

    # test Magnitude()
    magnitude = _build(Magnitude()).predict(batch_src)[0]  # 3d representation
    np.testing.assert_allclose(np.abs(S_ref), magnitude, atol=2e-4)

    # test Phase()
    phase = _build(Phase()).predict(batch_src)[0]  # 3d representation
    allclose_phase(np.angle(S_complex), phase)
def test_save_load(save_format):
    """Test saving/loading of models with STFT, melspectrogram and log-frequency layers.

    The single-layer cases (STFT, ConcatenateFrequencyMap) are always run;
    the layers returned by the ``get_*`` helpers are only checked when
    ``save_format == 'tf'``.

    Args:
        save_format: format identifier forwarded to `save_load_compare`.
    """
    _, batch_src, input_shape = get_audio(data_format='channels_last', n_ch=1)
    allclose = np.testing.assert_allclose

    # test STFT save/load
    stft_layer = STFT(input_shape=input_shape, pad_begin=True)
    save_load_compare(stft_layer, batch_src, allclose_complex_numbers, save_format, STFT)

    # test ConcatenateFrequencyMap
    specs_batch = np.random.randn(2, 3, 5, 4).astype(np.float32)
    concat_layer = ConcatenateFrequencyMap(input_shape=specs_batch.shape[1:])
    save_load_compare(concat_layer, specs_batch, allclose, save_format, ConcatenateFrequencyMap)

    if save_format != 'tf':
        return

    # melspectrogram, log frequency spectrogram and stft_mag_phase all take
    # the same arguments, so they share one loop.
    decibel_builders = (
        get_melspectrogram_layer,
        get_log_frequency_spectrogram_layer,
        get_stft_mag_phase,
    )
    for build in decibel_builders:
        save_load_compare(
            build(input_shape=input_shape, return_decibel=True),
            batch_src,
            allclose,
            save_format,
        )

    # test stft mag
    save_load_compare(
        get_stft_magnitude_layer(input_shape=input_shape),
        batch_src,
        allclose,
        save_format,
    )
def test_save_load_channel_swap(data_format, save_format):
    """Test saving/loading of a `ChannelSwap` layer.

    Args:
        data_format: not used in the body; the audio is always requested as
            ``'channels_last'``.
        save_format: format identifier forwarded to `save_load_compare`.
    """
    _, waveforms, waveform_shape = get_audio(data_format='channels_last', n_ch=1)

    swap_layer = ChannelSwap(input_shape=waveform_shape)
    save_load_compare(
        swap_layer,
        waveforms,
        np.testing.assert_allclose,
        save_format=save_format,
        layer_class=ChannelSwap,
        training=None,
    )
def test_channel_swap_correctness(n_ch, data_format, data_type):
    """Check that `ChannelSwap` gives identical output on repeated `predict` calls.

    Args:
        n_ch: number of audio channels requested from `get_audio`.
        data_format: channel layout requested from `get_audio`.
        data_type: not used in the visible part of this test.
    """
    len_src = 256
    _, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch, length=len_src)

    net = tf.keras.Sequential()
    net.add(ChannelSwap(input_shape=input_shape))

    # consistent during inference
    first_output = net.predict(batch_src)
    for _ in range(100):
        repeated_output = net.predict(batch_src)
        np.testing.assert_equal(first_output, repeated_output)

    # NOTE(review): `ch_axis` is not used after this point in the visible
    # code -- the test may continue in a part that is not shown; confirm.
    ch_axis = 1 if data_format == _CH_FIRST_STR else 2  # to be changed for 2d data type
def test_log_spectrogram_runnable(data_format):
    """Build the log-frequency spectrogram layer with and without decibel output.

    Args:
        data_format: channel layout requested from `get_audio`.
    """
    _, _, audio_shape = get_audio(data_format=data_format, n_ch=1)
    for as_decibel in (True, False):
        get_log_frequency_spectrogram_layer(audio_shape, return_decibel=as_decibel)
def test_melspectrogram_correctness(n_fft, sr, hop_length, n_ch, data_format, amin,
                                    dynamic_range, n_mels, mel_f_min, mel_f_max):
    """Test the correctness of melspectrogram.

    Both the linear mel spectrogram and its decibel-scaled version are
    compared with librosa references tiled over the channels.

    Note that mel filterbank is tested separated

    Args:
        n_fft: FFT size; also used as the window length.
        sr: sample rate.
        hop_length: hop length in samples.
        n_ch: number of audio channels requested from `get_audio`.
        data_format: channel layout used for both input and output.
        amin: ``db_amin`` for the layer and ``amin`` for `librosa.power_to_db`.
        dynamic_range: ``db_dynamic_range`` for the layer and ``top_db`` for
            `librosa.power_to_db`.
        n_mels: number of mel bins.
        mel_f_min: lowest mel filterbank frequency.
        mel_f_max: highest mel filterbank frequency.
    """

    def _get_melgram_model(return_decibel, amin, dynamic_range, input_shape=None):
        """Build the kapre melspectrogram layer with the enclosing test's settings."""
        # compute with kapre
        melgram_model = get_melspectrogram_layer(
            n_fft=n_fft,
            sample_rate=sr,
            n_mels=n_mels,
            mel_f_min=mel_f_min,
            mel_f_max=mel_f_max,
            win_length=win_length,
            hop_length=hop_length,
            input_data_format=data_format,
            output_data_format=data_format,
            return_decibel=return_decibel,
            input_shape=input_shape,
            db_amin=amin,
            db_dynamic_range=dynamic_range,
        )
        return melgram_model

    src_mono, batch_src, input_shape = get_audio(data_format=data_format, n_ch=n_ch)

    win_length = n_fft  # test with x2
    # compute with librosa
    # The signal is passed by keyword (`y=`): recent librosa releases make the
    # arguments of `melspectrogram` keyword-only, and the keyword form also
    # works on older releases.
    S_ref = librosa.feature.melspectrogram(
        y=src_mono,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        center=False,
        power=1.0,
        n_mels=n_mels,
        fmin=mel_f_min,
        fmax=mel_f_max,
    ).T

    S_ref = np.expand_dims(S_ref, axis=2)  # time, freq, ch=1
    S_ref = np.tile(S_ref, [1, 1, n_ch])  # time, freq, ch=n_ch

    if data_format == 'channels_first':
        S_ref = np.transpose(S_ref, (2, 0, 1))  # ch, time, freq

    # melgram
    melgram_model = _get_melgram_model(
        return_decibel=False, input_shape=input_shape, amin=None, dynamic_range=120.0
    )
    S = melgram_model.predict(batch_src)[0]  # 3d representation
    np.testing.assert_allclose(S_ref, S, atol=1e-4)

    # log melgram
    melgram_model = _get_melgram_model(
        return_decibel=True, input_shape=input_shape, amin=amin, dynamic_range=dynamic_range
    )
    S = melgram_model.predict(batch_src)[0]  # 3d representation
    S_ref_db = librosa.power_to_db(S_ref, ref=1.0, amin=amin, top_db=dynamic_range)

    np.testing.assert_allclose(
        S_ref_db, S, rtol=3e-3
    )  # decibel is evaluated with relative tolerance
def test_spectrogram_tflite_correctness(
    n_fft, hop_length, n_ch, data_format, batch_size, win_length, pad_end
):
    """Compare the tflite-compatible STFT/Magnitude/Phase layers with the regular ones.

    Tflite models are run through `predict_using_tflite`; regular models
    through ``model.predict``.

    Args:
        n_fft: FFT size.
        hop_length: hop length in samples.
        n_ch: number of audio channels requested from `get_audio`.
        data_format: channel layout used for both input and output.
        batch_size: batch size requested from `get_audio`.
        win_length: window length in samples.
        pad_end: forwarded to the STFT layers.
    """

    def _build(following_layer=None, tflite_compatible=False):
        """Return a Sequential model: (tflite or regular) STFT plus an optional extra layer."""
        stft_cls = STFTTflite if tflite_compatible else STFT
        net = tensorflow.keras.models.Sequential()
        net.add(
            stft_cls(
                n_fft=n_fft,
                win_length=win_length,
                hop_length=hop_length,
                window_name=None,
                pad_end=pad_end,
                input_data_format=data_format,
                output_data_format=data_format,
                input_shape=input_shape,
                name='stft',
            )
        )
        if following_layer is not None:
            net.add(following_layer)
        return net

    src_mono, batch_src, input_shape = get_audio(
        data_format=data_format, n_ch=n_ch, batch_size=batch_size
    )
    # tflite requires a known batch size
    batch_size = batch_src.shape[0]

    # test STFT()
    stft_model_tflite = _build(tflite_compatible=True)
    stft_model = _build(tflite_compatible=False)

    raw_tflite = predict_using_tflite(stft_model_tflite, batch_src)  # predict using tflite
    # (batch, time, freq, chan, re/imag) - convert to complex number:
    S_complex_tflite = tf.complex(raw_tflite[..., 0], raw_tflite[..., 1])  # (batch,time,freq,chan)
    S_complex = stft_model.predict(batch_src)  # predict using tf model
    allclose_complex_numbers(S_complex, S_complex_tflite)

    # test Magnitude()
    mag_model_tflite = _build(MagnitudeTflite(), tflite_compatible=True)
    mag_model = _build(Magnitude(), tflite_compatible=False)
    S_lite = predict_using_tflite(mag_model_tflite, batch_src)  # predict using tflite
    S = mag_model.predict(batch_src)  # predict using tf model
    np.testing.assert_allclose(S, S_lite, atol=1e-4)

    # test approx Phase() same for tflite and non-tflite
    approx_phase_model_lite = _build(PhaseTflite(approx_atan_accuracy=500), tflite_compatible=True)
    approx_phase_model = _build(Phase(approx_atan_accuracy=500), tflite_compatible=False)
    S_approx_phase_lite = predict_using_tflite(
        approx_phase_model_lite, batch_src
    )  # predict using tflite
    S_approx_phase = approx_phase_model.predict(
        batch_src, batch_size=batch_size
    )  # predict using tf model
    assert_approx_phase(S_approx_phase_lite, S_approx_phase, atol=1e-2, acceptable_fail_ratio=0.01)

    # test accuracy of approx Phase()
    phase_model = _build(Phase(), tflite_compatible=False)
    S_phase = phase_model.predict(batch_src, batch_size=batch_size)  # predict using tf model
    assert_approx_phase(S_approx_phase_lite, S_phase, atol=1e-2, acceptable_fail_ratio=0.01)