def test_framewise_rms_energy_vad_decisions(self):
    """Check VAD decisions on the test audio files and on an all-zero signal."""
    # Every decision for the test files is expected to be 1
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        decisions = audio.framewise_rms_energy_vad_decisions(signal, rate, 25)
        assert (decisions.numpy() == 1).all()
    # Every decision for 3*16000 zero-valued samples is expected to be 0
    zero_signal = np.zeros(3*16000)
    decisions = audio.framewise_rms_energy_vad_decisions(zero_signal, 16000, 25)
    assert (decisions.numpy() == 0).all()
def test_wav_to_pcm_data(self):
    """Check the header and data sizes returned by audio.wav_to_pcm_data."""
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        header, body = audio.wav_to_pcm_data(signal, rate)
        header_bytes = header.numpy()
        assert len(header_bytes) == 44, "unexpected wav header length"
        magic = header_bytes[:4].decode("ascii")
        assert magic == "RIFF", "wav header did not begin with 'RIFF'"
        # Two bytes are expected for every sample of the input signal
        expected_data_length = 2 * signal.shape[0]
        assert len(body.numpy()) == expected_data_length, "unexpected wav data length, expected sample width of 2"
def test_peak_normalize(self):
    """Check that peak normalized signals never exceed the requested dBFS level."""
    for wav_path in audiofiles:
        signal, _ = audio.read_wav(wav_path)
        # Add gaussian noise on top of the signal before normalizing it
        noisy_signal = signal + np.random.normal(0, 10, signal.shape)
        for level in range(0, -10, -1):
            normalized = audio.peak_normalize(noisy_signal, dBFS=level)
            assert not np.isnan(normalized.numpy()).any()
            peak = np.max(np.abs(normalized))
            assert peak <= audio.dBFS_to_linear(level), "maximum amplitude cannot exceed given dBFS level after peak normalization"
def test_resample(self):
    """Check that resampling to twice the sample rate doubles the signal length."""
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        resampled, new_rate = audio.pyfunc_resample(signal, rate, 2*rate)
        assert new_rate == 2*rate
        has_nans = np.isnan(resampled.numpy()).any()
        assert not has_nans, "NaNs after resampling"
        # The number of dimensions must stay the same, only the length changes
        assert len(resampled.shape) == len(signal.shape), "signal shape changed after resampling"
        assert resampled.shape[0] == 2*signal.shape[0], "unexpected signal length after resampling"
def test_remove_silence(self):
    """Check audio.remove_silence on the test audio files and on an all-zero signal."""
    # The test files are expected to pass through with their shape unchanged
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        trimmed = audio.remove_silence(signal, rate)
        assert not np.isnan(trimmed.numpy()).any()
        assert trimmed.shape == signal.shape
    # Nothing is expected to remain of 3*16000 zero-valued samples
    zero_signal = np.zeros(3*16000)
    trimmed = audio.remove_silence(zero_signal, 16000)
    assert not np.isnan(trimmed.numpy()).any()
    assert tf.size(trimmed) == 0
def test_linear_to_mel(self):
    """Check the shape of mel spectrograms for several mel bin counts."""
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        for num_mel_bins in range(10, 100, 15):
            # The signal is wrapped into a batch of one
            batch = np.expand_dims(signal, 0)
            powspecs = audio.spectrograms(batch, rate)
            melspecs = audio.linear_to_mel(powspecs, rate, num_mel_bins=num_mel_bins)
            melspec = melspecs[0]
            assert not np.isnan(melspec.numpy()).any()
            # First dimension must match the input spectrogram, second the mel bin count
            assert melspec.shape[0] == powspecs[0].shape[0]
            assert melspec.shape[1] == num_mel_bins
def test_power_to_db(self):
    """Check that decibel-scaled spectrograms contain no NaNs and peak at or below 0."""
    for top_db in range(10, 110, 10):
        for wav_path in audiofiles:
            signal, _ = audio.read_wav(wav_path)
            # scipy.signal.stft returns (frequencies, times, stft), only the last is needed
            stft = scipy.signal.stft(signal)[2]
            powspec = np.abs(stft)**2
            batch = np.expand_dims(powspec, 0)
            dbspecs = audio.power_to_db(batch, top_db=float(top_db))
            dbspec = dbspecs[0].numpy()
            assert not np.isnan(dbspec).any()
            assert dbspec.max() <= 0
def test_write_mono_wav(self):
    """Write each test signal into a temporary directory and read it back with librosa."""
    for inpath in audiofiles:
        signal, rate = audio.read_wav(inpath)
        with tempfile.TemporaryDirectory() as tmpdir:
            outpath = os.path.join(tmpdir, os.path.basename(inpath))
            wrotepath = audio.write_mono_wav(outpath, signal, rate)
            assert os.path.exists(outpath)
            assert wrotepath == outpath
            # File metadata must agree with the signal that was written
            expected_duration = signal.shape[0] / rate
            assert librosa.get_duration(filename=outpath, sr=None) == expected_duration
            assert librosa.get_samplerate(outpath) == rate
            # Contents read back from disk must have the same shape and sample rate
            loaded_signal, loaded_rate = librosa.load(outpath, sr=None)
            assert not np.isnan(loaded_signal).any()
            assert loaded_signal.shape == signal.shape
            assert loaded_rate == rate
def test_spectrograms(self):
    """Check spectrogram shapes over a grid of frame lengths and FFT lengths."""
    fft_lengths = (256, 512, 1024, 2048)
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        for len_ms in range(20, 101, 20):
            for n_fft in fft_lengths:
                # Skip FFT lengths that are shorter than the frame length
                if n_fft < audio.ms_to_frames(rate, len_ms):
                    continue
                step_ms = len_ms // 2
                batch = np.expand_dims(signal, 0)
                powspecs = audio.spectrograms(
                    batch,
                    rate,
                    frame_length_ms=len_ms,
                    frame_step_ms=step_ms,
                    fft_length=n_fft)
                powspec = powspecs[0]
                assert not np.isnan(powspec.numpy()).any()
                expected_num_frames = signal.shape[0] // audio.ms_to_frames(rate, step_ms) - 1
                assert powspec.shape[0] == expected_num_frames
                assert powspec.shape[1] == n_fft // 2 + 1
def add_random_noise_and_flatten(x):
    """
    Mix x["signal"] with one randomly chosen noise signal for every
    (noise_type, snr_low, snr_high) entry of snr_list, using a random SNR level
    drawn from [snr_low, snr_high).

    Returns a dataset with len(snr_list) elements, where each element is built by
    calling update_element_meta with (new_id, mixed_signal, x).
    Fails a tf.debugging assertion if a noise signal has a different sample rate
    than x["sample_rate"].
    """
    # Random noise path indexes and random snr levels.
    # tf.size returns int32 by default, so the index is drawn as int32 to match it.
    rand_noise = [
        (noise_type,
         tf.random.uniform([], 0, tf.size(type2paths[noise_type]), tf.int32),
         tf.random.uniform([], snr_low, snr_high, tf.float32))
        for noise_type, snr_low, snr_high in snr_list]
    # Load random noise signals with drawn indexes
    rand_noise = [
        (audio_features.read_wav(type2paths[noise_type][rand_index]), snr)
        for noise_type, rand_index, snr in rand_noise]
    # Assert sample rates
    # TODO maybe add inline resampling
    for (noise, sample_rate), snr in rand_noise:
        tf.debugging.assert_equal(
            sample_rate,
            x["sample_rate"],
            message="Invalid noise signals are being used, all noise signals must have same sample rate as speech signals that are being augmented")
    # Fix noise signal length to match x["signal"] by repeating the noise signal
    # if it is too short and then slicing the result to the length of x["signal"]
    rand_noise = [
        (tf.cast(tf.size(x["signal"]) / tf.size(noise), tf.int32), noise, snr)
        for (noise, _), snr in rand_noise]
    rand_noise = [
        (tf.tile(noise, [1 + noise_length_ratio])[:tf.size(x["signal"])], snr)
        for noise_length_ratio, noise, snr in rand_noise]
    mixed_signals = [
        audio_features.snr_mixer(x["signal"], noise, snr)[2]
        for noise, snr in rand_noise]
    # rand_noise contains (noise, snr) pairs at this point, the noise type names
    # must be taken from snr_list, which is in the same order
    new_ids = [
        tf.strings.join(
            (x["id"], noise_type, tf.strings.as_string(snr, precision=2)),
            separator="-")
        for (noise_type, _, _), (_, snr) in zip(snr_list, rand_noise)]
    # Pair every new id and mixed signal with a copy of the original element
    return (tf.data.Dataset
            .zip((tf.data.Dataset.from_tensor_slices(new_ids),
                  tf.data.Dataset.from_tensor_slices(mixed_signals),
                  tf.data.Dataset.from_tensors(x).repeat(len(mixed_signals))))
            .map(update_element_meta))
def _add_random_noise_and_flatten(x):
    """
    Augment one element with random noise.

    For every (noise_type, snr_low, snr_high) entry of snr_list, one noise file of
    that type is chosen at random and mixed with x["signal"] at a random SNR dB
    level. The result is a dataset with len(snr_list) new elements.
    """
    # Stage 1: draw a random file index and a random SNR level for each noise type
    draws = []
    for noise_type, snr_low, snr_high in snr_list:
        num_paths = tf.size(type2paths[noise_type])
        path_index = tf.random.uniform([], 0, num_paths, tf.int32)
        snr_db = tf.random.uniform([], snr_low, snr_high, tf.float32)
        draws.append((noise_type, path_index, snr_db))

    # Stage 2: read the contents of the chosen noise files
    loaded = []
    for noise_type, path_index, snr_db in draws:
        wav = audio_features.read_wav(type2paths[noise_type][path_index])
        loaded.append((wav, snr_db))

    # Stage 3: all noise signals must have the sample rate of the speech signal
    # TODO maybe add inline resampling of noise signals so they match the speech sr
    for (_, noise_rate), _ in loaded:
        tf.debugging.assert_equal(
            noise_rate,
            x["sample_rate"],
            message="Invalid noise signals are being used, all noise signals must have same sample rate as speech signals that are being augmented")

    # Stage 4: count how many whole copies of each noise signal fit in x["signal"]
    with_ratios = []
    for (noise, _), snr_db in loaded:
        num_whole_copies = tf.cast(tf.size(x["signal"]) / tf.size(noise), tf.int32)
        with_ratios.append((num_whole_copies, noise, snr_db))

    # Stage 5: repeat each noise signal one extra time and cut it to the speech length
    fitted = []
    for num_whole_copies, noise, snr_db in with_ratios:
        repeated = tf.tile(noise, [1 + num_whole_copies])
        fitted.append((repeated[:tf.size(x["signal"])], snr_db))

    # Stage 6: mix the speech signal with every fitted noise signal
    mixed_signals = []
    for noise, snr_db in fitted:
        mixed_signals.append(audio_features.snr_mixer(x["signal"], noise, snr_db)[2])

    # Stage 7: new utterance ids contain the noise type and the drawn SNR level
    new_ids = []
    for (noise_type, _, _), (_, snr_db) in zip(snr_list, fitted):
        snr_label = tf.strings.join(("snr", tf.strings.as_string(snr_db, precision=2)))
        new_ids.append(
            tf.strings.join(("augmented", x["id"], noise_type, snr_label), separator="-"))

    # Stage 8: combine ids, mixed signals and copies of the original element
    ids_ds = tf.data.Dataset.from_tensor_slices(new_ids)
    signals_ds = tf.data.Dataset.from_tensor_slices(mixed_signals)
    originals_ds = tf.data.Dataset.from_tensors(x).repeat(len(mixed_signals))
    combined = tf.data.Dataset.zip((ids_ds, signals_ds, originals_ds))
    return combined.map(_update_element_meta)
def test_read_wav(self):
    """Check that every test file is read as 3*16000 samples at a 16000 sample rate."""
    expected_shape = (3*16000,)
    for wav_path in audiofiles:
        signal, rate = audio.read_wav(wav_path)
        has_nans = np.isnan(signal.numpy()).any()
        assert not has_nans, "NaNs in signal"
        assert signal.shape == expected_shape, "unexpected signal shape"
        assert rate == 16000, "unexpected sample rate"
def append_signals(x):
    """Return a copy of x extended with the signal and sample rate read from x["path"]."""
    wav = audio_features.read_wav(x["path"])
    signal, sample_rate = wav
    # Build a new dict so that the input element is left unmodified
    extended = dict(x)
    extended.update(signal=signal, sample_rate=sample_rate)
    return extended