def test_decoded_vs_generated():
    """Compare DecoderPipeline outputs with synthesized and resampled references.

    The pipeline is run once. For every sample in the returned batch:

    * output 0 is compared with the generated waveform scaled by 32767,
    * output 1 with a waveform generated for the ``rate1``-rescaled length and
      frequency, scaled by 32767,
    * output 2 with the channel mean of the scaled generated waveform,
    * output 3 with the channel mean of the (unscaled) waveform generated for
      the ``rate2``-rescaled length and frequency,
    * outputs 4-7 with the expected sample rates.

    Outputs 1 and 3 are additionally compared with ``rosa_resample`` applied to
    output 0. The index into ``lengths``/``freqs``/``rates`` advances with each
    sample and wraps around at ``len(names)``.
    """
    pipe = DecoderPipeline()
    pipe.build()
    file_idx = 0
    for _ in range(1):
        outputs = pipe.run()
        for sample in range(len(outputs[0])):
            decoded = outputs[0].at(sample)
            resampled = outputs[1].at(sample)
            # Outputs 2 and 3 get a trailing axis so that they line up with
            # the keepdims-style references built below.
            downmixed = outputs[2].at(sample)[:, np.newaxis]
            resampled_downmixed = outputs[3].at(sample)[:, np.newaxis]

            src_len = lengths[file_idx]
            expected_lengths = [
                src_len,
                src_len * rate1 / rates[file_idx],
                src_len,
                src_len * rate2 / rates[file_idx],
            ]

            expected_plain = generate_waveforms(
                expected_lengths[0], freqs[file_idx]) * 32767
            expected_resampled = generate_waveforms(
                expected_lengths[1],
                freqs[file_idx] * (rates[file_idx] / rate1)) * 32767
            expected_mix = generate_waveforms(
                expected_lengths[2], freqs[file_idx]) * 32767
            expected_mix = expected_mix.mean(axis=1, keepdims=True)
            expected_res_mix = generate_waveforms(
                expected_lengths[3],
                freqs[file_idx] * (rates[file_idx] / rate2))
            expected_res_mix = expected_res_mix.mean(axis=1, keepdims=True)

            # Outputs 4..7 carry the sample rates, in the same order as the
            # four audio outputs.
            expected_rates = (rates[file_idx], rate1, rates[file_idx], rate2)
            for slot, expected_rate in enumerate(expected_rates, start=4):
                assert outputs[slot].at(sample) == expected_rate

            # Plain read: only rounding error is tolerated.
            assert np.allclose(decoded, expected_plain, rtol=0, atol=0.5)
            # Resampling: tolerate an error of 1e-3 of the dynamic range.
            assert np.allclose(resampled, expected_resampled,
                               rtol=0, atol=32767 * 1e-3)
            # Downmixing: tolerate 2 bits of error - one for quantization of
            # the channels and one for quantization of the result.
            assert np.allclose(downmixed, expected_mix, rtol=0, atol=2)
            # Resampling with an unusual ratio: tolerate 3e-3 of the dynamic
            # range.
            assert np.allclose(resampled_downmixed, expected_res_mix,
                               rtol=0, atol=3e-3)

            # Cross-check both resampled outputs against rosa_resample fed
            # with the decoded (output 0) signal.
            decoded_f32 = decoded.astype(np.float32)
            rosa_resampled = rosa_resample(decoded_f32, rates[file_idx], rate1)
            decoded_unit = decoded_f32 / 32767
            rosa_res_mix = rosa_resample(
                decoded_unit.mean(axis=1, keepdims=True),
                rates[file_idx], rate2)

            assert np.allclose(resampled, rosa_resampled,
                               rtol=0, atol=32767 * 1e-3)
            assert np.allclose(resampled_downmixed, rosa_res_mix,
                               rtol=0, atol=3e-3)

            file_idx = (file_idx + 1) % len(names)
def create_ref():
    """Return the list of int16 reference waveforms, one per entry of ``names``.

    Entry ``k`` is ``generate_waveforms(lengths[k], freqs[k])`` scaled by
    32767, rounded and cast to ``np.int16``.
    """
    def _quantized(k):
        # Scale to 32767, round, then cast to 16-bit integers.
        samples = generate_waveforms(lengths[k], freqs[k])
        return (samples * 32767).round().astype(np.int16)

    return [_quantized(k) for k in range(len(names))]
def create_files():
    """Write one WAV file per entry of ``names``.

    For position ``k`` the waveform ``generate_waveforms(lengths[k], freqs[k])``
    is scaled by 32767, rounded, cast to ``np.int16`` and written to
    ``names[k]`` with sample rate ``rates[k]``.
    """
    for pos, path in enumerate(names):
        samples = generate_waveforms(lengths[pos], freqs[pos])
        pcm = (samples * 32767).round().astype(np.int16)
        scipy.io.wavfile.write(path, rates[pos], pcm)
def test_decoded_vs_generated(batch_size=3):
    """Validate NemoAsrReaderPipeline audio and text outputs against references.

    The pipeline is run once with the given ``batch_size``. For each sample:

    * outputs 0/1 are compared with ``ref_i`` (as-is, and as float32 divided
      by 32767),
    * outputs 2/3 with the channel mean of ``ref_i`` (int16 and float forms),
    * outputs 4/5 and 6/7 with downmixed waveforms generated for ``rate1`` and
      ``rate2`` respectively (int16 and float forms),
    * output 8 with ``ref_text`` and output 9 with ``ref_text_non_ascii`` and
      its UTF-8 literal counterpart.

    Args:
        batch_size: Batch size passed to the pipeline; also the number of
            samples checked.
    """
    def _resampled_references(sample, target_rate):
        # Build the flattened int16 and float mono references for a waveform
        # generated with length/frequency rescaled to ``target_rate``.
        wave = generate_waveforms(
            lengths[sample] * target_rate / rates[sample],
            freqs[sample] * (rates[sample] / target_rate))
        mono = wave.astype(np.float32).mean(axis=1, keepdims=True)
        return (mono * 32767).astype(np.int16).flatten(), mono.flatten()

    pipe = NemoAsrReaderPipeline(batch_size=batch_size)
    pipe.build()
    for _ in range(1):
        out = pipe.run()
        for idx in range(batch_size):
            (plain_i, plain_f,
             downmix_i, downmix_f,
             resampled1_i, resampled1_f,
             resampled2_i, resampled2_f,
             text, text_non_ascii) = (out[k].at(idx) for k in range(10))

            # Plain decoding, integer and float variants.
            expected_plain_i = ref_i[idx]
            np.testing.assert_allclose(plain_i, expected_plain_i, rtol=1e-7)
            expected_plain_f = ref_i[idx].astype(np.float32) / 32767
            np.testing.assert_allclose(plain_f, expected_plain_f, rtol=1e-4)

            # Downmix: mean over axis 1 of the integer reference.
            mono_ref = ref_i[idx].astype(np.float32).mean(axis=1, keepdims=True)
            np.testing.assert_allclose(
                downmix_i, mono_ref.astype(np.int16).flatten(), atol=1)
            np.testing.assert_allclose(
                downmix_f, (mono_ref / 32767).flatten(), rtol=1e-4)

            # Resampling - an error of 1e-3 of the dynamic range is allowed
            # for both the integer and the float outputs.
            expected1_i, expected1_f = _resampled_references(idx, rate1)
            np.testing.assert_allclose(resampled1_i, expected1_i,
                                       atol=round(32767 * 1e-3))
            np.testing.assert_allclose(resampled1_f, expected1_f, atol=1e-3)

            expected2_i, expected2_f = _resampled_references(idx, rate2)
            np.testing.assert_allclose(resampled2_i, expected2_i,
                                       atol=round(32767 * 1e-3))
            np.testing.assert_allclose(resampled2_f, expected2_f, atol=1e-3)

            # Transcripts.
            np.testing.assert_equal(text, ref_text[idx])
            np.testing.assert_equal(text_non_ascii, ref_text_non_ascii[idx])

            raw_bytes = text_non_ascii.tobytes()
            decoded_text = raw_bytes.decode('utf8')

            # Compare raw bytes as well: trailing zeros would not be caught
            # by the string comparison.
            expected_bytes = bytes(ref_text_non_ascii_literal[idx], 'utf8')
            assert raw_bytes == expected_bytes, \
                f"'{raw_bytes}' != '{expected_bytes}'"

            # String comparison (utf-8)
            assert decoded_text == ref_text_non_ascii_literal[idx], \
                f"'{decoded_text}' != '{ref_text_non_ascii_literal[idx]}'"
def test_decoded_vs_generated(batch_size=3):
    """Validate NemoAsrReaderPipeline audio and text outputs against references.

    The pipeline is run once with the given ``batch_size``. For each sample:

    * outputs 0/1 are compared with ``ref_i`` (as-is, and as float32 divided
      by 32767),
    * outputs 2/3 with the channel mean of ``ref_i`` (int16 and float forms),
    * outputs 4/5 and 6/7 with downmixed waveforms generated for ``rate1`` and
      ``rate2`` respectively (int16 and float forms),
    * output 8 with ``ref_text``.

    Args:
        batch_size: Batch size passed to the pipeline; also the number of
            samples checked.
    """
    pipe = NemoAsrReaderPipeline(batch_size=batch_size)
    pipe.build()
    # Integer tolerance for resampled audio: 1e-3 of the dynamic range.
    int_tolerance = round(32767 * 1e-3)
    for _ in range(1):
        out = pipe.run()
        for idx in range(batch_size):
            (plain_i, plain_f,
             downmix_i, downmix_f,
             resampled1_i, resampled1_f,
             resampled2_i, resampled2_f,
             text) = (out[k].at(idx) for k in range(9))

            # Plain decoding, integer and float variants.
            np.testing.assert_allclose(plain_i, ref_i[idx], rtol=1e-7)
            np.testing.assert_allclose(
                plain_f, ref_i[idx].astype(np.float32) / 32767, rtol=1e-4)

            # Downmix: mean over axis 1 of the integer reference.
            mono_ref = ref_i[idx].astype(np.float32).mean(axis=1, keepdims=True)
            np.testing.assert_allclose(
                downmix_i, mono_ref.astype(np.int16).flatten(), atol=1)
            np.testing.assert_allclose(
                downmix_f, (mono_ref / 32767).flatten(), rtol=1e-4)

            # Resampling - an error of 1e-3 of the dynamic range is allowed
            # for both the integer and the float outputs.
            resample_cases = (
                (rate1, resampled1_i, resampled1_f),
                (rate2, resampled2_i, resampled2_f),
            )
            for target_rate, actual_i, actual_f in resample_cases:
                wave = generate_waveforms(
                    lengths[idx] * target_rate / rates[idx],
                    freqs[idx] * (rates[idx] / target_rate))
                mono_wave = wave.astype(np.float32).mean(axis=1, keepdims=True)
                expected_i = (mono_wave * 32767).astype(np.int16).flatten()
                np.testing.assert_allclose(actual_i, expected_i,
                                           atol=int_tolerance)
                np.testing.assert_allclose(actual_f, mono_wave.flatten(),
                                           atol=1e-3)

            np.testing.assert_equal(text, ref_text[idx])