Exemplo n.º 1
0
def test_decoded_vs_generated():
    """Check DecoderPipeline outputs against generated and resampled references.

    Runs the pipeline once and, for every sample of the batch, compares:

    * ``out[0]`` (plain) and ``out[1]`` (resampled to ``rate1``) against
      ``generate_waveforms`` output scaled by 32767;
    * ``out[2]`` (downmix) and ``out[3]`` (downmix resampled to ``rate2``)
      against the channel mean of the generated waveforms;
    * ``out[4]`` .. ``out[7]`` against the expected sample rates;
    * ``out[1]`` and ``out[3]`` against ``rosa_resample`` applied to ``out[0]``.

    ``idx`` picks the matching entry of the module-level ``lengths``,
    ``freqs`` and ``rates`` and wraps around at ``len(names)``.
    """
    full_scale = 32767

    pipeline = DecoderPipeline()
    pipeline.build()
    idx = 0
    for _ in range(1):
        out = pipeline.run()
        for i in range(len(out[0])):
            decoded = out[0].at(i)
            resampled = out[1].at(i)
            # The downmixed outputs get a trailing axis so they can be
            # compared with the keepdims means used as references.
            downmixed = out[2].at(i)[:, np.newaxis]
            resampled_downmixed = out[3].at(i)[:, np.newaxis]

            src_len = lengths[idx]
            src_rate = rates[idx]
            src_freq = freqs[idx]

            # Resampling changes the sample count by the rate ratio; the
            # frequency handed to the generator is scaled the opposite way.
            len_at_rate1 = src_len * rate1 / src_rate
            len_at_rate2 = src_len * rate2 / src_rate

            expected_plain = generate_waveforms(src_len, src_freq) * full_scale
            expected_resampled = generate_waveforms(
                len_at_rate1, src_freq * (src_rate / rate1)) * full_scale
            expected_downmix = generate_waveforms(src_len, src_freq) * full_scale
            expected_downmix = expected_downmix.mean(axis=1, keepdims=True)
            # NOTE: this reference is not scaled by 32767 - presumably the
            # corresponding output is normalized float; confirm in the pipeline.
            expected_resampled_downmix = generate_waveforms(
                len_at_rate2, src_freq * (src_rate / rate2))
            expected_resampled_downmix = expected_resampled_downmix.mean(
                axis=1, keepdims=True)

            # Sample rates reported by the pipeline.
            assert out[4].at(i) == src_rate
            assert out[5].at(i) == rate1
            assert out[6].at(i) == src_rate
            assert out[7].at(i) == rate2

            # just reading - allow only for rounding
            assert np.allclose(decoded, expected_plain, rtol=0, atol=0.5)
            # resampling - allow for 1e-3 dynamic range error
            assert np.allclose(resampled, expected_resampled,
                               rtol=0, atol=full_scale * 1e-3)
            # downmixing - allow for 2 bits of error:
            # one for quantization of channels, one for quantization of result
            assert np.allclose(downmixed, expected_downmix, rtol=0, atol=2)
            # resampling with weird ratio - allow for 3e-3 dynamic range error
            assert np.allclose(resampled_downmixed, expected_resampled_downmix,
                               rtol=0, atol=3e-3)

            # Cross-check against rosa_resample fed with the decoded audio.
            decoded_float = decoded.astype(np.float32)
            rosa_resampled = rosa_resample(decoded_float, src_rate, rate1)
            decoded_normalized = decoded_float / full_scale
            rosa_resampled_downmix = rosa_resample(
                decoded_normalized.mean(axis=1, keepdims=True),
                src_rate, rate2)

            assert np.allclose(resampled, rosa_resampled,
                               rtol=0, atol=full_scale * 1e-3)
            assert np.allclose(resampled_downmixed, rosa_resampled_downmix,
                               rtol=0, atol=3e-3)

            idx = (idx + 1) % len(names)
Exemplo n.º 2
0
def create_ref():
    """Return the int16 reference waveforms, one per entry of ``names``.

    Each waveform comes from ``generate_waveforms(lengths[i], freqs[i])``,
    is scaled by 32767, rounded and converted to ``np.int16``.
    """
    return [
        (generate_waveforms(lengths[k], freqs[k]) * 32767)
        .round()
        .astype(np.int16)
        for k in range(len(names))
    ]
Exemplo n.º 3
0
def create_files():
    """Write one WAV file per entry of ``names``.

    The samples come from ``generate_waveforms(lengths[i], freqs[i])``,
    scaled by 32767, rounded and converted to ``np.int16``; each file is
    written with ``scipy.io.wavfile.write`` at sample rate ``rates[i]``.
    """
    for k, path in enumerate(names):
        samples = generate_waveforms(lengths[k], freqs[k])
        pcm = (samples * 32767).round().astype(np.int16)
        scipy.io.wavfile.write(path, rates[k], pcm)
Exemplo n.º 4
0
def test_decoded_vs_generated(batch_size=3):
    """Check NemoAsrReaderPipeline audio and text outputs against references.

    Runs the pipeline once and, for each of the ``batch_size`` samples,
    compares the plain, downmixed and resampled audio outputs (integer and
    float variants) with references derived from ``ref_i`` and
    ``generate_waveforms``, then checks the text outputs, including a
    byte-exact comparison of the non-ASCII transcript.
    """

    def check_resampled(sample, target_rate, actual_i, actual_f):
        # Reference: a waveform generated at the resampled length with the
        # frequency scaled by the inverse rate ratio, averaged over channels.
        generated = generate_waveforms(
            lengths[sample] * target_rate / rates[sample],
            freqs[sample] * (rates[sample] / target_rate))
        expected_mono = generated.astype(np.float32).mean(
            axis=1, keepdims=True)
        expected_i = (expected_mono * 32767).astype(np.int16).flatten()
        # resampling - allow for 1e-3 dynamic range error
        np.testing.assert_allclose(actual_i, expected_i,
                                   atol=round(32767 * 1e-3))
        # resampling - allow for 1e-3 dynamic range error
        np.testing.assert_allclose(actual_f, expected_mono.flatten(),
                                   atol=1e-3)

    pipeline = NemoAsrReaderPipeline(batch_size=batch_size)
    pipeline.build()

    for _ in range(1):
        out = pipeline.run()
        for idx in range(batch_size):
            # Pull this sample out of each of the ten pipeline outputs.
            (audio_plain_i, audio_plain_f,
             audio_downmix_i, audio_downmix_f,
             audio_resampled1_i, audio_resampled1_f,
             audio_resampled2_i, audio_resampled2_f,
             text, text_non_ascii) = [out[k].at(idx) for k in range(10)]

            # Plain decoding: integer output and its float counterpart.
            reference = ref_i[idx]
            np.testing.assert_allclose(audio_plain_i, reference, rtol=1e-7)
            np.testing.assert_allclose(
                audio_plain_f, reference.astype(np.float32) / 32767,
                rtol=1e-4)

            # Downmix: mean over axis 1 of the reference.
            downmix_float = reference.astype(np.float32).mean(
                axis=1, keepdims=True)
            np.testing.assert_allclose(
                audio_downmix_i, downmix_float.astype(np.int16).flatten(),
                atol=1)
            np.testing.assert_allclose(
                audio_downmix_f, (downmix_float / 32767).flatten(),
                rtol=1e-4)

            # Resampled outputs for both target rates.
            check_resampled(idx, rate1, audio_resampled1_i, audio_resampled1_f)
            check_resampled(idx, rate2, audio_resampled2_i, audio_resampled2_f)

            # Transcripts.
            np.testing.assert_equal(text, ref_text[idx])
            np.testing.assert_equal(text_non_ascii, ref_text_non_ascii[idx])

            text_non_ascii_str = text_non_ascii.tobytes().decode('utf8')

            # Checking that we don't have any trailing zeros
            # (those won't be caught by the string comparison)
            ref_text_non_ascii_literal_bytes = \
                ref_text_non_ascii_literal[idx].encode('utf8')
            assert text_non_ascii.tobytes() == ref_text_non_ascii_literal_bytes, \
                f"'{text_non_ascii.tobytes()}' != '{ref_text_non_ascii_literal_bytes}'"

            # String comparison (utf-8)
            assert text_non_ascii_str == ref_text_non_ascii_literal[idx], \
                f"'{text_non_ascii_str}' != '{ref_text_non_ascii_literal[idx]}'"
Exemplo n.º 5
0
def test_decoded_vs_generated(batch_size=3):
    """Check NemoAsrReaderPipeline audio and text outputs against references.

    Runs the pipeline once and, for each of the ``batch_size`` samples,
    compares the plain, downmixed and resampled audio outputs (integer and
    float variants) with references derived from ``ref_i`` and
    ``generate_waveforms``, then compares the text output with ``ref_text``.
    """

    def check_resampled(sample, target_rate, actual_i, actual_f):
        # Reference: a waveform generated at the resampled length with the
        # frequency scaled by the inverse rate ratio, averaged over channels.
        generated = generate_waveforms(
            lengths[sample] * target_rate / rates[sample],
            freqs[sample] * (rates[sample] / target_rate))
        expected_mono = generated.astype(np.float32).mean(
            axis=1, keepdims=True)
        expected_i = (expected_mono * 32767).astype(np.int16).flatten()
        # resampling - allow for 1e-3 dynamic range error
        np.testing.assert_allclose(actual_i, expected_i,
                                   atol=round(32767 * 1e-3))
        # resampling - allow for 1e-3 dynamic range error
        np.testing.assert_allclose(actual_f, expected_mono.flatten(),
                                   atol=1e-3)

    pipeline = NemoAsrReaderPipeline(batch_size=batch_size)
    pipeline.build()

    for _ in range(1):
        out = pipeline.run()
        for idx in range(batch_size):
            # Pull this sample out of each of the nine pipeline outputs.
            (audio_plain_i, audio_plain_f,
             audio_downmix_i, audio_downmix_f,
             audio_resampled1_i, audio_resampled1_f,
             audio_resampled2_i, audio_resampled2_f,
             text) = [out[k].at(idx) for k in range(9)]

            # Plain decoding: integer output and its float counterpart.
            reference = ref_i[idx]
            np.testing.assert_allclose(audio_plain_i, reference, rtol=1e-7)
            np.testing.assert_allclose(
                audio_plain_f, reference.astype(np.float32) / 32767,
                rtol=1e-4)

            # Downmix: mean over axis 1 of the reference.
            downmix_float = reference.astype(np.float32).mean(
                axis=1, keepdims=True)
            np.testing.assert_allclose(
                audio_downmix_i, downmix_float.astype(np.int16).flatten(),
                atol=1)
            np.testing.assert_allclose(
                audio_downmix_f, (downmix_float / 32767).flatten(),
                rtol=1e-4)

            # Resampled outputs for both target rates.
            check_resampled(idx, rate1, audio_resampled1_i, audio_resampled1_f)
            check_resampled(idx, rate2, audio_resampled2_i, audio_resampled2_f)

            # Transcript.
            np.testing.assert_equal(text, ref_text[idx])