예제 #1
0
    def test_save_flac(self, sample_rate, num_channels, compression_level):
        """``py_save_func`` and its ``torch_script``-wrapped version save equivalent flac.

        The same float32 waveform is written once through ``py_save_func`` and
        once through ``torch_script(py_save_func)``. Both flac files are
        converted to 32-bit wav with sox, loaded back, and checked against the
        requested sample rate and the original waveform.
        """
        scripted_save = torch_script(py_save_func)
        expected = get_wav_data('float32', num_channels)

        eager_flac = self.get_temp_path(
            f'test_save_py_{sample_rate}_{num_channels}_{compression_level}.flac'
        )
        scripted_flac = self.get_temp_path(
            f'test_save_ts_{sample_rate}_{num_channels}_{compression_level}.flac'
        )

        # Both implementations receive exactly the same positional arguments.
        shared_args = (expected, sample_rate, True, compression_level, None, None)
        py_save_func(eager_flac, *shared_args)
        scripted_save(scripted_flac, *shared_args)

        # The flac files have 24-bit depth, which scipy cannot handle, so they
        # are converted to 32-bit wav before being read back.
        eager_wav = f'{eager_flac}.wav'
        scripted_wav = f'{scripted_flac}.wav'
        sox_utils.convert_audio_file(eager_flac, eager_wav, bit_depth=32)
        sox_utils.convert_audio_file(scripted_flac, scripted_wav, bit_depth=32)

        eager_data, eager_rate = load_wav(eager_wav, normalize=True)
        scripted_data, scripted_rate = load_wav(scripted_wav, normalize=True)

        self.assertEqual(sample_rate, eager_rate)
        self.assertEqual(sample_rate, scripted_rate)
        self.assertEqual(expected, eager_data)
        self.assertEqual(expected, scripted_data)
예제 #2
0
    def assert_amr_nb(self, duration):
        """`sox_io_backend.load` can load amr-nb format.

        An amr-nb file is generated with sox and then read two ways: directly
        with ``sox_io_backend.load``, and indirectly by converting it to wav
        with sox and reading that wav with ``load_wav``. The sample rate must
        match the generated one and the two waveforms must agree within the
        given tolerances.
        """
        sample_rate, num_channels = 8000, 1
        amr_path = self.get_temp_path('1.original.amr-nb')
        wav_path = self.get_temp_path('2.reference.wav')

        # Produce the amr-nb input with sox.
        sox_utils.gen_audio_file(
            amr_path,
            sample_rate,
            num_channels,
            bit_depth=32,
            duration=duration,
        )
        # Reference branch: let sox turn the amr-nb file into wav.
        sox_utils.convert_audio_file(amr_path, wav_path)
        # Branch under test: load the amr-nb file with torchaudio.
        found, found_rate = sox_io_backend.load(amr_path)
        # Read the reference wav.
        reference = load_wav(wav_path)[0]

        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=4e-05, rtol=1.3e-06)
예제 #3
0
    def assert_vorbis(self, sample_rate, num_channels, quality_level,
                      duration):
        """`sox_io_backend.load` can load vorbis format.

        A vorbis file is generated with sox (``quality_level`` is passed as the
        ``compression`` argument) and then read two ways: directly with
        ``sox_io_backend.load``, and via a sox-converted wav read with
        ``load_wav``. The sample rate must match and both waveforms must agree
        within the given tolerances.
        """
        vorbis_path = self.get_temp_path('1.original.vorbis')
        wav_path = self.get_temp_path('2.reference.wav')

        # Produce the vorbis input with sox.
        sox_utils.gen_audio_file(
            vorbis_path,
            sample_rate,
            num_channels,
            compression=quality_level,
            bit_depth=16,
            duration=duration,
        )
        # Reference branch: let sox turn the vorbis file into wav.
        sox_utils.convert_audio_file(vorbis_path, wav_path)
        # Branch under test: load the vorbis file with torchaudio.
        found, found_rate = sox_io_backend.load(vorbis_path)
        # Read the reference wav.
        reference = load_wav(wav_path)[0]

        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=4e-05, rtol=1.3e-06)
예제 #4
0
    def test_opus(self, bitrate, num_channels, compression_level):
        """`sox_io_backend.load` can load opus file correctly.

        The opus asset is converted to wav with sox; the waveform and sample
        rate read from that wav serve as the reference for what
        ``sox_io_backend.load`` returns for the opus file itself.
        """
        opus_asset = get_asset_path('io', f'{bitrate}_{compression_level}_{num_channels}ch.opus')
        reference_wav = self.get_temp_path(f'{bitrate}_{compression_level}_{num_channels}ch.opus.wav')
        sox_utils.convert_audio_file(opus_asset, reference_wav)

        expected, expected_rate = load_wav(reference_wav)
        found, found_rate = sox_io_backend.load(opus_asset)

        assert expected_rate == found_rate
        self.assertEqual(expected, found)
예제 #5
0
    def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
        """`sox_io_backend.load` can load mp3 format.

        mp3 encoding introduces delay and boundary effects, so the reference
        data is derived from the mp3 file itself rather than from the signal
        that was encoded:

        1. Generate an mp3 file with sox.
        2. Convert that mp3 to wav with sox.
        3. Load the mp3 with torchaudio.
        4. Load the wav from step 2 with scipy.
        5. Compare the tensors from steps 3 and 4.

        Underlying assumptions:
        i. Conversion of mp3 to wav with sox preserves data.
        ii. Loading a wav file with scipy is correct.

        Together, steps 2 and 4 give reference mp3 data without using
        torchaudio.
        """
        mp3_path = self.get_temp_path('1.original.mp3')
        wav_path = self.get_temp_path('2.reference.wav')

        # Step 1: mp3 produced by sox (bit_rate goes in as `compression`).
        sox_utils.gen_audio_file(
            mp3_path,
            sample_rate,
            num_channels,
            compression=bit_rate,
            duration=duration,
        )
        # Step 2: reference wav produced by sox from the mp3.
        sox_utils.convert_audio_file(mp3_path, wav_path)
        # Step 3: the load under test.
        found, found_rate = sox_io_backend.load(mp3_path)
        # Step 4: reference data.
        reference = load_wav(wav_path)[0]

        # Step 5: compare.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=3e-03, rtol=1.3e-06)
예제 #6
0
    def assert_24bit_wav(self, sample_rate, num_channels, normalize, duration):
        """`sox_io_backend.load` can load 24-bit signed PCM wav format.

        Since torch does not support the ``int24`` dtype, the resulting tensor
        is implicitly cast to ``int32``. For the same reason ``get_wav_data``
        does not support 'int24', so the generic wav assertion cannot be used
        and this workaround is applied instead:

        1. Generate a 24-bit wav with sox.
        2. Convert the 24-bit wav to a 32-bit wav with sox.
        3. Load the 24-bit wav with torchaudio.
        4. Load the 32-bit wav with scipy.
        5. Compare the tensors from steps 3 and 4.

        Underlying assumptions:
        i. Sox properly converts from 24-bit to 32-bit.
        ii. Loading a 32-bit wav file with scipy is correct.
        """
        wav24_path = self.get_temp_path('1.original.wav')
        wav32_path = self.get_temp_path('2.reference.wav')

        # Step 1: 24-bit signed wav produced by sox.
        sox_utils.gen_audio_file(
            wav24_path,
            sample_rate,
            num_channels,
            bit_depth=24,
            duration=duration,
        )

        # Step 2: 32-bit reference wav produced by sox.
        sox_utils.convert_audio_file(wav24_path, wav32_path, bit_depth=32)
        # Step 3: the load under test.
        found, found_rate = sox_io_backend.load(wav24_path, normalize=normalize)
        # Step 4: reference data, read with the same `normalize` setting.
        reference = load_wav(wav32_path, normalize=normalize)[0]

        # Step 5: compare.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=3e-03, rtol=1.3e-06)
예제 #7
0
    def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
        """`sox_io_backend.save` can save mp3 format.

        mp3 encoding introduces delay and boundary effects, so the saved mp3
        is converted back to wav and compared against an mp3 that sox itself
        produced from the same source:

        1.   Generate the original wav file with scipy.
        2.1. Load it with scipy, then save it as mp3 with torchaudio.
        2.2. Convert that mp3 to wav with sox.
        2.3. Load the wav with scipy.
        3.1. Convert the original wav to mp3 with sox.
        3.2. Convert that mp3 to wav with sox.
        3.3. Load the wav with scipy.

        The tensors from 2.3 and 3.3 are then compared.
        """
        original_wav = self.get_temp_path('1.reference.wav')
        ta_mp3 = self.get_temp_path('2.1.torchaudio.mp3')
        ta_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_mp3 = self.get_temp_path('3.1.sox.mp3')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # 1. Write the source wav.
        waveform = get_wav_data(
            'float32',
            num_channels,
            normalize=True,
            num_frames=duration * sample_rate,
        )
        save_wav(original_wav, waveform, sample_rate)

        # 2.1-2.3. torchaudio branch: wav -> mp3 (torchaudio) -> wav (sox).
        source = load_wav(original_wav)[0]
        sox_io_backend.save(ta_mp3, source, sample_rate, compression=bit_rate)
        sox_utils.convert_audio_file(ta_mp3, ta_wav)
        found = load_wav(ta_wav)[0]

        # 3.1-3.3. sox branch: wav -> mp3 (sox) -> wav (sox).
        sox_utils.convert_audio_file(original_wav, sox_mp3, compression=bit_rate)
        sox_utils.convert_audio_file(sox_mp3, sox_wav)
        expected = load_wav(sox_wav)[0]

        self.assertEqual(found, expected)
예제 #8
0
    def assert_amb(self, dtype, sample_rate, num_channels, duration):
        """`sox_io_backend.save` can save amb format.

        A source wav of the given ``dtype`` is written, then encoded to amb
        twice: once with ``sox_io_backend.save`` and once with sox. Both amb
        files are converted back to wav with sox, loaded, and compared.
        """
        original_wav = self.get_temp_path('1.reference.wav')
        ta_amb = self.get_temp_path('2.1.torchaudio.amb')
        ta_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_amb = self.get_temp_path('3.1.sox.amb')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # 1. Write the source wav (un-normalized data of the requested dtype).
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=duration * sample_rate,
        )
        save_wav(original_wav, waveform, sample_rate)

        # 2.1-2.3. torchaudio branch: wav -> amb (torchaudio) -> wav (sox).
        source = load_wav(original_wav, normalize=False)[0]
        sox_io_backend.save(ta_amb, source, sample_rate)
        sox_utils.convert_audio_file(ta_amb, ta_wav)
        found = load_wav(ta_wav)[0]

        # 3.1-3.3. sox branch: wav -> amb (sox) -> wav (sox).
        sox_utils.convert_audio_file(original_wav, sox_amb)
        sox_utils.convert_audio_file(sox_amb, sox_wav)
        expected = load_wav(sox_wav)[0]

        self.assertEqual(found, expected)
예제 #9
0
    def assert_sphere(self, sample_rate, num_channels, duration):
        """`sox_io_backend.save` can save sph format.

        A float32 source wav is written, then encoded to sph twice: once with
        ``sox_io_backend.save`` and once with sox. Both sph files are converted
        back to 32-bit wav with sox, loaded, and compared.
        """
        original_wav = self.get_temp_path('1.reference.wav')
        ta_sph = self.get_temp_path('2.1.torchaudio.sph')
        ta_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_sph = self.get_temp_path('3.1.sox.sph')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # 1. Write the source wav.
        waveform = get_wav_data(
            'float32',
            num_channels,
            normalize=True,
            num_frames=duration * sample_rate,
        )
        save_wav(original_wav, waveform, sample_rate)

        # 2.1-2.3. torchaudio branch: wav -> sph (torchaudio) -> wav (sox).
        # The sph file has 24-bit depth, which scipy cannot handle, hence the
        # conversion to 32 bit.
        source = load_wav(original_wav)[0]
        sox_io_backend.save(ta_sph, source, sample_rate)
        sox_utils.convert_audio_file(ta_sph, ta_wav, bit_depth=32)
        found = load_wav(ta_wav)[0]

        # 3.1-3.3. sox branch: wav -> sph (sox) -> wav (sox), again to 32 bit
        # for the same reason.
        sox_utils.convert_audio_file(original_wav, sox_sph)
        sox_utils.convert_audio_file(sox_sph, sox_wav, bit_depth=32)
        expected = load_wav(sox_wav)[0]

        self.assertEqual(found, expected)
예제 #10
0
    def _assert_vorbis(self, sample_rate, num_channels, quality_level,
                       duration):
        """`sox_io_backend.save` can save vorbis format.

        An int16 source wav is written, then encoded to vorbis twice: once
        with ``sox_io_backend.save`` and once with sox (``quality_level`` is
        passed as ``compression`` in both cases). Both vorbis files are
        converted back to wav with sox, loaded, and compared.

        The comparison is tolerant: the test only fails when more than
        ``max_failure_allowed`` (a fraction of all samples) differ by more
        than ``atol``. Pass a reasonably long ``duration`` so that the
        boundary samples stay a small fraction of the total.
        """
        src_path = self.get_temp_path('1.reference.wav')
        vbs_path = self.get_temp_path('2.1.torchaudio.vorbis')
        wav_path = self.get_temp_path('2.2.torchaudio.wav')
        vbs_path_sox = self.get_temp_path('3.1.sox.vorbis')
        wav_path_sox = self.get_temp_path('3.2.sox.wav')

        # 1. Generate original wav
        data = get_wav_data('int16',
                            num_channels,
                            normalize=False,
                            num_frames=duration * sample_rate)
        save_wav(src_path, data, sample_rate)
        # 2.1. Convert the original wav to vorbis with torchaudio
        sox_io_backend.save(vbs_path,
                            load_wav(src_path)[0],
                            sample_rate,
                            compression=quality_level,
                            dtype=None)
        # 2.2. Convert the vorbis to wav with Sox
        sox_utils.convert_audio_file(vbs_path, wav_path)
        # 2.3. Load
        found = load_wav(wav_path)[0]

        # 3.1. Convert the original wav to vorbis with SoX
        sox_utils.convert_audio_file(src_path,
                                     vbs_path_sox,
                                     compression=quality_level)
        # 3.2. Convert the vorbis to wav with Sox
        sox_utils.convert_audio_file(vbs_path_sox, wav_path_sox)
        # 3.3. Load
        expected = load_wav(wav_path_sox)[0]

        # The element-wise difference below would silently broadcast if the
        # two results had different but broadcast-compatible shapes (e.g. one
        # of them lost a channel), which could let a wrong result pass.
        # Check the shapes explicitly first.
        if found.shape != expected.shape:
            self.fail(
                f'shape mismatch: torchaudio result {tuple(found.shape)} '
                f'vs sox result {tuple(expected.shape)}')

        # sox's vorbis encoding has some random boundary effect, which causes
        # a small number of samples to show a higher discrepancy than the
        # others, so a small portion of the data is allowed to fall outside of
        # the absolute tolerance.
        atol = 1.0e-4
        # Fraction of samples (0.01 == 1%) allowed to be outside of atol.
        max_failure_allowed = 0.01
        num_outliers = ((found - expected).abs() > atol).sum().item()
        failure_ratio = num_outliers / found.numel()
        if failure_ratio > max_failure_allowed:
            # Too many outliers: delegate to assertEqual, which gives a better
            # error message.
            self.assertEqual(found, expected, atol=atol, rtol=1.3e-6)
예제 #11
0
    def assert_format(
        self,
        format: str,
        sample_rate: float,
        num_channels: int,
        compression: float = None,
        bit_depth: int = None,
        duration: float = 1,
        normalize: bool = True,
        encoding: str = None,
        atol: float = 4e-05,
        rtol: float = 1.3e-06,
    ):
        """`sox_io_backend.load` can load given format correctly.

        File encodings introduce delay and boundary effects, so the reference
        data is derived from the encoded file itself:

        1. Generate a file in the given format with sox.
        2. Convert that file to wav with sox.
        3. Load the original file with torchaudio.
        4. Load the wav from step 2 with scipy.
        5. Compare the tensors from steps 3 and 4 using ``atol``/``rtol``.

        Underlying assumptions:
        i. Conversion of the given format to wav with sox preserves data.
        ii. Loading a wav file with scipy is correct.

        Together, steps 2 and 4 give reference data for the given format
        without using torchaudio.
        """
        encoded_path = self.get_temp_path(f'1.original.{format}')
        wav_path = self.get_temp_path('2.reference.wav')

        # Step 1: file in the requested format, produced by sox.
        sox_utils.gen_audio_file(
            encoded_path,
            sample_rate,
            num_channels,
            encoding=encoding,
            compression=compression,
            bit_depth=bit_depth,
            duration=duration,
        )

        # Step 2: reference wav produced by sox. A 24-bit input is widened to
        # 32 bit (the 24-bit wav case); otherwise no bit depth is specified.
        if bit_depth == 24:
            reference_bit_depth = 32
        else:
            reference_bit_depth = None
        sox_utils.convert_audio_file(
            encoded_path, wav_path, bit_depth=reference_bit_depth)

        # Step 3: the load under test.
        found, found_rate = sox_io_backend.load(encoded_path, normalize=normalize)
        # Step 4: reference data, read with the same `normalize` setting.
        reference = load_wav(wav_path, normalize=normalize)[0]

        # Step 5: compare.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=atol, rtol=rtol)
예제 #12
0
    def assert_save_consistency(
        self,
        format: str,
        *,
        compression: float = None,
        encoding: str = None,
        bits_per_sample: int = None,
        sample_rate: float = 8000,
        num_channels: int = 2,
        num_frames: float = 3 * 8000,
        src_dtype: str = 'int32',
        test_mode: str = "path",
    ):
        """`save` function produces file that is comparable with `sox` command

        To compare the file produced by the `save` function against the file
        produced by the equivalent `sox` command, both files have to be
        loaded. Many formats cannot be opened with common Python modules (like
        SciPy), so `sox` is used to convert the saved files into a format that
        SciPy can read (PCM wav). The two branches differ only in steps 2.1
        and 3.1:

        1.   Generate the source wav file with SciPy.
        2.1. Load it with SciPy, then save it into the target format with
             torchaudio (via a path, an open file object, or a BytesIO buffer,
             depending on ``test_mode``).
        2.2. Convert the result to wav with sox.
        2.3. Load that wav with SciPy.
        3.1. Convert the source wav to the target format with sox.
        3.2. Convert the result to wav with sox.
        3.3. Load that wav with SciPy.

        The tensors from 2.3 and 3.3 are then compared.

        This assumes that
         - loading data with SciPy preserves the data well.
         - converting the resulting files into WAV format with `sox` preserves
           the data well.

        Raises:
            ValueError: if ``test_mode`` is not "path", "fileobj" or "bytesio".
        """
        # Both branches are decoded into the same wav flavour for comparison.
        wav_options = {'encoding': 'floating-point', 'bit_depth': 32}

        source_wav = self.get_temp_path('1.source.wav')
        ta_file = self.get_temp_path(f'2.1.torchaudio.{format}')
        ta_wav = self.get_temp_path('2.2.result.wav')
        sox_file = self.get_temp_path(f'3.1.sox.{format}')
        sox_wav = self.get_temp_path('3.2.ref.wav')

        # 1. Generate original wav
        generated = get_wav_data(
            src_dtype,
            num_channels,
            normalize=False,
            num_frames=num_frames,
        )
        save_wav(source_wav, generated, sample_rate)

        # 2.1. Convert the original wav to target format with torchaudio
        waveform = load_wav(source_wav, normalize=False)[0]
        save_options = {
            'compression': compression,
            'encoding': encoding,
            'bits_per_sample': bits_per_sample,
        }
        if test_mode == "path":
            sox_io_backend.save(ta_file, waveform, sample_rate, **save_options)
        elif test_mode == "fileobj":
            with open(ta_file, 'bw') as file_:
                sox_io_backend.save(
                    file_, waveform, sample_rate, format=format, **save_options)
        elif test_mode == "bytesio":
            buffer = io.BytesIO()
            sox_io_backend.save(
                buffer, waveform, sample_rate, format=format, **save_options)
            # Dump the in-memory result to disk so sox can convert it below.
            buffer.seek(0)
            with open(ta_file, 'bw') as f:
                f.write(buffer.read())
        else:
            raise ValueError(f"Unexpected test mode: {test_mode}")

        # 2.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(ta_file, ta_wav, **wav_options)
        # 2.3. Load with SciPy
        found = load_wav(ta_wav, normalize=False)[0]

        # 3.1. Convert the original wav to target format with sox
        sox_utils.convert_audio_file(
            source_wav,
            sox_file,
            compression=compression,
            encoding=_get_sox_encoding(encoding),
            bit_depth=bits_per_sample,
        )
        # 3.2. Convert the target format to wav with sox
        sox_utils.convert_audio_file(sox_file, sox_wav, **wav_options)
        # 3.3. Load with SciPy
        expected = load_wav(sox_wav, normalize=False)[0]

        self.assertEqual(found, expected)