Exemplo n.º 1
0
    def assert_amb(self, dtype, sample_rate, num_channels, duration):
        """Check that `sox_io_backend.save` writes amb files like SoX does.

        The reference wav is encoded to amb twice -- once through torchaudio
        and once through the SoX helper -- and both amb files are converted
        back to wav so that the decoded tensors can be compared.
        """
        reference_wav = self.get_temp_path('1.reference.wav')
        torchaudio_amb = self.get_temp_path('2.1.torchaudio.amb')
        torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_amb = self.get_temp_path('3.1.sox.amb')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # Step 1: write the reference wav.
        frame_count = duration * sample_rate
        waveform = get_wav_data(
            dtype, num_channels, normalize=False, num_frames=frame_count)
        save_wav(reference_wav, waveform, sample_rate)

        # Step 2: wav -> amb (torchaudio) -> wav (SoX) -> tensor.
        source = load_wav(reference_wav, normalize=False)[0]
        sox_io_backend.save(torchaudio_amb, source, sample_rate)
        sox_utils.convert_audio_file(torchaudio_amb, torchaudio_wav)
        found = load_wav(torchaudio_wav)[0]

        # Step 3: wav -> amb (SoX) -> wav (SoX) -> tensor.
        sox_utils.convert_audio_file(reference_wav, sox_amb)
        sox_utils.convert_audio_file(sox_amb, sox_wav)
        expected = load_wav(sox_wav)[0]

        self.assertEqual(found, expected)
Exemplo n.º 2
0
    def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
        """Check that `sox_io_backend.save` writes mp3 files like SoX does.

        mp3 encoding introduces delay and boundary effects, so the encoded
        files are not compared directly.  Instead both mp3 files are decoded
        back to wav and the resulting tensors are compared:

        1.   Generate the reference wav.
        2.1. Load the reference wav and save it as mp3 with torchaudio.
        2.2. Convert that mp3 to wav with SoX.
        2.3. Load the wav                                  -> ``found``
        3.1. Convert the reference wav to mp3 with SoX.
        3.2. Convert that mp3 to wav with SoX.
        3.3. Load the wav                                  -> ``expected``
        """
        reference_wav = self.get_temp_path('1.reference.wav')
        torchaudio_mp3 = self.get_temp_path('2.1.torchaudio.mp3')
        torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_mp3 = self.get_temp_path('3.1.sox.mp3')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # Step 1: write the reference wav.
        frame_count = duration * sample_rate
        waveform = get_wav_data(
            'float32', num_channels, normalize=True, num_frames=frame_count)
        save_wav(reference_wav, waveform, sample_rate)

        # Step 2: wav -> mp3 (torchaudio) -> wav (SoX) -> tensor.
        source = load_wav(reference_wav)[0]
        sox_io_backend.save(
            torchaudio_mp3, source, sample_rate, compression=bit_rate)
        sox_utils.convert_audio_file(torchaudio_mp3, torchaudio_wav)
        found = load_wav(torchaudio_wav)[0]

        # Step 3: wav -> mp3 (SoX) -> wav (SoX) -> tensor.
        sox_utils.convert_audio_file(
            reference_wav, sox_mp3, compression=bit_rate)
        sox_utils.convert_audio_file(sox_mp3, sox_wav)
        expected = load_wav(sox_wav)[0]

        self.assertEqual(found, expected)
Exemplo n.º 3
0
    def assert_sphere(self, sample_rate, num_channels, duration):
        """Check that `sox_io_backend.save` writes sph files like SoX does.

        The reference wav is encoded to sph by torchaudio and by SoX; both
        sph files are converted back to wav and the decoded tensors compared.
        """
        reference_wav = self.get_temp_path('1.reference.wav')
        torchaudio_sph = self.get_temp_path('2.1.torchaudio.sph')
        torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_sph = self.get_temp_path('3.1.sox.sph')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # Step 1: write the reference wav.
        frame_count = duration * sample_rate
        waveform = get_wav_data(
            'float32', num_channels, normalize=True, num_frames=frame_count)
        save_wav(reference_wav, waveform, sample_rate)

        # Step 2: wav -> sph (torchaudio) -> wav (SoX) -> tensor.
        source = load_wav(reference_wav)[0]
        sox_io_backend.save(torchaudio_sph, source, sample_rate)
        # The sph file has 24 bit depth, which scipy cannot handle, so the
        # conversion back to wav asks for 32 bit.
        sox_utils.convert_audio_file(
            torchaudio_sph, torchaudio_wav, bit_depth=32)
        found = load_wav(torchaudio_wav)[0]

        # Step 3: wav -> sph (SoX) -> wav (SoX) -> tensor.
        sox_utils.convert_audio_file(reference_wav, sox_sph)
        # Same 24 bit -> 32 bit widening as in step 2.
        sox_utils.convert_audio_file(sox_sph, sox_wav, bit_depth=32)
        expected = load_wav(sox_wav)[0]

        self.assertEqual(found, expected)
Exemplo n.º 4
0
    def run_smoke_test(self,
                       ext,
                       sample_rate,
                       num_channels,
                       *,
                       compression=None,
                       dtype='float32'):
        """Run save -> info -> load on an in-memory buffer.

        Only the sample rate and channel count are checked; the audio content
        itself is not compared.
        """
        duration = 1
        waveform = get_wav_data(dtype,
                                num_channels,
                                normalize=False,
                                num_frames=sample_rate * duration)

        buffer = io.BytesIO()

        # save: encode the waveform into the buffer using format `ext`.
        sox_io_backend.save(buffer,
                            waveform,
                            sample_rate,
                            compression=compression,
                            format=ext)

        # info: rewind, then check the reported metadata.
        buffer.seek(0)
        metadata = sox_io_backend.info(buffer, format=ext)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_channels == num_channels

        # load: rewind again, then check rate and channel dimension.
        buffer.seek(0)
        decoded, decoded_rate = sox_io_backend.load(
            buffer, normalize=False, format=ext)
        assert decoded_rate == sample_rate
        assert decoded.shape[0] == num_channels
Exemplo n.º 5
0
    def run_smoke_test(self,
                       ext,
                       sample_rate,
                       num_channels,
                       *,
                       compression=None,
                       dtype='float32'):
        """Run save -> info -> load on a temporary file with extension `ext`.

        Only the sample rate and channel count are checked; the audio content
        itself is not compared.
        """
        duration = 1
        frame_count = sample_rate * duration
        target = self.get_temp_path(f'test.{ext}')
        waveform = get_wav_data(
            dtype, num_channels, normalize=False, num_frames=frame_count)

        # save
        sox_io_backend.save(
            target, waveform, sample_rate, compression=compression)

        # info: check the reported metadata.
        metadata = sox_io_backend.info(target)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_channels == num_channels

        # load: check rate and channel dimension.
        decoded, decoded_rate = sox_io_backend.load(target, normalize=False)
        assert decoded_rate == sample_rate
        assert decoded.shape[0] == num_channels
Exemplo n.º 6
0
    def test_bytesio(self, ext, compression):
        """Saving to a BytesIO object gives the same audio as saving to a path."""
        sample_rate = 16000
        dtype = 'float32'
        num_channels = 2
        num_frames = 16000
        channels_first = True

        waveform = get_wav_data(dtype, num_channels, num_frames=num_frames)

        ref_path = self.get_temp_path(f'reference.{ext}')
        res_path = self.get_temp_path(f'test.{ext}')

        # Keyword arguments shared by the path-based and buffer-based saves.
        common = dict(channels_first=channels_first,
                      sample_rate=sample_rate,
                      compression=compression)

        # Reference: save straight to a file path.
        sox_io_backend.save(ref_path, waveform, **common)

        # Candidate: save into memory, then dump the bytes to disk.
        buffer = io.BytesIO()
        sox_io_backend.save(buffer, waveform, format=ext, **common)
        with open(res_path, 'wb') as out:
            out.write(buffer.getvalue())

        expected_data, _ = sox_io_backend.load(ref_path)
        loaded_data, sr = sox_io_backend.load(res_path)

        assert sample_rate == sr
        self.assertEqual(expected_data, loaded_data)
Exemplo n.º 7
0
 def test_noncontiguous(self, dtype):
     """A noncontiguous tensor is written to wav with the right content."""
     wav_path = self.get_temp_path('data.wav')
     # Keep every other row and column; the assertion below guards that
     # the slice really is noncontiguous.
     strided = get_wav_data(dtype, 4)[::2, ::2]
     assert not strided.is_contiguous()
     sox_io_backend.save(wav_path, strided, 8000)
     loaded = load_wav(wav_path)[0]
     self.assertEqual(loaded, strided)
Exemplo n.º 8
0
 def assert_wav(self, dtype, sample_rate, num_channels, num_frames):
     """Check that `sox_io_backend.save` writes wav files that load back intact."""
     wav_path = self.get_temp_path('data.wav')
     reference = get_wav_data(dtype, num_channels, num_frames=num_frames)
     sox_io_backend.save(wav_path, reference, sample_rate)
     loaded, loaded_rate = load_wav(wav_path)
     # Both the sample rate and the samples must survive the round trip.
     assert sample_rate == loaded_rate
     self.assertEqual(loaded, reference)
Exemplo n.º 9
0
 def test_dtype_conversion(self, dtype, expected):
     """`save` performs dtype conversion on float32 src tensors only."""
     wav_path = self.get_temp_path("data.wav")
     # One float32 column covering the range from -1.0 to 1.0.
     samples = [-1.0, -0.5, 0, 0.5, 1.0]
     column = torch.tensor(samples).to(torch.float32).view(-1, 1)
     sox_io_backend.save(wav_path, column, 8000, dtype=dtype)
     loaded = load_wav(wav_path, normalize=False)[0]
     # `expected` is reshaped to a single column before comparison.
     self.assertEqual(loaded, expected.view(-1, 1))
Exemplo n.º 10
0
 def test_channels_first(self, channels_first):
     """`channels_first` controls which axis `save` treats as channels."""
     wav_path = self.get_temp_path('data.wav')
     waveform = get_wav_data('int32', 2, channels_first=channels_first)
     sox_io_backend.save(
         wav_path, waveform, 8000, channels_first=channels_first)
     loaded = load_wav(wav_path)[0]
     # With channels_first=False the input is compared after swapping its
     # two axes.
     if channels_first:
         reference = waveform
     else:
         reference = waveform.transpose(1, 0)
     self.assertEqual(loaded, reference)
Exemplo n.º 11
0
    def test_tensor_preserve(self, dtype):
        """`save` must leave the tensor passed to it unmodified."""
        wav_path = self.get_temp_path('data.wav')
        reference = get_wav_data(dtype, 4)[::2, ::2]

        # Save a clone so the reference stays available for comparison.
        saved = reference.clone()
        sox_io_backend.save(wav_path, saved, 8000)

        self.assertEqual(saved, reference)
Exemplo n.º 12
0
 def test_save_noncontiguous(self, dtype):
     """A noncontiguous tensor is written to wav with the right content."""
     wav_path = self.get_temp_path('data.wav')
     encoding, bits = get_enc_params(dtype)
     # Keep every other row and column; the assertion below guards that
     # the slice really is noncontiguous.
     strided = get_wav_data(dtype, 4, normalize=False)[::2, ::2]
     assert not strided.is_contiguous()
     sox_io_backend.save(wav_path,
                         strided,
                         8000,
                         encoding=encoding,
                         bits_per_sample=bits)
     loaded = load_wav(wav_path, normalize=False)[0]
     self.assertEqual(loaded, strided)
Exemplo n.º 13
0
 def test_save_fail(self):
     """
     When attempted to save into a non-existing dir, error message must contain the file path.

     The expected message is matched as a regular expression, so the path
     is escaped before being interpolated into the pattern.
     """
     import re  # local import; only this test needs it

     path = os.path.join("non_existing_directory", "foo.wav")
     # Without escaping, "." would match any character and a Windows path
     # separator ("\f" in "...directory\foo.wav") would be read as a regex
     # escape sequence, so the pattern would not match the literal path.
     pattern = "^Error saving audio file: failed to open file {0}$".format(
         re.escape(path))
     with self.assertRaisesRegex(RuntimeError, pattern):
         sox_io_backend.save(path, torch.zeros(1, 1), 8000)
Exemplo n.º 14
0
 def test_wav(self, dtype, sample_rate, num_channels):
     """Repeated save/load cycles must not degrade wav data."""
     original = get_wav_data(dtype, num_channels, normalize=False)
     current = original
     # Each cycle saves the tensor loaded by the previous one, so any
     # loss would accumulate and show up in the comparison.
     for cycle in range(10):
         wav_path = self.get_temp_path(f'{cycle}.wav')
         sox_io_backend.save(wav_path, current, sample_rate)
         current, loaded_rate = sox_io_backend.load(
             wav_path, normalize=False)
         assert loaded_rate == sample_rate
         self.assertEqual(original, current)
Exemplo n.º 15
0
 def test_flac(self, sample_rate, num_channels, compression_level):
     """Repeated save/load cycles must not degrade flac data."""
     original = get_wav_data('float32', num_channels)
     current = original
     # Each cycle saves the tensor loaded by the previous one, so any
     # loss would accumulate and show up in the comparison.
     for cycle in range(10):
         flac_path = self.get_temp_path(f'{cycle}.flac')
         sox_io_backend.save(
             flac_path, current, sample_rate, compression=compression_level)
         current, loaded_rate = sox_io_backend.load(flac_path)
         assert loaded_rate == sample_rate
         self.assertEqual(original, current)
Exemplo n.º 16
0
    def _assert_vorbis(self, sample_rate, num_channels, quality_level,
                       duration):
        """Check that `sox_io_backend.save` writes vorbis files like SoX does.

        The reference wav is encoded to vorbis by torchaudio and by SoX; both
        files are converted back to wav and the decoded tensors compared.
        A small fraction of samples may differ by more than the tolerance.
        """
        reference_wav = self.get_temp_path('1.reference.wav')
        torchaudio_vbs = self.get_temp_path('2.1.torchaudio.vorbis')
        torchaudio_wav = self.get_temp_path('2.2.torchaudio.wav')
        sox_vbs = self.get_temp_path('3.1.sox.vorbis')
        sox_wav = self.get_temp_path('3.2.sox.wav')

        # Step 1: write the reference wav.
        frame_count = duration * sample_rate
        waveform = get_wav_data(
            'int16', num_channels, normalize=False, num_frames=frame_count)
        save_wav(reference_wav, waveform, sample_rate)

        # Step 2: wav -> vorbis (torchaudio) -> wav (SoX) -> tensor.
        source = load_wav(reference_wav)[0]
        sox_io_backend.save(torchaudio_vbs,
                            source,
                            sample_rate,
                            compression=quality_level,
                            dtype=None)
        sox_utils.convert_audio_file(torchaudio_vbs, torchaudio_wav)
        found = load_wav(torchaudio_wav)[0]

        # Step 3: wav -> vorbis (SoX) -> wav (SoX) -> tensor.
        sox_utils.convert_audio_file(
            reference_wav, sox_vbs, compression=quality_level)
        sox_utils.convert_audio_file(sox_vbs, sox_wav)
        expected = load_wav(sox_wav)[0]

        # SoX's vorbis encoding has some random boundary effect, which makes
        # a small number of samples show a higher discrepancy than the rest.
        # A small portion of the data is therefore allowed to fall outside
        # the absolute tolerance; callers should pass a fairly long duration.
        atol = 1.0e-4
        max_failure_allowed = 0.01  # fraction of samples allowed outside atol
        outlier_count = ((found - expected).abs() > atol).sum().item()
        failure_ratio = outlier_count / found.numel()
        if failure_ratio > max_failure_allowed:
            # Already known to have failed; assertEqual is called only
            # because it gives a better error message.
            self.assertEqual(found, expected, atol=atol, rtol=1.3e-6)
Exemplo n.º 17
0
def write_tar_file(data_list,
                   no_segments,
                   tar_file,
                   resample=16000,
                   index=0,
                   total=1):
    """Write every utterance in ``data_list`` into the tar archive ``tar_file``.

    For each item two members are added to the archive: ``<key>.txt`` with
    the UTF-8 encoded transcript and ``<key>.<suffix>`` with the audio bytes.

    Args:
        data_list: Items of the form ``(key, txt, wav)`` when ``no_segments``
            is true, otherwise ``(key, txt, wav, start, end)``.  ``start`` and
            ``end`` are multiplied by the file's sample rate to get frame
            offsets.
        no_segments: When true the audio file is copied into the archive
            byte-for-byte.  Otherwise the segment is cut from the first
            channel, resampled to ``resample`` when the rates differ, and
            stored as 16-bit wav.
        tar_file: Path of the archive to create.
        resample: Target sample rate for segmented audio.
        index: Position of this archive; used only in the log message.
        total: Number of archives; used only in the log message.

    Raises:
        AssertionError: If an audio suffix is not in ``AUDIO_FORMAT_SETS`` or
            a transcript is not a ``str``.
    """
    logging.info('Processing {} {}/{}'.format(tar_file, index, total))
    read_time = 0.0
    save_time = 0.0
    write_time = 0.0
    with tarfile.open(tar_file, "w") as tar:
        # Consecutive segments of the same file reuse the loaded waveform.
        prev_wav = None
        for item in data_list:
            if no_segments:
                key, txt, wav = item
            else:
                key, txt, wav, start, end = item

            suffix = wav.split('.')[-1]
            assert suffix in AUDIO_FORMAT_SETS
            if no_segments:
                ts = time.time()
                with open(wav, 'rb') as fin:
                    data = fin.read()
                read_time += (time.time() - ts)
            else:
                if wav != prev_wav:
                    ts = time.time()
                    waveforms, sample_rate = sox.load(wav, normalize=False)
                    read_time += (time.time() - ts)
                    prev_wav = wav
                start = int(start * sample_rate)
                end = int(end * sample_rate)
                # Only the first channel is kept.
                audio = waveforms[:1, start:end]

                # resample
                if sample_rate != resample:
                    if not audio.is_floating_point():
                        # normalize the audio before resample
                        # because resample can't process int audio
                        # NOTE(review): the 1 << 15 scale presumes 16-bit
                        # samples -- confirm for other integer depths.
                        audio = audio / (1 << 15)
                        audio = torchaudio.transforms.Resample(
                            sample_rate, resample)(audio)
                        # Resampling can overshoot the original range, and
                        # casting an out-of-range float to int16 wraps
                        # around, so clamp to the int16 limits first.
                        audio = (audio * (1 << 15)).clamp(
                            -(1 << 15), (1 << 15) - 1).short()
                    else:
                        audio = torchaudio.transforms.Resample(
                            sample_rate, resample)(audio)

                ts = time.time()
                f = io.BytesIO()
                sox.save(f, audio, resample, format="wav", bits_per_sample=16)
                # Save to wav for segments file
                suffix = "wav"
                f.seek(0)
                data = f.read()
                save_time += (time.time() - ts)

            assert isinstance(txt, str)
            ts = time.time()
            txt_file = key + '.txt'
            txt = txt.encode('utf8')
            txt_data = io.BytesIO(txt)
            txt_info = tarfile.TarInfo(txt_file)
            txt_info.size = len(txt)
            tar.addfile(txt_info, txt_data)

            wav_file = key + '.' + suffix
            wav_data = io.BytesIO(data)
            wav_info = tarfile.TarInfo(wav_file)
            wav_info.size = len(data)
            tar.addfile(wav_info, wav_data)
            write_time += (time.time() - ts)
        logging.info('read {} save {} write {}'.format(read_time, save_time,
                                                       write_time))
Exemplo n.º 18
0
    def assert_save_consistency(
        self,
        format: str,
        *,
        compression: float = None,
        encoding: str = None,
        bits_per_sample: int = None,
        sample_rate: float = 8000,
        num_channels: int = 2,
        num_frames: float = 3 * 8000,
        src_dtype: str = 'int32',
        test_mode: str = "path",
    ):
        """Check that `save` produces a file comparable to the `sox` command's.

        Many target formats cannot be read back directly by the test
        helpers, so both the file written by `save` and the file written by
        `sox` are converted to 32-bit floating-point wav with `sox` before
        being loaded and compared.  The two pipelines differ only in the
        first encoding step (2.1 vs 3.1):

        1.   Generate the source wav.
        2.1. Load the source wav and save it in the target format with
             torchaudio (via path, file object or BytesIO per `test_mode`).
        2.2. Convert the result to wav with sox.
        2.3. Load the wav                                  -> ``found``
        3.1. Convert the source wav to the target format with sox.
        3.2. Convert the result to wav with sox.
        3.3. Load the wav                                  -> ``expected``

        This assumes that loading wav files with the helper and converting
        files to wav with `sox` both preserve the data well.

        Raises:
            ValueError: If `test_mode` is not "path", "fileobj" or "bytesio".
        """
        cmp_encoding = 'floating-point'
        cmp_bit_depth = 32

        src_path = self.get_temp_path('1.source.wav')
        tgt_path = self.get_temp_path(f'2.1.torchaudio.{format}')
        tst_path = self.get_temp_path('2.2.result.wav')
        sox_path = self.get_temp_path(f'3.1.sox.{format}')
        ref_path = self.get_temp_path('3.2.ref.wav')

        # Step 1: generate the source wav.
        generated = get_wav_data(
            src_dtype, num_channels, normalize=False, num_frames=num_frames)
        save_wav(src_path, generated, sample_rate)

        # Step 2.1: encode with torchaudio through the requested interface.
        waveform = load_wav(src_path, normalize=False)[0]
        # Options shared by every save variant below.
        save_options = dict(compression=compression,
                            encoding=encoding,
                            bits_per_sample=bits_per_sample)
        if test_mode == "path":
            sox_io_backend.save(tgt_path, waveform, sample_rate,
                                **save_options)
        elif test_mode == "fileobj":
            with open(tgt_path, 'bw') as file_:
                sox_io_backend.save(file_,
                                    waveform,
                                    sample_rate,
                                    format=format,
                                    **save_options)
        elif test_mode == "bytesio":
            buffer = io.BytesIO()
            sox_io_backend.save(buffer,
                                waveform,
                                sample_rate,
                                format=format,
                                **save_options)
            # Dump the in-memory result to disk so sox can read it.
            with open(tgt_path, 'bw') as out:
                out.write(buffer.getvalue())
        else:
            raise ValueError(f"Unexpected test mode: {test_mode}")

        # Step 2.2 / 2.3: decode the torchaudio output to wav and load it.
        sox_utils.convert_audio_file(tgt_path,
                                     tst_path,
                                     encoding=cmp_encoding,
                                     bit_depth=cmp_bit_depth)
        found = load_wav(tst_path, normalize=False)[0]

        # Step 3.1: encode the same source with sox.
        sox_encoding = _get_sox_encoding(encoding)
        sox_utils.convert_audio_file(src_path,
                                     sox_path,
                                     compression=compression,
                                     encoding=sox_encoding,
                                     bit_depth=bits_per_sample)

        # Step 3.2 / 3.3: decode the sox output to wav and load it.
        sox_utils.convert_audio_file(sox_path,
                                     ref_path,
                                     encoding=cmp_encoding,
                                     bit_depth=cmp_bit_depth)
        expected = load_wav(ref_path, normalize=False)[0]

        self.assertEqual(found, expected)