Пример #1
0
    def test_bytesio(self, ext, compression):
        """Audio saved into a BytesIO buffer equals audio saved via a file path.

        The same waveform is written once to a path and once to an in-memory
        buffer. The buffer content is then dumped to disk and both files are
        loaded back and compared.
        """
        sample_rate = 16000
        waveform = get_wav_data('float32', 2, num_frames=16000)
        common_kwargs = dict(
            channels_first=True,
            sample_rate=sample_rate,
            compression=compression,
        )

        ref_path = self.get_temp_path(f'reference.{ext}')
        res_path = self.get_temp_path(f'test.{ext}')

        # Reference: save straight to a file path.
        sox_io_backend.save(ref_path, waveform, **common_kwargs)

        # Under test: save to memory; the format is passed explicitly here.
        buffer = io.BytesIO()
        sox_io_backend.save(buffer, waveform, format=ext, **common_kwargs)

        # Persist the in-memory bytes so they can be loaded like the reference.
        with open(res_path, 'wb') as sink:
            sink.write(buffer.getvalue())

        reference, _ = sox_io_backend.load(ref_path)
        result, result_rate = sox_io_backend.load(res_path)

        assert sample_rate == result_rate
        self.assertEqual(reference, result)
Пример #2
0
 def test_load_fail(self):
     """Loading a non-existing file must raise an error that names the path.

     The expected message is matched as an anchored regular expression, so
     the path is escaped first; otherwise the ``.`` in the file name would
     match any character and the check would be looser than intended.
     """
     import re  # local import: only needed to escape the path below

     path = "non_existing_audio.wav"
     expected_pattern = (
         "^Error loading audio file: failed to open file {0}$".format(
             re.escape(path)))
     with self.assertRaisesRegex(RuntimeError, expected_pattern):
         sox_io_backend.load(path)
Пример #3
0
    def assert_flac(self, sample_rate, num_channels, compression_level,
                    duration):
        """Check that `sox_io_backend.load` decodes flac files.

        Follows the same strategy as the mp3 check: a flac file generated by
        sox is converted to wav by sox, and the flac loaded through torchaudio
        is compared against the wav loaded through the reference wav loader.
        """
        flac_path = self.get_temp_path('1.original.flac')
        wav_path = self.get_temp_path('2.reference.wav')

        # Step 1: produce the flac input with sox.
        sox_utils.gen_audio_file(
            flac_path,
            sample_rate,
            num_channels,
            compression=compression_level,
            bit_depth=16,
            duration=duration,
        )
        # Step 2: derive the wav reference with sox.
        sox_utils.convert_audio_file(flac_path, wav_path)
        # Step 3: decode the flac with torchaudio.
        found, found_rate = sox_io_backend.load(flac_path)
        # Step 4: read the wav reference.
        reference = load_wav(wav_path)[0]
        # Step 5: both must agree within tolerance.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=4e-05, rtol=1.3e-06)
Пример #4
0
    def run_smoke_test(self, ext, sample_rate, num_channels, *,
                       compression=None, dtype='float32'):
        """Run save -> info -> load on a file path and check basic metadata.

        Only the sample rate and the channel count are verified; the audio
        content itself is not compared.
        """
        seconds = 1
        target = self.get_temp_path(f'test.{ext}')
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=sample_rate * seconds,
        )

        # save
        sox_io_backend.save(target, waveform, sample_rate, compression=compression)

        # info
        metadata = sox_io_backend.info(target)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_channels == num_channels

        # load
        reloaded, reloaded_rate = sox_io_backend.load(target, normalize=False)
        assert reloaded_rate == sample_rate
        assert reloaded.shape[0] == num_channels
Пример #5
0
    def run_smoke_test(self, ext, sample_rate, num_channels, *,
                       compression=None, dtype='float32'):
        """Run save -> info -> load on an in-memory buffer and check metadata.

        The buffer is rewound before each read operation. Only the sample rate
        and the channel count are verified.
        """
        seconds = 1
        waveform = get_wav_data(
            dtype,
            num_channels,
            normalize=False,
            num_frames=sample_rate * seconds,
        )
        buffer = io.BytesIO()

        # save
        sox_io_backend.save(
            buffer, waveform, sample_rate, compression=compression, format=ext)

        # info (read from the start of the buffer)
        buffer.seek(0)
        metadata = sox_io_backend.info(buffer, format=ext)
        assert metadata.sample_rate == sample_rate
        assert metadata.num_channels == num_channels

        # load (rewind again, info consumed the stream)
        buffer.seek(0)
        reloaded, reloaded_rate = sox_io_backend.load(
            buffer, normalize=False, format=ext)
        assert reloaded_rate == sample_rate
        assert reloaded.shape[0] == num_channels
Пример #6
0
    def test_requests(self, ext, compression):
        """Loading from a streamed HTTP response matches loading from a path."""
        sample_rate = 16000
        # The format is only passed explicitly for mp3.
        fmt = ext if ext == 'mp3' else None
        file_name = f'test.{ext}'
        local_path = self.get_temp_path(file_name)

        sox_utils.gen_audio_file(
            local_path,
            sample_rate,
            num_channels=2,
            compression=compression,
        )
        reference, _ = sox_io_backend.load(local_path)

        with requests.get(self.get_url(file_name), stream=True) as response:
            streamed, streamed_rate = sox_io_backend.load(response.raw, format=fmt)

        assert streamed_rate == sample_rate
        self.assertEqual(reference, streamed)
Пример #7
0
    def assert_vorbis(self, sample_rate, num_channels, quality_level,
                      duration):
        """Check that `sox_io_backend.load` decodes vorbis files.

        Follows the same strategy as the mp3 check: the vorbis file generated
        by sox is converted to wav by sox, and the torchaudio result is
        compared against the wav loaded through the reference wav loader.
        """
        stem = f'{sample_rate}_{num_channels}_{quality_level}_{duration}'
        vorbis_path = self.get_temp_path(f'{stem}.vorbis')
        wav_path = f'{vorbis_path}.wav'

        # Step 1: produce the vorbis input with sox.
        sox_utils.gen_audio_file(
            vorbis_path,
            sample_rate,
            num_channels,
            compression=quality_level,
            bit_depth=16,
            duration=duration,
        )
        # Step 2: derive the wav reference with sox.
        sox_utils.convert_audio_file(vorbis_path, wav_path)
        # Step 3: decode the vorbis file with torchaudio.
        found, found_rate = sox_io_backend.load(vorbis_path)
        # Step 4: read the wav reference.
        reference = load_wav(wav_path)[0]
        # Step 5: both must agree within tolerance.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=4e-05, rtol=1.3e-06)
Пример #8
0
    def assert_amb(self, dtype, sample_rate, num_channels, normalize,
                   duration):
        """Check that `sox_io_backend.load` decodes amb files.

        Follows the same strategy as the mp3 check: the amb file generated by
        sox is converted to wav by sox, and the torchaudio result is compared
        against the wav loaded through the reference wav loader.
        """
        amb_path = self.get_temp_path('1.original.amb')
        wav_path = self.get_temp_path('2.reference.wav')

        # Step 1: produce the amb input with sox; encoding and bit depth are
        # derived from the requested dtype.
        sox_utils.gen_audio_file(
            amb_path,
            sample_rate,
            num_channels,
            encoding=sox_utils.get_encoding(dtype),
            bit_depth=sox_utils.get_bit_depth(dtype),
            duration=duration,
        )
        # Step 2: derive the wav reference with sox.
        sox_utils.convert_audio_file(amb_path, wav_path)
        # Step 3: decode the amb file with torchaudio.
        found, found_rate = sox_io_backend.load(amb_path, normalize=normalize)
        # Step 4: read the wav reference with the same normalization setting.
        reference = load_wav(wav_path, normalize=normalize)[0]
        # Step 5: both must agree within tolerance.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=4e-05, rtol=1.3e-06)
Пример #9
0
    def test_fileobj(self, ext, compression):
        """Loading through an open file object matches loading through a path."""
        sample_rate = 16000
        # The format is only passed explicitly for mp3.
        fmt = ext if ext == 'mp3' else None
        audio_path = self.get_temp_path(f'test.{ext}')

        sox_utils.gen_audio_file(
            audio_path,
            sample_rate,
            num_channels=2,
            compression=compression,
        )
        reference, _ = sox_io_backend.load(audio_path)

        with open(audio_path, 'rb') as handle:
            from_handle, handle_rate = sox_io_backend.load(handle, format=fmt)

        assert handle_rate == sample_rate
        self.assertEqual(reference, from_handle)
Пример #10
0
 def test_channels_first(self, channels_first):
     """`channels_first=False` yields the transposed layout of the original."""
     loaded, _ = sox_io_backend.load(
         self.path, channels_first=channels_first)
     if channels_first:
         reference = self.original
     else:
         reference = self.original.transpose(1, 0)
     self.assertEqual(loaded, reference)
Пример #11
0
 def _make_file(self, format_):
     """Create an audio file of the given format, then strip its extension.

     The file is generated with its extension, loaded once to capture the
     reference data in ``self.original``, and finally renamed to the
     extension-less path stored in ``self.path``.
     """
     rate = 8000
     with_ext = self.get_temp_path(f'test.{format_}')
     sox_utils.gen_audio_file(f'{with_ext}', rate, num_channels=2)
     self.original = sox_io_backend.load(with_ext)[0]
     without_ext, _ = os.path.splitext(with_ext)
     self.path = without_ext
     os.rename(with_ext, self.path)
Пример #12
0
 def test_wav(self, dtype, sample_rate, num_channels):
     """Repeated save/load cycles must leave wav data unchanged."""
     original = get_wav_data(dtype, num_channels, normalize=False)
     current = original
     # Feed each loaded result into the next save to expose accumulated drift.
     for round_no in range(10):
         wav_path = self.get_temp_path(f'{round_no}.wav')
         sox_io_backend.save(wav_path, current, sample_rate)
         current, loaded_rate = sox_io_backend.load(wav_path, normalize=False)
         assert loaded_rate == sample_rate
         self.assertEqual(original, current)
Пример #13
0
    def test_bytesio_clogged(self, ext, compression):
        """Loading through a clogged file object matches loading through a path.

        Covers the case where the file object returns fewer bytes than
        requested.
        """
        sample_rate = 16000
        # The format is only passed explicitly for mp3.
        fmt = ext if ext == 'mp3' else None
        audio_path = self.get_temp_path(f'test.{ext}')

        sox_utils.gen_audio_file(
            audio_path,
            sample_rate,
            num_channels=2,
            compression=compression,
        )
        reference, _ = sox_io_backend.load(audio_path)

        # Read the whole file into memory and wrap it in the clogged reader.
        with open(audio_path, 'rb') as handle:
            clogged = CloggedFileObj(io.BytesIO(handle.read()))
        from_clogged, clogged_rate = sox_io_backend.load(clogged, format=fmt)

        assert clogged_rate == sample_rate
        self.assertEqual(reference, from_clogged)
Пример #14
0
    def test_opus(self, bitrate, num_channels, compression_level):
        """`sox_io_backend.load` decodes opus assets like their wav conversion."""
        stem = f'{bitrate}_{compression_level}_{num_channels}ch.opus'
        opus_asset = get_asset_path('io', stem)
        wav_copy = self.get_temp_path(f'{stem}.wav')

        # Build the wav reference from the opus asset.
        sox_utils.convert_audio_file(opus_asset, wav_copy)

        reference, reference_rate = load_wav(wav_copy)
        decoded, decoded_rate = sox_io_backend.load(opus_asset)

        assert reference_rate == decoded_rate
        self.assertEqual(reference, decoded)
Пример #15
0
    def test_tarfile(self, ext, compression):
        """Loading a tar member via its file-like object matches loading via path."""
        sample_rate = 16000
        # The format is only passed explicitly for mp3.
        fmt = ext if ext == 'mp3' else None
        member_name = f'test.{ext}'
        member_path = self.get_temp_path(member_name)
        tar_path = self.get_temp_path('archive.tar.gz')

        sox_utils.gen_audio_file(
            member_path,
            sample_rate,
            num_channels=2,
            compression=compression,
        )
        reference, _ = sox_io_backend.load(member_path)

        # Pack the audio file into an archive ...
        with tarfile.TarFile(tar_path, 'w') as archive:
            archive.add(member_path, arcname=member_name)
        # ... and load it back through the extracted member object.
        with tarfile.TarFile(tar_path, 'r') as archive:
            member = archive.extractfile(member_name)
            from_tar, tar_rate = sox_io_backend.load(member, format=fmt)

        assert tar_rate == sample_rate
        self.assertEqual(reference, from_tar)
Пример #16
0
    def assert_wav(self, dtype, sample_rate, num_channels, normalize, duration):
        """Check that `sox_io_backend.load` reads wav files correctly.

        The result of the sox_io backend must match what the reference wav
        loader returns for the same file.
        """
        wav_path = self.get_temp_path('reference.wav')
        generated = get_wav_data(
            dtype,
            num_channels,
            normalize=normalize,
            num_frames=duration * sample_rate,
        )
        save_wav(wav_path, generated, sample_rate)

        reference = load_wav(wav_path, normalize=normalize)[0]
        found, found_rate = sox_io_backend.load(wav_path, normalize=normalize)

        assert found_rate == sample_rate
        self.assertEqual(found, reference)
Пример #17
0
    def test_mp3(self):
        """An mp3 file without extension loads when the format is given.

        libsox does not check the header for mp3, see
        https://github.com/pytorch/audio/issues/1040

        The asset was generated with:
            ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
        """
        asset = get_asset_path("mp3_without_ext")
        _, loaded_rate = sox_io_backend.load(asset, format="mp3")
        assert loaded_rate == 16000
Пример #18
0
 def test_flac(self, sample_rate, num_channels, compression_level):
     """Repeated save/load cycles must leave flac data unchanged."""
     original = get_wav_data('float32', num_channels)
     current = original
     # Feed each loaded result into the next save to expose accumulated drift.
     for round_no in range(10):
         flac_path = self.get_temp_path(f'{round_no}.flac')
         sox_io_backend.save(
             flac_path, current, sample_rate, compression=compression_level)
         current, loaded_rate = sox_io_backend.load(flac_path)
         assert loaded_rate == sample_rate
         self.assertEqual(original, current)
Пример #19
0
    def __init__(self, path, streams, img_size):
        """Set up video and audio readers for one media file.

        The audio track is re-encoded by ffmpeg into ``path + '.wav'`` and is
        later read in chunks sized to one video frame.

        Args:
            path: Path of the media file. It is handed to ffmpeg and to
                ``torchvision.io.VideoReader``.
            streams: One of ``'audioaudio'``, ``'videovideo'``,
                ``'audiovideo'`` or ``'videoaudio'``; selects which streamer
                method is bound to ``self.streams``. Any other value leaves
                ``self.streams`` unset, as before.
            img_size: Size passed to ``torchvision.transforms.Resize``.
        """
        import subprocess  # local import: used only for the ffmpeg call

        # Having this helper index makes everything go incredibly fast
        self.curr_index = -1

        wav_path = path + '.wav'

        # To ensure playback and whatnot we re-encode to just audio using the
        # native ffmpeg. The command is passed as an argument list so that
        # paths containing spaces or shell metacharacters are handled safely
        # (no shell parsing). As before, ffmpeg's exit status is ignored
        # (``-n`` makes it refuse to overwrite an existing output file).
        try:
            subprocess.run([
                'ffmpeg', '-n', '-i', path,
                '-acodec', 'pcm_s16le', '-ar', '44100', wav_path,
            ])
        except FileNotFoundError:
            # ffmpeg is not installed. The previous shell-based call did not
            # raise in this case either; a missing wav file surfaces below
            # when its metadata is read.
            pass

        # Video loading is very simple
        self.video_reader = torchvision.io.VideoReader(path, 'video')
        self.visual_info = self.video_reader.get_metadata()['video']

        # Since in general, video fps <<< audio fps, we want to sync the audio
        # to the video, so the audio initialization is more complicated.
        # Query the wav metadata once instead of once per field.
        wav_info = sox_io.info(wav_path)
        self.audio_info = {
            'sample_rate': wav_info.sample_rate,
            'num_frames': wav_info.num_frames,
            'num_channels': wav_info.num_channels,
        }

        # Recall that samples/second * (second/frame) = samples/frame
        self.a_v_ratio = int(self.audio_info['sample_rate'] /
                             self.visual_info['fps'][0])
        # Reads the audio samples that belong to video frame ``index``.
        self.audio_reader = lambda index: sox_io.load(wav_path,
                                                      index * self.a_v_ratio,
                                                      self.a_v_ratio,
                                                      normalize=True)

        # Wrapper to make the iteration much more simple
        # TODO: Put this in a function
        self.resize = torchvision.transforms.Resize(img_size)

        # Bind the requested streamer. The attribute is looked up only for the
        # matching key, so unrelated streamers are never touched.
        streamer_by_key = (
            ('audioaudio', 'audio_streamer'),
            ('videovideo', 'video_streamer'),
            ('audiovideo', 'audio_video_streamer'),
            ('videoaudio', 'video_audio_streamer'),
        )
        for key, attr_name in streamer_by_key:
            if streams == key:
                self.streams = getattr(self, attr_name)
                break
Пример #20
0
    def test_frame(self, frame_offset, num_frames):
        """`frame_offset` and `num_frames` select the expected region when streaming."""
        sample_rate = 8000
        file_name = 'test.wav'
        local_path = self.get_temp_path(file_name)

        original = get_wav_data('float32', num_channels=2)
        save_wav(local_path, original, sample_rate)

        # num_frames == -1 means "until the end", i.e. an open-ended slice.
        if num_frames == -1:
            stop = None
        else:
            stop = frame_offset + num_frames
        reference = original[:, frame_offset:stop]

        with requests.get(self.get_url(file_name), stream=True) as response:
            streamed, streamed_rate = sox_io_backend.load(
                response.raw, frame_offset, num_frames)

        assert streamed_rate == sample_rate
        self.assertEqual(reference, streamed)
Пример #21
0
    def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
        """Check that `sox_io_backend.load` decodes mp3 files.

        mp3 encoding introduces delay and boundary effects, so the reference
        is a wav file created from the mp3 itself:

         x
         |
         | 1. Generate mp3 with Sox
         |
         v    2. Convert to wav with Sox
        mp3 ------------------------------> wav
         |                                   |
         | 3. Load with torchaudio           | 4. Load with scipy
         |                                   |
         v                                   v
        tensor ----------> x <----------- tensor
                       5. Compare

        Assumptions:
        i. Converting mp3 to wav with Sox preserves the data.
        ii. Loading the wav file with scipy is correct.

        Together, steps 2 and 4 provide reference mp3 data without relying on
        torchaudio.
        """
        stem = f'{sample_rate}_{num_channels}_{bit_rate}_{duration}'
        mp3_path = self.get_temp_path(f'{stem}.mp3')
        wav_path = f'{mp3_path}.wav'

        # Step 1: produce the mp3 input with sox.
        sox_utils.gen_audio_file(
            mp3_path,
            sample_rate,
            num_channels,
            compression=bit_rate,
            duration=duration,
        )
        # Step 2: derive the wav reference with sox.
        sox_utils.convert_audio_file(mp3_path, wav_path)
        # Step 3: decode the mp3 with torchaudio.
        found, found_rate = sox_io_backend.load(mp3_path)
        # Step 4: read the wav reference.
        reference = load_wav(wav_path)[0]
        # Step 5: both must agree within tolerance.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=3e-03, rtol=1.3e-06)
Пример #22
0
    def assert_24bit_wav(self, sample_rate, num_channels, normalize, duration):
        """Check that `sox_io_backend.load` reads 24-bit signed PCM wav files.

        torch has no ``int24`` dtype, so the loaded tensor is implicitly cast
        to ``int32``. For the same reason #get_wav_data cannot produce 24-bit
        data and #assert_wav cannot be used; this workaround is used instead:

         x
         |
         |    1. Generate 24-bit wav with Sox.
         |
         v    2. Convert 24-bit wav to 32-bit wav with Sox.
      wav(24-bit) ----------------------> wav(32-bit)
         |                                   |
         | 3. Load 24-bit wav with torchaudio| 4. Load 32-bit wav with scipy
         |                                   |
         v                                   v
        tensor ----------> x <----------- tensor
                       5. Compare

        Assumptions:
        i. Sox converts 24-bit to 32-bit properly.
        ii. Loading the 32-bit wav file with scipy is correct.
        """
        wav24_path = self.get_temp_path('1.original.wav')
        wav32_path = self.get_temp_path('2.reference.wav')

        # Step 1: produce the 24-bit signed wav with sox.
        sox_utils.gen_audio_file(
            wav24_path,
            sample_rate,
            num_channels,
            bit_depth=24,
            duration=duration,
        )
        # Step 2: derive the 32-bit wav reference with sox.
        sox_utils.convert_audio_file(wav24_path, wav32_path, bit_depth=32)
        # Step 3: load the 24-bit wav with torchaudio.
        found, found_rate = sox_io_backend.load(wav24_path, normalize=normalize)
        # Step 4: read the 32-bit reference with the same normalization setting.
        reference = load_wav(wav32_path, normalize=normalize)[0]
        # Step 5: both must agree within tolerance.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=3e-03, rtol=1.3e-06)
Пример #23
0
def write_tar_file(data_list,
                   no_segments,
                   tar_file,
                   resample=16000,
                   index=0,
                   total=1):
    """Write the transcript and audio of every item into one tar archive.

    For each item a ``<key>.txt`` member (UTF-8 transcript) and a
    ``<key>.<suffix>`` member (audio bytes) are added to ``tar_file``.

    Args:
        data_list: Iterable of items. Each item is ``(key, txt, wav)`` when
            ``no_segments`` is true, otherwise ``(key, txt, wav, start, end)``.
        no_segments: When true, the audio file is copied byte-for-byte.
            Otherwise the region between ``start`` and ``end`` (both are
            multiplied by the sample rate to obtain frame indices) of the
            first channel is cut out, resampled if needed and stored as
            16-bit wav.
        tar_file: Path of the archive to create.
        resample: Sample rate used for the stored segments.
        index: Position of this archive, used only in the log message.
        total: Number of archives, used only in the log message.
    """
    logging.info('Processing {} {}/{}'.format(tar_file, index, total))
    read_time = save_time = write_time = 0.0
    with tarfile.open(tar_file, "w") as tar:
        last_wav = None
        for entry in data_list:
            if no_segments:
                key, txt, wav = entry
            else:
                key, txt, wav, start, end = entry

            suffix = wav.rsplit('.', 1)[-1]
            assert suffix in AUDIO_FORMAT_SETS
            if no_segments:
                # Whole file: take the raw bytes as they are.
                tic = time.time()
                with open(wav, 'rb') as source:
                    payload = source.read()
                read_time += (time.time() - tic)
            else:
                # Consecutive segments of the same file reuse the loaded data.
                if wav != last_wav:
                    tic = time.time()
                    waveforms, sample_rate = sox.load(wav, normalize=False)
                    read_time += (time.time() - tic)
                    last_wav = wav
                first_frame = int(start * sample_rate)
                last_frame = int(end * sample_rate)
                audio = waveforms[:1, first_frame:last_frame]

                if sample_rate != resample:
                    resampler = torchaudio.transforms.Resample(
                        sample_rate, resample)
                    if audio.is_floating_point():
                        audio = resampler(audio)
                    else:
                        # Integer audio cannot be resampled directly: scale it
                        # down by 2**15 first and back up again afterwards.
                        audio = audio / (1 << 15)
                        audio = resampler(audio)
                        audio = (audio * (1 << 15)).short()

                tic = time.time()
                buffer = io.BytesIO()
                sox.save(buffer, audio, resample, format="wav",
                         bits_per_sample=16)
                # Segments are always stored as wav.
                suffix = "wav"
                buffer.seek(0)
                payload = buffer.read()
                save_time += (time.time() - tic)

            assert isinstance(txt, str)
            tic = time.time()

            # Transcript member.
            encoded_txt = txt.encode('utf8')
            txt_member = tarfile.TarInfo(key + '.txt')
            txt_member.size = len(encoded_txt)
            tar.addfile(txt_member, io.BytesIO(encoded_txt))

            # Audio member.
            audio_member = tarfile.TarInfo(key + '.' + suffix)
            audio_member.size = len(payload)
            tar.addfile(audio_member, io.BytesIO(payload))

            write_time += (time.time() - tic)
        logging.info('read {} save {} write {}'.format(read_time, save_time,
                                                       write_time))
Пример #24
0
    def assert_format(
        self,
        format: str,
        sample_rate: float,
        num_channels: int,
        compression: float = None,
        bit_depth: int = None,
        duration: float = 1,
        normalize: bool = True,
        encoding: str = None,
        atol: float = 4e-05,
        rtol: float = 1.3e-06,
    ):
        """Check that `sox_io_backend.load` decodes the given format.

        File encodings introduce delay and boundary effects, so the reference
        is a wav file created from the file in the original format:

         x
         |
         |    1. Generate given format with Sox
         |
         v    2. Convert to wav with Sox
        given format ----------------------> wav
         |                                   |
         |    3. Load with torchaudio        | 4. Load with scipy
         |                                   |
         v                                   v
        tensor ----------> x <----------- tensor
                       5. Compare

        Assumptions:
        i. Converting the given format to wav with Sox preserves the data.
        ii. Loading the wav file with scipy is correct.

        Together, steps 2 and 4 provide reference data for the given format
        without relying on torchaudio.
        """
        source_path = self.get_temp_path(f'1.original.{format}')
        wav_path = self.get_temp_path('2.reference.wav')

        # Step 1: produce the input file with sox.
        sox_utils.gen_audio_file(
            source_path,
            sample_rate,
            num_channels,
            encoding=encoding,
            compression=compression,
            bit_depth=bit_depth,
            duration=duration,
        )

        # Step 2: derive the wav reference with sox. A 24-bit source is
        # converted to a 32-bit wav; otherwise no bit depth is requested.
        if bit_depth == 24:
            reference_bit_depth = 32
        else:
            reference_bit_depth = None
        sox_utils.convert_audio_file(
            source_path, wav_path, bit_depth=reference_bit_depth)

        # Step 3: decode the input file with torchaudio.
        found, found_rate = sox_io_backend.load(source_path, normalize=normalize)
        # Step 4: read the wav reference with the same normalization setting.
        reference = load_wav(wav_path, normalize=normalize)[0]
        # Step 5: both must agree within the given tolerances.
        assert found_rate == sample_rate
        self.assertEqual(found, reference, atol=atol, rtol=rtol)
Пример #25
0
 def test_format(self, format_):
     """Providing format allows to read file without extension.

     ``_make_file`` leaves an extension-less copy of the audio at
     ``self.path`` and the data loaded from the original file in
     ``self.original``. The format is passed explicitly to ``load`` so the
     test exercises the behavior it documents instead of relying on
     whatever detection happens when no format is given.
     """
     self._make_file(format_)
     found, _ = sox_io_backend.load(self.path, format=format_)
     self.assertEqual(found, self.original)
Пример #26
0
def py_load_func(filepath: str, normalize: bool, channels_first: bool):
    """Load `filepath` via `sox_io_backend.load` with the given options.

    Returns whatever `sox_io_backend.load` returns.
    """
    load_options = {
        'normalize': normalize,
        'channels_first': channels_first,
    }
    return sox_io_backend.load(filepath, **load_options)
Пример #27
0
 def test_frame(self, frame_offset, num_frames):
     """`frame_offset` and `num_frames` select the expected region of data."""
     loaded, _ = sox_io_backend.load(self.path, frame_offset, num_frames)
     # num_frames == -1 means "until the end", i.e. an open-ended slice.
     if num_frames == -1:
         stop = None
     else:
         stop = frame_offset + num_frames
     self.assertEqual(loaded, self.original[:, frame_offset:stop])