def test_save_flac(self, sample_rate, num_channels, compression_level):
    """The scripted save function and the Python save function both round-trip FLAC.

    The same float32 waveform is saved once with ``py_save_func`` and once
    with its scripted counterpart. Each FLAC file is converted to a 32-bit
    wav with sox, loaded back, and compared against the original waveform.
    """
    scripted_save = torch_script(py_save_func)

    waveform = get_wav_data('float32', num_channels)
    flac_py = self.get_temp_path(
        f'test_save_py_{sample_rate}_{num_channels}_{compression_level}.flac'
    )
    flac_ts = self.get_temp_path(
        f'test_save_ts_{sample_rate}_{num_channels}_{compression_level}.flac'
    )

    # Everything after the destination path is identical for both variants.
    shared_args = (waveform, sample_rate, True, compression_level, None, None)
    py_save_func(flac_py, *shared_args)
    scripted_save(flac_ts, *shared_args)

    # The flac files have 24-bit depth, which scipy cannot handle,
    # so they are converted to 32-bit wav before loading.
    wav_py = f'{flac_py}.wav'
    wav_ts = f'{flac_ts}.wav'
    sox_utils.convert_audio_file(flac_py, wav_py, bit_depth=32)
    sox_utils.convert_audio_file(flac_ts, wav_ts, bit_depth=32)

    loaded_py, rate_py = load_wav(wav_py, normalize=True)
    loaded_ts, rate_ts = load_wav(wav_ts, normalize=True)

    self.assertEqual(sample_rate, rate_py)
    self.assertEqual(sample_rate, rate_ts)
    self.assertEqual(waveform, loaded_py)
    self.assertEqual(waveform, loaded_ts)
def assert_amr_nb(self, duration):
    """Check that `sox_io_backend.load` reads the amr-nb format.

    Follows the same strategy as the mp3 test: the amr-nb file is converted
    to wav with sox, and that wav serves as the reference for the tensor
    loaded by torchaudio.
    """
    sample_rate, num_channels = 8000, 1
    amr_path = self.get_temp_path('1.original.amr-nb')
    wav_path = self.get_temp_path('2.reference.wav')

    # Step 1: produce the amr-nb file with sox.
    sox_utils.gen_audio_file(
        amr_path, sample_rate, num_channels,
        bit_depth=32, duration=duration)
    # Step 2: derive the reference wav from it with sox.
    sox_utils.convert_audio_file(amr_path, wav_path)
    # Step 3: load the amr-nb file with torchaudio.
    loaded, loaded_rate = sox_io_backend.load(amr_path)
    # Step 4: load the reference wav with scipy.
    reference, _ = load_wav(wav_path)
    # Step 5: compare sample rate and data.
    assert loaded_rate == sample_rate
    self.assertEqual(loaded, reference, atol=4e-05, rtol=1.3e-06)
def assert_vorbis(self, sample_rate, num_channels, quality_level, duration):
    """Check that `sox_io_backend.load` reads the vorbis format.

    Follows the same strategy as the mp3 test: the vorbis file is converted
    to wav with sox, and that wav serves as the reference for the tensor
    loaded by torchaudio.
    """
    vorbis_path = self.get_temp_path('1.original.vorbis')
    wav_path = self.get_temp_path('2.reference.wav')

    # Step 1: produce the vorbis file with sox.
    sox_utils.gen_audio_file(
        vorbis_path, sample_rate, num_channels,
        compression=quality_level, bit_depth=16, duration=duration)
    # Step 2: derive the reference wav from it with sox.
    sox_utils.convert_audio_file(vorbis_path, wav_path)
    # Step 3: load the vorbis file with torchaudio.
    loaded, loaded_rate = sox_io_backend.load(vorbis_path)
    # Step 4: load the reference wav with scipy.
    reference, _ = load_wav(wav_path)
    # Step 5: compare sample rate and data.
    assert loaded_rate == sample_rate
    self.assertEqual(loaded, reference, atol=4e-05, rtol=1.3e-06)
def test_opus(self, bitrate, num_channels, compression_level):
    """Check that `sox_io_backend.load` reads an opus asset correctly.

    The opus asset is converted to wav with sox; the wav is loaded as the
    reference and compared with what torchaudio loads from the opus file.
    """
    asset_name = f'{bitrate}_{compression_level}_{num_channels}ch.opus'
    opus_file = get_asset_path('io', asset_name)
    reference_wav = self.get_temp_path(f'{bitrate}_{compression_level}_{num_channels}ch.opus.wav')

    sox_utils.convert_audio_file(opus_file, reference_wav)

    reference, reference_rate = load_wav(reference_wav)
    loaded, loaded_rate = sox_io_backend.load(opus_file)

    assert reference_rate == loaded_rate
    self.assertEqual(reference, loaded)
def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
    """Check that `sox_io_backend.load` reads the mp3 format.

    mp3 encoding introduces delay and boundary effects, so the reference
    data is a wav file created from the mp3 itself:

      1. Generate an mp3 file with sox.
      2. Convert that mp3 to wav with sox.
      3. Load the mp3 with torchaudio.
      4. Load the wav with scipy.
      5. Compare the two tensors.

    Underlying assumptions:

      i.  Converting mp3 to wav with sox preserves the data.
      ii. Loading a wav file with scipy is correct.

    Combining i and ii, steps 2 and 4 give reference mp3 data without
    using torchaudio.
    """
    mp3_path = self.get_temp_path('1.original.mp3')
    wav_path = self.get_temp_path('2.reference.wav')

    # Step 1: produce the mp3 file with sox.
    sox_utils.gen_audio_file(
        mp3_path, sample_rate, num_channels,
        compression=bit_rate, duration=duration)
    # Step 2: derive the reference wav from it with sox.
    sox_utils.convert_audio_file(mp3_path, wav_path)
    # Step 3: load the mp3 file with torchaudio.
    loaded, loaded_rate = sox_io_backend.load(mp3_path)
    # Step 4: load the reference wav with scipy.
    reference, _ = load_wav(wav_path)
    # Step 5: compare sample rate and data.
    assert loaded_rate == sample_rate
    self.assertEqual(loaded, reference, atol=3e-03, rtol=1.3e-06)
def assert_24bit_wav(self, sample_rate, num_channels, normalize, duration):
    """Check that `sox_io_backend.load` reads 24-bit signed PCM wav.

    torch does not support an ``int24`` dtype, so the loaded tensor is
    implicitly cast to ``int32``. For the same reason #get_wav_data cannot
    produce 'int24' data and #assert_wav cannot be used here. The
    workaround is:

      1. Generate a 24-bit wav with sox.
      2. Convert the 24-bit wav to a 32-bit wav with sox.
      3. Load the 24-bit wav with torchaudio.
      4. Load the 32-bit wav with scipy.
      5. Compare the two tensors.

    Underlying assumptions:

      i.  Sox properly converts from 24-bit to 32-bit.
      ii. Loading a 32-bit wav file with scipy is correct.
    """
    wav24_path = self.get_temp_path('1.original.wav')
    wav32_path = self.get_temp_path('2.reference.wav')

    # Step 1: produce the 24-bit signed wav with sox.
    sox_utils.gen_audio_file(
        wav24_path, sample_rate, num_channels,
        bit_depth=24, duration=duration)
    # Step 2: widen it to a 32-bit wav with sox.
    sox_utils.convert_audio_file(wav24_path, wav32_path, bit_depth=32)
    # Step 3: load the 24-bit wav with torchaudio.
    loaded, loaded_rate = sox_io_backend.load(wav24_path, normalize=normalize)
    # Step 4: load the 32-bit wav with scipy.
    reference, _ = load_wav(wav32_path, normalize=normalize)
    # Step 5: compare sample rate and data.
    assert loaded_rate == sample_rate
    self.assertEqual(loaded, reference, atol=3e-03, rtol=1.3e-06)
def assert_mp3(self, sample_rate, num_channels, bit_rate, duration):
    """Check that `sox_io_backend.save` writes the mp3 format.

    mp3 encoding introduces delay and boundary effects, so the mp3 files
    are converted back to wav and compared there:

      1.   Generate the original wav file with scipy.
      2.1. Load it with scipy, then save it as mp3 with torchaudio.
      2.2. Convert that mp3 to wav with sox.
      2.3. Load the wav with scipy ("found").
      3.1. Convert the original wav to mp3 with sox.
      3.2. Convert that mp3 to wav with sox.
      3.3. Load the wav with scipy ("expected").

    Finally "found" and "expected" are compared.
    """
    original_wav = self.get_temp_path('1.reference.wav')
    ta_mp3 = self.get_temp_path('2.1.torchaudio.mp3')
    ta_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_mp3 = self.get_temp_path('3.1.sox.mp3')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # 1. Write the original wav.
    waveform = get_wav_data(
        'float32', num_channels, normalize=True, num_frames=duration * sample_rate)
    save_wav(original_wav, waveform, sample_rate)

    # 2.1. torchaudio: original wav -> mp3.
    source, _ = load_wav(original_wav)
    sox_io_backend.save(ta_mp3, source, sample_rate, compression=bit_rate)
    # 2.2. sox: mp3 -> wav.
    sox_utils.convert_audio_file(ta_mp3, ta_wav)
    # 2.3. Load the torchaudio branch result.
    found, _ = load_wav(ta_wav)

    # 3.1. sox: original wav -> mp3.
    sox_utils.convert_audio_file(original_wav, sox_mp3, compression=bit_rate)
    # 3.2. sox: mp3 -> wav.
    sox_utils.convert_audio_file(sox_mp3, sox_wav)
    # 3.3. Load the sox branch result.
    expected, _ = load_wav(sox_wav)

    self.assertEqual(found, expected)
def assert_amb(self, dtype, sample_rate, num_channels, duration):
    """Check that `sox_io_backend.save` writes the amb format.

    Follows the same strategy as the mp3 save test: the original wav is
    encoded to amb once with torchaudio and once with sox, both results
    are converted back to wav with sox, and the loaded tensors are compared.
    """
    original_wav = self.get_temp_path('1.reference.wav')
    ta_amb = self.get_temp_path('2.1.torchaudio.amb')
    ta_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_amb = self.get_temp_path('3.1.sox.amb')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # 1. Write the original wav (un-normalized, in the requested dtype).
    waveform = get_wav_data(
        dtype, num_channels, normalize=False, num_frames=duration * sample_rate)
    save_wav(original_wav, waveform, sample_rate)

    # 2.1. torchaudio: original wav -> amb.
    source, _ = load_wav(original_wav, normalize=False)
    sox_io_backend.save(ta_amb, source, sample_rate)
    # 2.2. sox: amb -> wav.
    sox_utils.convert_audio_file(ta_amb, ta_wav)
    # 2.3. Load the torchaudio branch result.
    found, _ = load_wav(ta_wav)

    # 3.1. sox: original wav -> amb.
    sox_utils.convert_audio_file(original_wav, sox_amb)
    # 3.2. sox: amb -> wav.
    sox_utils.convert_audio_file(sox_amb, sox_wav)
    # 3.3. Load the sox branch result.
    expected, _ = load_wav(sox_wav)

    self.assertEqual(found, expected)
def assert_sphere(self, sample_rate, num_channels, duration):
    """Check that `sox_io_backend.save` writes the sph format.

    Follows the same strategy as the mp3 save test: the original wav is
    encoded to sph once with torchaudio and once with sox, both results
    are converted back to wav with sox, and the loaded tensors are compared.
    """
    original_wav = self.get_temp_path('1.reference.wav')
    ta_sph = self.get_temp_path('2.1.torchaudio.sph')
    ta_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_sph = self.get_temp_path('3.1.sox.sph')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # 1. Write the original wav.
    waveform = get_wav_data(
        'float32', num_channels, normalize=True, num_frames=duration * sample_rate)
    save_wav(original_wav, waveform, sample_rate)

    # 2.1. torchaudio: original wav -> sph.
    source, _ = load_wav(original_wav)
    sox_io_backend.save(ta_sph, source, sample_rate)
    # 2.2. sox: sph -> wav. The sph file has 24-bit depth, which scipy
    #      cannot handle, so the wav is written with 32-bit depth.
    sox_utils.convert_audio_file(ta_sph, ta_wav, bit_depth=32)
    # 2.3. Load the torchaudio branch result.
    found, _ = load_wav(ta_wav)

    # 3.1. sox: original wav -> sph.
    sox_utils.convert_audio_file(original_wav, sox_sph)
    # 3.2. sox: sph -> wav, again widened to 32-bit for scipy.
    sox_utils.convert_audio_file(sox_sph, sox_wav, bit_depth=32)
    # 3.3. Load the sox branch result.
    expected, _ = load_wav(sox_wav)

    self.assertEqual(found, expected)
def _assert_vorbis(self, sample_rate, num_channels, quality_level, duration):
    """Check that `sox_io_backend.save` writes the vorbis format.

    Follows the same strategy as the mp3 save test: the original wav is
    encoded to vorbis once with torchaudio and once with sox, both results
    are converted back to wav with sox, and the loaded tensors are compared.
    Unlike the other formats, a small share of samples is allowed to exceed
    the absolute tolerance (see the comment before the final check).
    """
    original_wav = self.get_temp_path('1.reference.wav')
    ta_vorbis = self.get_temp_path('2.1.torchaudio.vorbis')
    ta_wav = self.get_temp_path('2.2.torchaudio.wav')
    sox_vorbis = self.get_temp_path('3.1.sox.vorbis')
    sox_wav = self.get_temp_path('3.2.sox.wav')

    # 1. Write the original wav (un-normalized int16).
    waveform = get_wav_data(
        'int16', num_channels, normalize=False, num_frames=duration * sample_rate)
    save_wav(original_wav, waveform, sample_rate)

    # 2.1. torchaudio: original wav -> vorbis.
    source, _ = load_wav(original_wav)
    sox_io_backend.save(
        ta_vorbis, source, sample_rate, compression=quality_level, dtype=None)
    # 2.2. sox: vorbis -> wav.
    sox_utils.convert_audio_file(ta_vorbis, ta_wav)
    # 2.3. Load the torchaudio branch result.
    found, _ = load_wav(ta_wav)

    # 3.1. sox: original wav -> vorbis.
    sox_utils.convert_audio_file(original_wav, sox_vorbis, compression=quality_level)
    # 3.2. sox: vorbis -> wav.
    sox_utils.convert_audio_file(sox_vorbis, sox_wav)
    # 3.3. Load the sox branch result.
    expected, _ = load_wav(sox_wav)

    # sox's vorbis encoding has some random boundary effect, which causes a
    # small number of samples to show a higher discrepancy than the others.
    # A small portion of the data is therefore allowed to fall outside the
    # absolute tolerance. Make sure to pass a somewhat long duration.
    atol = 1.0e-4
    max_failure_allowed = 0.01  # fraction of samples allowed to be outside of atol
    out_of_tolerance = (found - expected).abs() > atol
    num_failures = out_of_tolerance.sum().item()
    failure_ratio = num_failures / found.numel()
    if failure_ratio > max_failure_allowed:
        # Already known to fail; calling assertEqual gives a better error message.
        self.assertEqual(found, expected, atol=atol, rtol=1.3e-6)
def assert_format(
        self,
        format: str,
        sample_rate: float,
        num_channels: int,
        compression: float = None,
        bit_depth: int = None,
        duration: float = 1,
        normalize: bool = True,
        encoding: str = None,
        atol: float = 4e-05,
        rtol: float = 1.3e-06,
):
    """Check that `sox_io_backend.load` reads the given format correctly.

    File encodings introduce delay and boundary effects, so the reference
    data is a wav file created from the file in the original format:

      1. Generate a file in the given format with sox.
      2. Convert that file to wav with sox.
      3. Load the original file with torchaudio.
      4. Load the wav with scipy.
      5. Compare the two tensors.

    Underlying assumptions:

      i.  Converting the given format to wav with sox preserves the data.
      ii. Loading a wav file with scipy is correct.

    Combining i and ii, steps 2 and 4 give reference data for the given
    format without using torchaudio.
    """
    original_path = self.get_temp_path(f'1.original.{format}')
    reference_path = self.get_temp_path('2.reference.wav')

    # Step 1: produce the file in the given format with sox.
    sox_utils.gen_audio_file(
        original_path, sample_rate, num_channels,
        encoding=encoding,
        compression=compression,
        bit_depth=bit_depth,
        duration=duration,
    )

    # Step 2: derive the reference wav with sox. A 24-bit source is
    # written out as a 32-bit wav; otherwise no bit depth is passed.
    if bit_depth == 24:
        reference_bit_depth = 32
    else:
        reference_bit_depth = None
    sox_utils.convert_audio_file(
        original_path, reference_path, bit_depth=reference_bit_depth)

    # Step 3: load the original file with torchaudio.
    loaded, loaded_rate = sox_io_backend.load(original_path, normalize=normalize)
    # Step 4: load the reference wav with scipy.
    reference, _ = load_wav(reference_path, normalize=normalize)
    # Step 5: compare sample rate and data.
    assert loaded_rate == sample_rate
    self.assertEqual(loaded, reference, atol=atol, rtol=rtol)
def assert_save_consistency(
        self,
        format: str,
        *,
        compression: float = None,
        encoding: str = None,
        bits_per_sample: int = None,
        sample_rate: float = 8000,
        num_channels: int = 2,
        num_frames: float = 3 * 8000,
        src_dtype: str = 'int32',
        test_mode: str = "path",
):
    """Check that `save` produces a file comparable to the `sox` command's.

    To compare the file produced by `save` against the one produced by the
    equivalent `sox` command, both have to be loaded. Many formats cannot
    be opened with common Python modules (like SciPy), so `sox` is used to
    convert the saved files into a format SciPy can read (PCM wav).

    This assumes that

      - loading data with SciPy preserves the data well, and
      - converting the resulting files into WAV with `sox` preserves the
        data well.

    Procedure (the two branches differ only in steps 2.1 and 3.1):

      1.   Generate the source wav file with SciPy.
      2.1. Load it with SciPy, then save it in the target format with
           torchaudio (to a path, a file object or a BytesIO, depending
           on ``test_mode``).
      2.2. Convert the result to wav with sox.
      2.3. Load the wav with SciPy ("found").
      3.1. Convert the source wav to the target format with sox.
      3.2. Convert the result to wav with sox.
      3.3. Load the wav with SciPy ("expected").

    Finally "found" and "expected" are compared.

    Raises:
        ValueError: if ``test_mode`` is not "path", "fileobj" or "bytesio".
    """
    # Both branches are decoded to the same wav flavour before comparison.
    comparison_wav = dict(encoding='floating-point', bit_depth=32)

    src_path = self.get_temp_path('1.source.wav')
    tgt_path = self.get_temp_path(f'2.1.torchaudio.{format}')
    tst_path = self.get_temp_path('2.2.result.wav')
    sox_path = self.get_temp_path(f'3.1.sox.{format}')
    ref_path = self.get_temp_path('3.2.ref.wav')

    # 1. Generate the source wav.
    generated = get_wav_data(
        src_dtype, num_channels, normalize=False, num_frames=num_frames)
    save_wav(src_path, generated, sample_rate)

    # 2.1. Save the source wav in the target format with torchaudio.
    source, _ = load_wav(src_path, normalize=False)
    save_options = dict(
        compression=compression,
        encoding=encoding,
        bits_per_sample=bits_per_sample,
    )
    if test_mode == "path":
        sox_io_backend.save(tgt_path, source, sample_rate, **save_options)
    elif test_mode == "fileobj":
        with open(tgt_path, 'bw') as fileobj:
            sox_io_backend.save(
                fileobj, source, sample_rate, format=format, **save_options)
    elif test_mode == "bytesio":
        buffer = io.BytesIO()
        sox_io_backend.save(
            buffer, source, sample_rate, format=format, **save_options)
        # Persist the in-memory result so that sox can read it from disk.
        with open(tgt_path, 'bw') as fileobj:
            fileobj.write(buffer.getvalue())
    else:
        raise ValueError(f"Unexpected test mode: {test_mode}")

    # 2.2. Convert the torchaudio output to wav with sox.
    sox_utils.convert_audio_file(tgt_path, tst_path, **comparison_wav)
    # 2.3. Load with SciPy.
    found, _ = load_wav(tst_path, normalize=False)

    # 3.1. Convert the source wav to the target format with sox.
    sox_encoding = _get_sox_encoding(encoding)
    sox_utils.convert_audio_file(
        src_path, sox_path,
        compression=compression,
        encoding=sox_encoding,
        bit_depth=bits_per_sample)
    # 3.2. Convert the sox output to wav with sox.
    sox_utils.convert_audio_file(sox_path, ref_path, **comparison_wav)
    # 3.3. Load with SciPy.
    expected, _ = load_wav(ref_path, normalize=False)

    self.assertEqual(found, expected)