def test_batch_MelScale(self):
    """MelScale on a batched input must equal the per-item result tiled."""
    specgram = torch.randn(2, 31, 2786)

    # Transform first, then tile along a new leading batch dimension.
    single_then_tile = transforms.MelScale()(specgram).repeat(3, 1, 1, 1)

    # Tile first, then transform the whole batch in one call.
    tile_then_single = transforms.MelScale()(specgram.repeat(3, 1, 1, 1))

    self.assertTrue(
        tile_then_single.shape == single_then_tile.shape,
        (tile_then_single.shape, single_then_tile.shape))
    self.assertTrue(torch.allclose(tile_then_single, single_then_tile))
def test_melscale_load_save(self):
    """The `fb` buffer must survive a state_dict round trip into a fresh MelScale."""
    specgram = torch.ones(1, 1000, 100)

    # Running the transform once triggers lazy filterbank initialization.
    original = transforms.MelScale()
    original(specgram)

    restored = transforms.MelScale(n_stft=1000)
    restored.load_state_dict(original.state_dict())

    self.assertEqual(restored.fb.size(), (1000, 128))
    self.assertTrue(torch.allclose(original.fb, restored.fb))
def test_melscale_unset_weight_warning(self):
    """MelScale without `n_stft` must warn exactly once.

    Lazy initialization is being deprecated (#1510): omitting `n_stft`
    should emit one warning, while supplying it should stay silent.
    """
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        T.MelScale(n_mels=64, sample_rate=8000)
    assert len(caught) == 1

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        T.MelScale(n_mels=64, sample_rate=8000, n_stft=201)
    assert len(caught) == 0
def test_MelScale(self):
    """MelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    hop_length = n_fft // 4

    # Build one power spectrogram with torchaudio and feed it to both libraries.
    sound, sample_rate = self._get_sample_data('whitenoise_1min.mp3')
    spec_torch = F.spectrogram(
        sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
        hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
    spec_numpy = spec_torch.cpu().numpy().squeeze()

    # torchaudio mel scaling
    mel_torch = transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_torch)
    # librosa mel scaling, configured (htk=True, norm=None) to match torchaudio
    mel_librosa = librosa.feature.melspectrogram(
        S=spec_numpy, sr=sample_rate, n_fft=n_fft, hop_length=hop_length,
        win_length=n_fft, center=True, window='hann', n_mels=n_mels,
        htk=True, norm=None)

    # Note: Using relaxed rtol instead of atol
    assert torch.allclose(mel_torch, torch.from_numpy(mel_librosa[None, ...]), rtol=1e-3)
def preprocess(file_path='../DATASETS/LJSpeech-1.1/metadata.csv', root_dir='../DATASETS/LJSpeech-1.1'):
    """Precompute fixed-length mel spectrograms for the LJSpeech dataset.

    Reads the metadata CSV (pipe-separated), resamples each wav from
    22050 Hz to 8000 Hz, converts it to an 80-bin mel spectrogram, and
    saves two tensors next to the dataset:

    - ``mel_data.pt``: (num_items, 316, 80) zero-padded mel frames
    - ``mel_len.pt``:  (num_items,) number of valid frames per item

    Args:
        file_path: path to ``metadata.csv`` (``id|transcript|...`` lines).
        root_dir: dataset root containing the ``wavs/`` directory.
    """
    with open(file_path, encoding='utf8') as file:
        entries = [line.strip().split('|') for line in file]

    # Fixed defect: removed the no-op `root_dir = root_dir` self-assignment.
    sample_rate = 8000
    resample = transforms.Resample(orig_freq=22050, new_freq=sample_rate)
    spectrogram = transforms.Spectrogram(n_fft=1024, hop_length=256)
    to_mel = transforms.MelScale(n_mels=80, sample_rate=sample_rate, n_stft=1024 // 2 + 1)

    # 316 frames covers the longest clip after resampling — TODO confirm against data.
    mel_data = torch.zeros(len(entries), 316, 80)
    mel_len = torch.empty(len(entries), dtype=torch.int)
    for idx, entry in enumerate(tqdm(entries)):
        wav_id = entry[0]  # entry[1] (transcript) is unused here
        path = f'{root_dir}/wavs/{wav_id}.wav'
        # Discard the native rate so it cannot clobber the configured one.
        waveform, _ = torchaudio.load(path)
        waveform = resample(waveform)
        mel = to_mel(spectrogram(waveform))
        # (1, n_mels, time) -> (time, n_mels)
        mel = mel.transpose(1, 2).squeeze(0)
        mel_data[idx, :mel.size(0)] = mel
        mel_len[idx] = mel.size(0)

    torch.save(mel_data, f'{root_dir}/mel_data.pt')
    torch.save(mel_len, f'{root_dir}/mel_len.pt')
def test_melscale(self):
    """MelScale must be differentiable w.r.t. its spectrogram input."""
    sample_rate = 8000
    n_fft = 400
    num_mels = n_fft // 2 + 1
    mel_transform = T.MelScale(sample_rate=sample_rate, n_mels=num_mels)
    noise = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
    spec = get_spectrogram(noise, n_fft=n_fft, power=1)
    self.assert_grad(mel_transform, [spec])
def __init__(self, sample_rate, n_fft, win_length, hop_length, n_mels):
    """Build the two sub-transforms: a complex STFT and a mel filterbank."""
    super(PhaseFbankCal, self).__init__()
    # power=None keeps the complex-valued STFT, preserving phase information.
    self.complexSpec = transforms.Spectrogram(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=None,
    )
    self.mel_scale = transforms.MelScale(
        n_mels=n_mels,
        sample_rate=sample_rate,
        n_stft=n_fft // 2 + 1,
    )
def test_mel2(self):
    """Exercise MelSpectrogram defaults, custom options, multi-channel input,
    and direct MelScale filterbank creation."""
    top_db = 80.
    s2db = transforms.AmplitudeToDB('power', top_db)

    waveform = self.waveform.clone()  # (1, 16000)
    waveform_scaled = self.scale(waveform)  # (1, 16000)
    mel_transform = transforms.MelSpectrogram()
    # check defaults
    spectrogram_torch = s2db(mel_transform(waveform_scaled))  # (1, 128, 321)
    self.assertTrue(spectrogram_torch.dim() == 3)
    self.assertTrue(
        spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
    self.assertEqual(spectrogram_torch.size(1), mel_transform.n_mels)
    # check correctness of filterbank conversion matrix
    self.assertTrue(mel_transform.mel_scale.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform.mel_scale.fb.sum(1).ge(0.).all())
    # check options
    kwargs = {
        'window_fn': torch.hamming_window,
        'pad': 10,
        'win_length': 500,
        'hop_length': 125,
        'n_fft': 800,
        'n_mels': 50
    }
    mel_transform2 = transforms.MelSpectrogram(**kwargs)
    spectrogram2_torch = s2db(mel_transform2(waveform_scaled))  # (1, 50, 513)
    self.assertTrue(spectrogram2_torch.dim() == 3)
    # BUGFIX: assert on spectrogram2_torch — the original re-checked
    # spectrogram_torch here (copy-paste), leaving this output unverified.
    self.assertTrue(
        spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
    self.assertEqual(spectrogram2_torch.size(1), mel_transform2.n_mels)
    self.assertTrue(mel_transform2.mel_scale.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform2.mel_scale.fb.sum(1).ge(0.).all())
    # check on multi-channel audio
    filepath = common_utils.get_asset_path(
        'steam-train-whistle-daniel_simon.wav')
    x_stereo = common_utils.load_wav(filepath)[0]  # (2, 278756), 44100
    spectrogram_stereo = s2db(mel_transform(x_stereo))  # (2, 128, 1394)
    self.assertTrue(spectrogram_stereo.dim() == 3)
    self.assertTrue(spectrogram_stereo.size(0) == 2)
    # BUGFIX: assert on spectrogram_stereo — same copy-paste defect as above.
    self.assertTrue(
        spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
    self.assertEqual(spectrogram_stereo.size(1), mel_transform.n_mels)
    # check filterbank matrix creation; fb has shape (n_stft, n_mels)
    fb_matrix_transform = transforms.MelScale(
        n_mels=100, sample_rate=16000, f_min=0., f_max=None, n_stft=400)
    self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
    self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
    self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_InverseMelScale(self):
    """InverseMelScale transform is comparable to that of librosa"""
    n_fft = 2048
    n_mels = 256
    n_stft = n_fft // 2 + 1
    hop_length = n_fft // 4

    # Prepare mel spectrogram input. We use torchaudio to compute one.
    sound, sample_rate = self._get_sample_data(
        'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
    spec_orig = F.spectrogram(
        sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
        hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
    melspec_ta = transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
    melspec_lr = melspec_ta.cpu().numpy().squeeze()
    # Perform InverseMelScale with torchaudio and librosa
    spec_ta = transforms.InverseMelScale(
        n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
    spec_lr = librosa.feature.inverse.mel_to_stft(
        melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
    spec_lr = torch.from_numpy(spec_lr[None, ...])

    # Align dimensions
    # librosa does not return power spectrogram while torchaudio returns power spectrogram;
    # take sqrt so all three spectrograms are compared in magnitude scale.
    spec_orig = spec_orig.sqrt()
    spec_ta = spec_ta.sqrt()

    threshold = 2.0
    # This threshold was chosen empirically, based on the following observation
    #
    # torch.dist(spec_lr, spec_ta, p=float('inf'))
    # >>> tensor(1.9666)
    #
    # The spectrograms reconstructed by librosa and torchaudio are not very comparable elementwise.
    # This is because they use different approximation algorithms and resulting values can live
    # in different magnitude. (although most of them are very close)
    # See https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
    # See https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
    # distance over frequencies.
    assert torch.allclose(spec_ta, spec_lr, atol=threshold)

    threshold = 1700.0
    # This threshold was chosen empirically, based on the following observations
    #
    # torch.dist(spec_orig, spec_ta, p=1)
    # >>> tensor(1644.3516)
    # torch.dist(spec_orig, spec_lr, p=1)
    # >>> tensor(1420.7103)
    # torch.dist(spec_lr, spec_ta, p=1)
    # >>> tensor(943.2759)
    assert torch.dist(spec_orig, spec_ta, p=1) < threshold
def test_mel2(self):
    """Exercise MelSpectrogram (legacy API) defaults, custom options,
    multi-channel input, and direct MelScale filterbank creation."""
    top_db = 80.
    s2db = transforms.SpectrogramToDB("power", top_db)

    audio_orig = self.sig.clone()  # (16000, 1)
    audio_scaled = transforms.Scale()(audio_orig)  # (16000, 1)
    audio_scaled = transforms.LC2CL()(audio_scaled)  # (1, 16000)
    mel_transform = transforms.MelSpectrogram()
    # check defaults
    spectrogram_torch = s2db(mel_transform(audio_scaled))  # (1, 319, 40)
    self.assertTrue(spectrogram_torch.dim() == 3)
    self.assertTrue(
        spectrogram_torch.ge(spectrogram_torch.max() - top_db).all())
    self.assertEqual(spectrogram_torch.size(-1), mel_transform.n_mels)
    # check correctness of filterbank conversion matrix
    self.assertTrue(mel_transform.fm.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform.fm.fb.sum(1).ge(0.).all())
    # check options
    kwargs = {
        "window": torch.hamming_window,
        "pad": 10,
        "ws": 500,
        "hop": 125,
        "n_fft": 800,
        "n_mels": 50
    }
    mel_transform2 = transforms.MelSpectrogram(**kwargs)
    spectrogram2_torch = s2db(mel_transform2(audio_scaled))  # (1, 506, 50)
    self.assertTrue(spectrogram2_torch.dim() == 3)
    # BUGFIX: assert on spectrogram2_torch — the original re-checked
    # spectrogram_torch here (copy-paste), leaving this output unverified.
    self.assertTrue(
        spectrogram2_torch.ge(spectrogram2_torch.max() - top_db).all())
    self.assertEqual(spectrogram2_torch.size(-1), mel_transform2.n_mels)
    self.assertTrue(mel_transform2.fm.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform2.fm.fb.sum(1).ge(0.).all())
    # check on multi-channel audio
    x_stereo, sr_stereo = torchaudio.load(self.test_filepath)
    spectrogram_stereo = s2db(mel_transform(x_stereo))
    self.assertTrue(spectrogram_stereo.dim() == 3)
    self.assertTrue(spectrogram_stereo.size(0) == 2)
    # BUGFIX: assert on spectrogram_stereo — same copy-paste defect as above.
    self.assertTrue(
        spectrogram_stereo.ge(spectrogram_stereo.max() - top_db).all())
    self.assertEqual(spectrogram_stereo.size(-1), mel_transform.n_mels)
    # check filterbank matrix creation; fb has shape (n_stft, n_mels)
    fb_matrix_transform = transforms.MelScale(
        n_mels=100, sr=16000, f_max=None, f_min=0., n_stft=400)
    self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
    self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
    self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def test_InverseMelScale(self):
    """Gauge the quality of InverseMelScale transform.

    As InverseMelScale is currently implemented with
    random initialization + iterative optimization,
    it is not practically possible to assert the difference between
    the estimated spectrogram and the original spectrogram as a whole.
    Estimated spectrogram has very huge discrepancy locally.
    Thus in this test we gauge what percentage of elements are below
    certain tolerance.
    At the moment, the quality of estimated spectrogram is not good.
    When implementation is changed in a way it makes the quality even worse,
    this test will fail.
    """
    n_fft = 400
    power = 1
    n_mels = 64
    sample_rate = 8000

    n_stft = n_fft // 2 + 1

    # Generate reference spectrogram and input mel-scaled spectrogram
    expected = get_spectrogram(
        get_whitenoise(sample_rate=sample_rate, duration=1, n_channels=2),
        n_fft=n_fft, power=power).to(self.device, self.dtype)
    input = T.MelScale(n_mels=n_mels, sample_rate=sample_rate).to(self.device, self.dtype)(expected)

    # Run transform
    transform = T.InverseMelScale(n_stft, n_mels=n_mels, sample_rate=sample_rate).to(
        self.device, self.dtype)
    # Fixed seed: the transform starts from random initialization.
    torch.random.manual_seed(0)
    result = transform(input)

    # Compare
    epsilon = 1e-60  # avoids division by zero where expected == 0
    relative_diff = torch.abs((result - expected) / (expected + epsilon))

    for tol in [1e-1, 1e-3, 1e-5, 1e-10]:
        print(f"Ratio of relative diff smaller than {tol:e} is "
              f"{_get_ratio(relative_diff < tol)}")
    assert _get_ratio(relative_diff < 1e-1) > 0.2
    assert _get_ratio(relative_diff < 1e-3) > 5e-3
    assert _get_ratio(relative_diff < 1e-5) > 1e-5
def __init__(self, hparams: Hyperparams):
    """Store hyperparameters and build spectrogram + mel-scale transforms."""
    super().__init__()
    self.hparams = hparams
    self.spectrogram = Spectrogram(
        self.hparams.n_fft,
        self.hparams.win_length,
        self.hparams.hop_length,
    )
    # n_stft must match the spectrogram's frequency-bin count.
    self.mel_scale = transforms.MelScale(
        self.hparams.n_mels,
        self.hparams.sr,
        n_stft=self.hparams.n_fft // 2 + 1,
    )
def test_mel2(self):
    """Exercise MelSpectrogram (legacy dB-output API) defaults, custom
    options, multi-channel input, and direct MelScale filterbank creation."""
    audio_orig = self.sig.clone()  # (16000, 1)
    audio_scaled = transforms.Scale()(audio_orig)  # (16000, 1)
    audio_scaled = transforms.LC2CL()(audio_scaled)  # (1, 16000)
    mel_transform = transforms.MelSpectrogram()
    # check defaults: dB-scaled output is non-positive and bounded by top_db
    spectrogram_torch = mel_transform(audio_scaled)  # (1, 319, 40)
    self.assertTrue(spectrogram_torch.dim() == 3)
    self.assertTrue(spectrogram_torch.le(0.).all())
    self.assertTrue(spectrogram_torch.ge(mel_transform.top_db).all())
    self.assertEqual(spectrogram_torch.size(-1), mel_transform.n_mels)
    # check correctness of filterbank conversion matrix
    self.assertTrue(mel_transform.fm.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform.fm.fb.sum(1).ge(0.).all())
    # check options
    mel_transform2 = transforms.MelSpectrogram(
        window=torch.hamming_window, pad=10, ws=500, hop=125,
        n_fft=800, n_mels=50)
    spectrogram2_torch = mel_transform2(audio_scaled)  # (1, 506, 50)
    self.assertTrue(spectrogram2_torch.dim() == 3)
    self.assertTrue(spectrogram2_torch.le(0.).all())
    # BUGFIX: compare against mel_transform2.top_db — the original used
    # mel_transform.top_db (copy-paste), checking the wrong transform's bound.
    self.assertTrue(spectrogram2_torch.ge(mel_transform2.top_db).all())
    self.assertEqual(spectrogram2_torch.size(-1), mel_transform2.n_mels)
    self.assertTrue(mel_transform2.fm.fb.sum(1).le(1.).all())
    self.assertTrue(mel_transform2.fm.fb.sum(1).ge(0.).all())
    # check on multi-channel audio
    x_stereo, sr_stereo = torchaudio.load(self.test_filepath)
    spectrogram_stereo = mel_transform(x_stereo)
    self.assertTrue(spectrogram_stereo.dim() == 3)
    self.assertTrue(spectrogram_stereo.size(0) == 2)
    self.assertTrue(spectrogram_stereo.le(0.).all())
    self.assertTrue(spectrogram_stereo.ge(mel_transform.top_db).all())
    self.assertEqual(spectrogram_stereo.size(-1), mel_transform.n_mels)
    # check filterbank matrix creation; fb has shape (n_stft, n_mels)
    fb_matrix_transform = transforms.MelScale(
        n_mels=100, sr=16000, f_max=None, f_min=0., n_stft=400)
    self.assertTrue(fb_matrix_transform.fb.sum(1).le(1.).all())
    self.assertTrue(fb_matrix_transform.fb.sum(1).ge(0.).all())
    self.assertEqual(fb_matrix_transform.fb.size(), (400, 100))
def tfm_spectro(ad, sr, to_db_scale=False):
    """Return a mel-scaled spectrogram of the audio tensor `ad`.

    Args:
        ad: audio tensor accepted by `transforms.Spectrogram`
            (presumably (channel, time) — TODO confirm with callers).
        sr: sample rate; unused, kept for caller compatibility.
        to_db_scale: if True, convert the mel spectrogram to dB
            (20 * log10). Was a hard-coded local constant (False);
            parameterized backward-compatibly with the same default.

    Returns:
        The mel spectrogram tensor (dB-scaled when requested).
    """
    # Removed: dead commented-out code and locals that were never read
    # (ws, hop, n_fft, f_min, f_max, pad, n_mels).
    sp = transforms.Spectrogram()(ad)
    mel = transforms.MelScale()(sp)
    if to_db_scale:
        mel = 20 * torch.log10(mel)
    return mel
def test_MelScale(self):
    """MelScale must produce consistent results under TorchScript."""
    dummy_spec = torch.rand((1, 6, 201))
    self._assert_consistency(T.MelScale(), dummy_spec)
def test_MelScale_invalid(self):
    """Scripting a MelScale built without `n_stft` must raise ValueError."""
    with self.assertRaises(ValueError):
        torch.jit.script(T.MelScale())