Example #1
    def test_griffinlim(self):

        # NOTE: This test is flaky without a fixed random seed
        # See https://github.com/pytorch/audio/issues/382
        torch.random.manual_seed(42)
        tensor = torch.rand((1, 1000))

        n_fft = 400
        ws = 400
        hop = 100
        window = torch.hann_window(ws)
        normalize = False
        momentum = 0.99
        n_iter = 8
        length = 1000
        rand_init = False
        init = 'random' if rand_init else None

        specgram = F.spectrogram(tensor, 0, window, n_fft, hop, ws, 2,
                                 normalize).sqrt()
        ta_out = F.griffinlim(specgram, window, n_fft, hop, ws, 1, normalize,
                              n_iter, momentum, length, rand_init)
        lr_out = librosa.griffinlim(specgram.squeeze(0).numpy(),
                                    n_iter=n_iter,
                                    hop_length=hop,
                                    momentum=momentum,
                                    init=init,
                                    length=length)
        lr_out = torch.from_numpy(lr_out).unsqueeze(0)

        self.assertTrue(torch.allclose(ta_out, lr_out, atol=5e-5))
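
For reference, the transform-level API wraps the same functional; a minimal round-trip sketch with illustrative parameter values (not taken from the test above):

import torch
import torchaudio

waveform = torch.rand(1, 16000)
spec = torchaudio.transforms.Spectrogram(n_fft=400, power=2)(waveform)
# GriffinLim must be configured with the same power that produced the spectrogram.
recovered = torchaudio.transforms.GriffinLim(n_fft=400, power=2, n_iter=32)(spec)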
Example #2
def wav_to_stft(
    wav_p: str,
    nperseg: int = constant.N_FFT,
    stride: int = constant.STFT_STRIDE,
) -> th.Tensor:
    raw_audio, sr = th_audio.load(wav_p)

    assert sr == constant.SAMPLE_RATE, \
        f"Audio sample rate must be {constant.SAMPLE_RATE}Hz, " \
        f"file \"{wav_p}\" is {sr}Hz"

    raw_audio_mono = raw_audio.mean(0)

    hann_window = th.hann_window(nperseg)

    complex_values = th_audio_f.spectrogram(raw_audio_mono,
                                            pad=0,
                                            window=hann_window,
                                            n_fft=nperseg,
                                            hop_length=stride,
                                            win_length=nperseg,
                                            power=None,
                                            normalized=True,
                                            return_complex=True)

    # remove Nyquist frequency
    return complex_values[:-1, :]
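
A hypothetical call, assuming a file speech.wav recorded at constant.SAMPLE_RATE:

stft = wav_to_stft("speech.wav")
# (n_fft // 2) frequency bins (Nyquist bin removed) x n_frames, complex-valued
print(stft.shape, stft.dtype)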
Example #3
    def test_MelScale(self):
        """MelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        hop_length = n_fft // 4

        # Prepare spectrogram input. We use torchaudio to compute one.
        sound, sample_rate = self._get_sample_data('whitenoise_1min.mp3')
        spec_ta = F.spectrogram(sound,
                                pad=0,
                                window=torch.hann_window(n_fft),
                                n_fft=n_fft,
                                hop_length=hop_length,
                                win_length=n_fft,
                                power=2,
                                normalized=False)
        spec_lr = spec_ta.cpu().numpy().squeeze()
        # Perform MelScale with torchaudio and librosa
        melspec_ta = transforms.MelScale(n_mels=n_mels,
                                         sample_rate=sample_rate)(spec_ta)
        melspec_lr = librosa.feature.melspectrogram(S=spec_lr,
                                                    sr=sample_rate,
                                                    n_fft=n_fft,
                                                    hop_length=hop_length,
                                                    win_length=n_fft,
                                                    center=True,
                                                    window='hann',
                                                    n_mels=n_mels,
                                                    htk=True,
                                                    norm=None)
        # Note: Using relaxed rtol instead of atol
        assert torch.allclose(melspec_ta,
                              torch.from_numpy(melspec_lr[None, ...]),
                              rtol=1e-3)
Example #4
def func(tensor):
    n_fft = 400
    ws = 400
    hop = 200
    pad = 0
    window = torch.hann_window(ws, device=tensor.device, dtype=tensor.dtype)
    power = 2.
    normalize = False
    return F.spectrogram(tensor, pad, window, n_fft, hop, ws, power, normalize)
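
Presumably a helper used for device/dtype consistency tests; a direct call on hypothetical input:

out = func(torch.rand(1, 1000))
print(out.shape)  # torch.Size([1, 201, 6]): 400 // 2 + 1 bins, 1 + 1000 // 200 frames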
Example #5
def file_log_spectrogram(sound, segment_time=20, overlap_time=10):
    r"""Generates a log power spectrogram of a given sound file.

    ``segment_time`` and ``overlap_time`` are given in milliseconds.
    """
    waveform, fs = torchaudio.load(sound)
    nperseg = int(segment_time * fs / 1000)  # TODO: do not hardcode these
    noverlap = int(overlap_time * fs / 1000)
    cur_input = torch.log(
        F.spectrogram(waveform, 0, torch.hann_window(nperseg), nperseg,
                      nperseg - noverlap, nperseg, 2, False) + 1e-10)
    return torch.squeeze(torch.transpose(cur_input, 1, 2))
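
For example, at a hypothetical 16 kHz sample rate the defaults give nperseg = 20 * 16000 / 1000 = 320 samples and a hop of 160 samples, so each frame carries 320 // 2 + 1 = 161 frequency bins:

feats = file_log_spectrogram("one_second_16k.wav")  # hypothetical 16 kHz mono file
print(feats.shape)  # (n_frames, 161) after the transpose and squeeze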
Example #6
    def forward(self, waveform: Tensor) -> Tensor:
        r"""
        Args:
            waveform (Tensor): Tensor of audio of dimension (..., time).

        Returns:
            Tensor: Dimension (..., freq, time), where freq is
            ``n_fft // 2 + 1`` where ``n_fft`` is the number of
            Fourier bins, and time is the number of window hops (n_frame).
        """
        return F.spectrogram(waveform, self.pad, self.window, self.n_fft, self.hop_length,
                             self.win_length, self.power, self.normalized)
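
A quick shape check of the docstring's claim, with illustrative sizes:

import torch
import torchaudio

spectrogram = torchaudio.transforms.Spectrogram(n_fft=512, hop_length=128)
out = spectrogram(torch.rand(2, 8000))
print(out.shape)  # torch.Size([2, 257, 63]): 512 // 2 + 1 = 257 freq bins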
Example #7
    def test_InverseMelScale(self):
        """InverseMelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        n_stft = n_fft // 2 + 1
        hop_length = n_fft // 4

        # Prepare mel spectrogram input. We use torchaudio to compute one.
        sound, sample_rate = _load_audio_asset(
            'steam-train-whistle-daniel_simon.wav', offset=2**10, num_frames=2**14)
        sound = sound.mean(dim=0, keepdim=True)
        spec_orig = F.spectrogram(
            sound, pad=0, window=torch.hann_window(n_fft), n_fft=n_fft,
            hop_length=hop_length, win_length=n_fft, power=2, normalized=False)
        melspec_ta = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=sample_rate)(spec_orig)
        melspec_lr = melspec_ta.cpu().numpy().squeeze()
        # Perform InverseMelScale with torchaudio and librosa
        spec_ta = torchaudio.transforms.InverseMelScale(
            n_stft, n_mels=n_mels, sample_rate=sample_rate)(melspec_ta)
        spec_lr = librosa.feature.inverse.mel_to_stft(
            melspec_lr, sr=sample_rate, n_fft=n_fft, power=2.0, htk=True, norm=None)
        spec_lr = torch.from_numpy(spec_lr[None, ...])

        # Align scales: librosa returns a magnitude spectrogram, while torchaudio
        # returns a power spectrogram, so take the square root on the torchaudio side
        spec_orig = spec_orig.sqrt()
        spec_ta = spec_ta.sqrt()

        threshold = 2.0
        # This threshold was chosen empirically, based on the following observation
        #
        # torch.dist(spec_lr, spec_ta, p=float('inf'))
        # >>> tensor(1.9666)
        #
        # The spectrograms reconstructed by librosa and torchaudio are not comparable
        # elementwise because they use different approximation algorithms, so the
        # resulting values can differ in magnitude (although most of them are very close).
        # See
        # https://github.com/pytorch/audio/pull/366 for the discussion of the choice of algorithm
        # https://github.com/pytorch/audio/pull/448/files#r385747021 for the distribution of P-inf
        # distance over frequencies.
        assert torch.allclose(spec_ta, spec_lr, atol=threshold)

        threshold = 1700.0
        # This threshold was chosen empirically, based on the following observations
        #
        # torch.dist(spec_orig, spec_ta, p=1)
        # >>> tensor(1644.3516)
        # torch.dist(spec_orig, spec_lr, p=1)
        # >>> tensor(1420.7103)
        # torch.dist(spec_lr, spec_ta, p=1)
        # >>> tensor(943.2759)
        assert torch.dist(spec_orig, spec_ta, p=1) < threshold
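
A minimal sketch of the same round trip on random data (sizes are illustrative; InverseMelScale solves an approximate inverse problem, so only rough agreement should be expected):

import torch
import torchaudio

n_stft, n_mels = 201, 64
spec = torch.rand(1, n_stft, 50)
mel = torchaudio.transforms.MelScale(n_mels=n_mels, sample_rate=16000, n_stft=n_stft)(spec)
approx = torchaudio.transforms.InverseMelScale(n_stft, n_mels=n_mels, sample_rate=16000)(mel)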
Example #8
    def forward(self, waveform):
        r"""
        Args:
            waveform (torch.Tensor): Tensor of audio of dimension (channel, time)

        Returns:
            torch.Tensor: Dimension (channel, freq, time), where channel
            is unchanged, freq is ``n_fft // 2 + 1`` where ``n_fft`` is the number of
            Fourier bins, and time is the number of window hops (n_frames).
        """
        return F.spectrogram(waveform, self.pad, self.window, self.n_fft,
                             self.hop_length, self.win_length, self.power,
                             self.normalized)
Example #9
    def test_grad_at_zero(self, power):
        """The gradient of power spectrogram should not be nan but zero near x=0

        https://github.com/pytorch/audio/issues/993
        """
        x = torch.zeros(1, 22050, requires_grad=True)
        spec = F.spectrogram(
            x,
            pad=0,
            window=None,
            n_fft=2048,
            hop_length=None,
            win_length=None,
            power=power,
            normalized=False,
        )
        spec.sum().backward()
        assert not x.grad.isnan().sum()
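
The underlying issue: with power=1 the magnitude is sqrt(re**2 + im**2), and the derivative of sqrt is unbounded at the origin. The same failure pattern can be reproduced directly:

z = torch.zeros(2, requires_grad=True)
z.pow(2).sum().sqrt().backward()
print(z.grad)  # tensor([nan, nan]) unless the backward is special-cased near zero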
Example #10
    def test_torchscript_spectrogram(self):
        @torch.jit.script
        def jit_method(sig, pad, window, n_fft, hop, ws, power, normalize):
            # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
            return F.spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize)

        tensor = torch.rand((1, 1000))
        n_fft = 400
        ws = 400
        hop = 200
        pad = 0
        window = torch.hann_window(ws)
        power = 2
        normalize = False

        jit_out = jit_method(tensor, pad, window, n_fft, hop, ws, power, normalize)
        py_out = F.spectrogram(tensor, pad, window, n_fft, hop, ws, power, normalize)

        self.assertTrue(torch.allclose(jit_out, py_out))
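
TorchScript also accepts Python 3 type annotations in place of the # type: comment above; an equivalent (hypothetically named) version:

@torch.jit.script
def jit_method_annotated(sig: Tensor, pad: int, window: Tensor, n_fft: int,
                         hop: int, ws: int, power: int, normalize: bool) -> Tensor:
    return F.spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize)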
Example #11
    def test_MelScale(self):
        """MelScale transform is comparable to that of librosa"""
        n_fft = 2048
        n_mels = 256
        hop_length = n_fft // 4

        # Prepare spectrogram input. We use torchaudio to compute one.
        common_utils.set_audio_backend('default')
        sound, sample_rate = _load_audio_asset('whitenoise_1min.mp3')
        sound = sound.mean(dim=0, keepdim=True)
        spec_ta = F.spectrogram(sound,
                                pad=0,
                                window=torch.hann_window(n_fft),
                                n_fft=n_fft,
                                hop_length=hop_length,
                                win_length=n_fft,
                                power=2,
                                normalized=False)
        spec_lr = spec_ta.cpu().numpy().squeeze()
        # Perform MelScale with torchaudio and librosa
        melspec_ta = torchaudio.transforms.MelScale(
            n_mels=n_mels, sample_rate=sample_rate)(spec_ta)
        melspec_lr = librosa.feature.melspectrogram(S=spec_lr,
                                                    sr=sample_rate,
                                                    n_fft=n_fft,
                                                    hop_length=hop_length,
                                                    win_length=n_fft,
                                                    center=True,
                                                    window='hann',
                                                    n_mels=n_mels,
                                                    htk=True,
                                                    norm=None)
        # Note: Using relaxed rtol instead of atol
        self.assertEqual(melspec_ta,
                         torch.from_numpy(melspec_lr[None, ...]),
                         atol=1e-8,
                         rtol=1e-3)
Example #12
def jit_method(sig, pad, window, n_fft, hop, ws, power, normalize):
    # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor
    return F.spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize)