def test_random_stft_sizes(self):
    """Compare FilterbankFeatures frame counts with librosa for random STFT sizes.

    Five random (n_fft, window, hop, audio length) configurations are drawn
    with ``exact_pad=False`` and then five more with ``exact_pad=True``.
    For each one the test asserts that

    * the time axis of the returned features (``shape[2]``) equals the
      returned length, and
    * that same time axis equals the number of frames of a librosa STFT
      computed on the same waveform.
    """
    for exact_pad in (False, True):
        for _ in range(5):
            # Draw sizes so that hop <= window <= n_fft.
            n_fft = random.randint(128, 2048)
            win_length = random.randint(128, n_fft)
            hop_length = random.randint(64, win_length)
            featurizer = FilterbankFeatures(
                exact_pad=exact_pad,
                pad_to=1,
                n_fft=n_fft,
                n_window_size=win_length,
                n_window_stride=hop_length,
            )

            num_samples = random.randint(n_fft, 2 ** 16)
            audio = torch.randn(1, num_samples)
            audio_len = torch.tensor([num_samples])
            features, features_len = featurizer(audio, audio_len)

            # Shared tail of both failure messages: the drawn configuration.
            drawn = f"{n_fft}, {win_length}, {hop_length}, {num_samples}"
            assert features.shape[2] == features_len[0], f"{features.shape} != {features_len}: {drawn}"

            waveform = audio.cpu().detach().numpy().squeeze()
            if exact_pad:
                # Reference for exact padding: reflect-pad the waveform by
                # (window - hop) // 2 and run librosa without centering.
                pad_width = int((win_length - hop_length) // 2)
                waveform = np.pad(waveform, pad_width, mode="reflect")
                reference = librosa.stft(
                    waveform, n_fft=n_fft, hop_length=hop_length, win_length=win_length, center=False,
                )
            else:
                reference = librosa.stft(waveform, n_fft=n_fft, hop_length=hop_length, win_length=win_length)

            assert reference.shape[1] == features.shape[2], f"{features.shape} != {reference.shape}: {drawn}"
def test_seq_len(self):
    """Check the reported sequence length for a fixed 800-sample input.

    Asserts that the time axis of the features (``shape[2]``) equals the
    length returned by the featurizer, and that it also equals the frame
    count of a librosa STFT with ``n_fft=512``, ``hop_length=160`` and
    ``win_length=320`` (presumably the featurizer's default sizes - confirm
    against ``FilterbankFeatures``).
    """
    num_samples = 800
    featurizer = FilterbankFeatures(exact_pad=False, pad_to=1)

    audio = torch.randn(1, num_samples)
    audio_len = torch.tensor([num_samples])
    features, features_len = featurizer(audio, audio_len)
    assert features.shape[2] == features_len[0], f"{features.shape} != {features_len}"

    waveform = audio.cpu().detach().numpy().squeeze()
    reference = librosa.stft(waveform, n_fft=512, hop_length=160, win_length=320)
    assert reference.shape[1] == features.shape[2], f"{features.shape} != {reference.shape}"
def make_preprocessor_trainable(stt):
    """Swap ``stt``'s featurizer for a ``FilterbankFeatures`` built with ``use_grads=True``.

    The replacement is constructed with default arguments plus
    ``use_grads=True``, then receives the old featurizer's ``state_dict`` and
    a copy of the old featurizer's public instance attributes, so that it is
    configured like the one it replaces.

    Args:
        stt: Object exposing ``stt.preprocessor.featurizer``. It is modified
            in place.

    Returns:
        The same ``stt`` object, with ``stt.preprocessor.featurizer``
        replaced. No device placement is performed here.
    """
    old_featurizer = stt.preprocessor.featurizer

    # Public instance attributes to carry over. Excluded on purpose:
    #   * 'forward'   - excluded by the original code; presumably an
    #                   instance-level wrapped forward - TODO confirm.
    #   * 'use_grads' - so a value copied from the old featurizer cannot
    #                   override the use_grads=True requested below (only
    #                   matters if the class stores that flag as a public
    #                   attribute - TODO confirm).
    skipped = ('forward', 'use_grads')
    carried_attrs = {
        name: value
        for name, value in old_featurizer.__dict__.items()
        if not name.startswith('_') and name not in skipped
    }
    saved_state = old_featurizer.state_dict()

    # Fully configure the replacement before attaching it, so a failure in
    # load_state_dict leaves ``stt`` with its original featurizer.
    new_featurizer = FilterbankFeatures(use_grads=True)
    new_featurizer.load_state_dict(saved_state)
    for name, value in carried_attrs.items():
        setattr(new_featurizer, name, value)

    stt.preprocessor.featurizer = new_featurizer
    return stt
def __init__(
    self,
    sample_rate=16000,
    window_size=0.02,
    window_stride=0.01,
    n_window_size=None,
    n_window_stride=None,
    window="hann",
    normalize="per_feature",
    n_fft=None,
    preemph=0.97,
    features=64,
    lowfreq=0,
    highfreq=None,
    log=True,
    log_zero_guard_type="add",
    log_zero_guard_value=2 ** -24,
    dither=1e-5,
    pad_to=16,
    frame_splicing=1,
    stft_exact_pad=False,
    stft_conv=False,
    pad_value=0,
    mag_power=2.0,
):
    """Validate the window arguments and build the ``FilterbankFeatures`` featurizer.

    The window length and stride can each be given either as
    ``window_size`` / ``window_stride`` (multiplied by ``sample_rate`` and
    truncated to an integer sample count) or directly in samples as
    ``n_window_size`` / ``n_window_stride``. A falsy value (``None``, ``0``)
    counts as "not given".

    Args:
        sample_rate: Stored as ``self._sample_rate`` and forwarded to the
            featurizer.
        window_size: Window length before scaling by ``sample_rate``.
        window_stride: Window stride before scaling by ``sample_rate``.
        n_window_size: Window length in samples.
        n_window_stride: Window stride in samples.
        features: Forwarded to the featurizer as ``nfilt``.

    All remaining arguments are forwarded to ``FilterbankFeatures`` under
    the same name.

    Raises:
        ValueError: If both ``window_size`` and ``n_window_size`` are given,
            or both ``window_stride`` and ``n_window_stride`` are given.
    """
    # NOTE(review): the base initializer receives the raw n_window_*
    # arguments (None by default), i.e. before the conversion below -
    # confirm this is what the base class expects.
    super().__init__(n_window_size, n_window_stride)
    self._sample_rate = sample_rate

    # Each quantity may be specified in only one of its two forms.
    if window_size and n_window_size:
        raise ValueError(
            f"{self} received both window_size and " f"n_window_size. Only one should be specified."
        )
    if window_stride and n_window_stride:
        raise ValueError(
            f"{self} received both window_stride and " f"n_window_stride. Only one should be specified."
        )

    # Prefer the sample_rate-scaled values when present; otherwise keep the
    # sample counts that were passed in.
    win_samples = int(window_size * self._sample_rate) if window_size else n_window_size
    hop_samples = int(window_stride * self._sample_rate) if window_stride else n_window_stride

    featurizer_kwargs = dict(
        sample_rate=self._sample_rate,
        n_window_size=win_samples,
        n_window_stride=hop_samples,
        window=window,
        normalize=normalize,
        n_fft=n_fft,
        preemph=preemph,
        nfilt=features,
        lowfreq=lowfreq,
        highfreq=highfreq,
        log=log,
        log_zero_guard_type=log_zero_guard_type,
        log_zero_guard_value=log_zero_guard_value,
        dither=dither,
        pad_to=pad_to,
        frame_splicing=frame_splicing,
        stft_exact_pad=stft_exact_pad,
        stft_conv=stft_conv,
        pad_value=pad_value,
        mag_power=mag_power,
    )
    self.featurizer = FilterbankFeatures(**featurizer_kwargs)