def forward(self, signal, threshs_file):
    if self.phi is None:
        return signal

    # fft
    complex_spectrum = torch.stft(signal, n_fft=self.win_length,
                                  hop_length=self.hop_length,
                                  win_length=self.win_length,
                                  window=torch.hamming_window(self.win_length),
                                  pad_mode='constant', onesided=True)

    # mask signal with psychoacoustic thresholds
    mask = self.get_psycho_mask(complex_spectrum, threshs_file)
    complex_spectrum_masked = complex_spectrum * mask

    # ifft
    signal_out = torch.istft(complex_spectrum_masked, n_fft=self.win_length,
                             hop_length=self.hop_length,
                             win_length=self.win_length,
                             window=torch.hamming_window(self.win_length),
                             onesided=True)
    return signal_out
def test_pwelch(random_state):
    from cplxmodule.utils.spectrum import pwelch
    from scipy.signal import welch

    # https://www.mathworks.com/help/signal/ref/pwelch.html#btulskp-6
    fs = 1000.
    tt = np.r_[:5 * fs - 1] / fs

    shape = 2, len(tt)
    epsilon = random_state.randn(*shape) + 1j * random_state.randn(*shape)
    np_x = np.cos(2 * np.pi * 100 * tt)[np.newaxis] + epsilon * 0.01

    tr_x = torch.tensor(np.stack([np_x.real, np_x.imag], axis=-1))
    tr_x.requires_grad = False

    tr_window = torch.hamming_window(500, periodic=False, dtype=tr_x.dtype)

    tr_ff, tr_px = pwelch(tr_x, 1, tr_window, fs=fs,
                          scaling="density", n_overlap=300)
    np_ff, np_px = welch(np_x, fs=fs, axis=-1, window=tr_window.numpy(),
                         nfft=None, nperseg=None, scaling="density",
                         noverlap=300, detrend=False, return_onesided=False)

    assert torch.allclose(tr_px, torch.from_numpy(np_px))
    assert torch.allclose(tr_ff, torch.from_numpy(np_ff))

    tr_ff, tr_px = pwelch(tr_x, 1, tr_window, fs=fs,
                          scaling="spectrum", n_overlap=499)
    np_ff, np_px = welch(np_x, fs=fs, axis=-1, window=tr_window.numpy(),
                         nfft=None, nperseg=None, scaling="spectrum",
                         noverlap=499, detrend=False, return_onesided=False)

    assert torch.allclose(tr_px, torch.from_numpy(np_px))
    assert torch.allclose(tr_ff, torch.from_numpy(np_ff))
def get_spectrogram_feature(self, signal):
    spectrogram = torch.stft(
        torch.FloatTensor(signal), self.n_fft,
        hop_length=self.hop_length,
        win_length=self.n_fft, window=torch.hamming_window(self.n_fft),
        center=False, normalized=False, onesided=True
    )
    spectrogram = (spectrogram[:, :, 0].pow(2) + spectrogram[:, :, 1].pow(2)).pow(0.5)
    spectrogram = np.log1p(spectrogram.numpy())

    # Refer to the "Sequence to Sequence Learning with Neural Networks" paper
    if self.input_reverse:
        spectrogram = spectrogram[:, ::-1]
        spectrogram = torch.FloatTensor(np.ascontiguousarray(np.swapaxes(spectrogram, 0, 1)))
    else:
        spectrogram = torch.FloatTensor(spectrogram).transpose(0, 1)

    if self.normalize:
        spectrogram -= spectrogram.mean()

    return spectrogram
def get_window(window_type: str, window_length_in_samp: int,
               device: Optional[torch.device] = None) -> torch.Tensor:
    # Increase precision in order to achieve parity with the
    # scipy.signal.windows.get_window implementation
    if window_type == "bartlett":
        return torch.bartlett_window(window_length_in_samp, periodic=False,
                                     dtype=torch.float64, device=device).to(torch.float32)
    elif window_type == "blackman":
        return torch.blackman_window(window_length_in_samp, periodic=False,
                                     dtype=torch.float64, device=device).to(torch.float32)
    elif window_type == "hamming":
        return torch.hamming_window(window_length_in_samp, periodic=False,
                                    dtype=torch.float64, device=device).to(torch.float32)
    elif window_type == "hann":
        return torch.hann_window(window_length_in_samp, periodic=False,
                                 dtype=torch.float64, device=device).to(torch.float32)
    else:
        raise ValueError(f"Unknown window type: {window_type}")
def test_hamming_window(random_state=None):
    from scipy.signal.windows import hamming

    n_window = 1024

    np_window = hamming(n_window, False).astype(np.float64)
    tr_window = torch.hamming_window(n_window, periodic=True, dtype=torch.float64)
    assert torch.allclose(tr_window, torch.from_numpy(np_window))

    np_window = hamming(n_window, True).astype(np.float64)
    tr_window = torch.hamming_window(n_window, periodic=False, dtype=torch.float64)
    assert torch.allclose(tr_window, torch.from_numpy(np_window))
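# Illustrative follow-up check (not part of the original test; names assumed): in torch,
# a periodic window of length N equals the symmetric window of length N + 1 with the
# last sample dropped, which is why the scipy `sym` flag is inverted in the test above.
def check_periodic_vs_symmetric(n_window=1024):
    periodic = torch.hamming_window(n_window, periodic=True, dtype=torch.float64)
    symmetric = torch.hamming_window(n_window + 1, periodic=False, dtype=torch.float64)
    assert torch.allclose(periodic, symmetric[:-1])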
def _feature_window_function(window_type: str,
                             window_size: int,
                             blackman_coeff: float,
                             device: str) -> Tensor:
    r"""Returns a window function with the given type and size."""
    if window_type == HANNING:
        window = torch.hann_window(window_size, periodic=False)
    elif window_type == HAMMING:
        window = torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46)
    elif window_type == POVEY:
        # like hanning but goes to zero at edges
        window = torch.hann_window(window_size, periodic=False).pow(0.85)
    elif window_type == RECTANGULAR:
        window = torch.ones(window_size)
    elif window_type == BLACKMAN:
        a = 2 * math.pi / (window_size - 1)
        window_function = torch.arange(window_size)
        # can't use torch.blackman_window as they use different coefficients
        window = (blackman_coeff - 0.5 * torch.cos(a * window_function) +
                  (0.5 - blackman_coeff) * torch.cos(2 * a * window_function))
    else:
        raise Exception('Invalid window type ' + window_type)
    return window.to(device)
def overlap_add(mel_frames, nmels):
    # Hamming windows used for overlap add
    hamm = torch.hamming_window(160, periodic=False)
    half_hamm = torch.cat((torch.ones(80), hamm[80:]), dim=0)
    hamm.unsqueeze_(1)
    half_hamm.unsqueeze_(1)

    time = (1 + len(mel_frames)) * 80
    recon = torch.zeros((time, nmels))
    ind = torch.arange(160) - 80

    for i in range(len(mel_frames)):
        frame = mel_frames[i]
        if frame.shape[1] != nmels:
            frame = frame.permute(1, 0)
        ind += 80
        if i == 0:
            recon[ind, :] += frame * half_hamm
        elif i == len(mel_frames) - 1:
            recon[ind, :] += frame * torch.flip(half_hamm, dims=(0,))
        else:
            recon[ind, :] += frame * hamm

    return recon
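# Hedged usage sketch (shapes assumed, not from the source): overlap-add a list of
# 160-sample mel frames with 50% overlap back into a single (time, nmels) tensor.
def example_overlap_add(nmels=80):
    frames = [torch.randn(160, nmels) for _ in range(3)]
    recon = overlap_add(frames, nmels)
    return recon  # shape ((1 + 3) * 80, nmels) == (320, nmels)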
def torch_spec2wav(self, spectrogram, phase=None):
    spectrogram = spectrogram.transpose(2, 1)
    phase = phase.transpose(2, 1)

    # denormalise spectrogram
    S = (torch.clamp(spectrogram, 0.0, 1.0) - 1.0) * -self.min_level_db
    S = S + self.ref_level_db

    # db_to_amp
    stft_matrix = torch.pow(10.0, S * 0.05)

    # invert phase
    phase = torch.stack([phase.cos(), phase.sin()], dim=-1).to(dtype=stft_matrix.dtype,
                                                               device=stft_matrix.device)
    stft_matrix = stft_matrix.unsqueeze(-1).expand_as(phase)

    return torchaudio.functional.istft(
        stft_matrix * torch.exp(phase),
        self.n_fft,
        hop_length=self.hop_length,
        win_length=self.win_length,
        window=torch.hamming_window(self.win_length, periodic=False,
                                    alpha=0.5, beta=0.5).to(device=stft_matrix.device),
        center=True,
        normalized=False,
        onesided=True,
        length=None)
def get_spectrogram_feature(filepath):
    (rate, width, sig) = wavio.readwav(filepath)
    sig = sig.ravel()
    # sig, _ = librosa.effects.trim(sig.astype(np.float32))

    valid_pos = np.where(sig > MIN_SIGNAL_VALUE)[0]
    sig = sig[valid_pos[0]:valid_pos[-1]]
    if len(sig) > MAX_SIGNAL_LENGTH:
        sig = sig[:MAX_SIGNAL_LENGTH]

    stft = torch.stft(torch.FloatTensor(sig), N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=int(0.030 * SAMPLE_RATE),
                      window=torch.hamming_window(int(0.030 * SAMPLE_RATE)),
                      center=False, normalized=False, onesided=True)
    stft = (stft[:, :, 0].pow(2) + stft[:, :, 1].pow(2)).pow(0.5)

    amag = stft.numpy()
    feat = torch.FloatTensor(amag)
    feat = torch.FloatTensor(feat).transpose(0, 1)
    return feat
def get_window(name, window_length, squared=False):
    """
    Returns a windowing function.

    Arguments:
    ----------
        name (str) : name of the window; one of 'hann', 'hamming' or 'blackman'
        window_length (int) : length of the window
        squared (bool) : if true, square the window

    Returns:
    ----------
        torch.FloatTensor : window of size `window_length`
    """
    if name == "hann":
        window = torch.hann_window(window_length)
    elif name == "hamming":
        window = torch.hamming_window(window_length)
    elif name == "blackman":
        window = torch.blackman_window(window_length)
    else:
        raise ValueError("Invalid window name {}".format(name))
    if squared:
        window *= window
    return window
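# Minimal usage sketch for the helper above (values and names here are illustrative
# assumptions, not from the original project): build a Hamming analysis window,
# pass it to torch.stft, and take the magnitude.
def example_stft_with_window(window_length=512):
    window = get_window("hamming", window_length)
    signal = torch.randn(1, 16000)  # dummy mono waveform, batch of 1
    spec = torch.stft(signal, n_fft=window_length, hop_length=window_length // 4,
                      win_length=window_length, window=window, return_complex=True)
    return spec.abs()  # magnitude spectrogram, shape (1, window_length // 2 + 1, frames)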
def sinc_impulse_response(cutoff_frequency, window_size=512, sample_rate=None):
    """Get a sinc impulse response for a set of low-pass cutoff frequencies.

    Args:
        cutoff_frequency: Frequency cutoff for low-pass sinc filter. If the
            sample_rate is given, cutoff_frequency is in Hertz. If sample_rate
            is None, cutoff_frequency is a normalized ratio (frequency/nyquist)
            in the range [0, 1.0]. Shape [batch_size, n_time, 1].
        window_size: Size of the Hamming window to apply to the impulse.
        sample_rate: Optionally provide the sample rate.

    Returns:
        impulse_response: A series of impulse responses. Shape
            [batch_size, n_time, (window_size // 2) * 2 + 1].
    """
    if sample_rate is not None:
        cutoff_frequency *= 2 / sample_rate

    half_size = window_size // 2
    full_size = half_size * 2 + 1

    idx = th.arange(-half_size, half_size + 1, dtype=th.float)[None, None, :]
    impulse_response = sinc(cutoff_frequency * idx)

    window = th.hamming_window(full_size).expand_as(impulse_response)
    impulse_response = window * th.real(impulse_response)
    return impulse_response / impulse_response.sum(-1, keepdim=True)
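# Hedged usage sketch (values assumed, not from the source): build a bank of low-pass
# impulse responses for a batch of normalized cutoff frequencies, following the shape
# convention documented above.
def example_sinc_ir():
    cutoff = th.full((1, 4, 1), 0.25)  # [batch_size, n_time, 1], fraction of Nyquist
    ir = sinc_impulse_response(cutoff, window_size=128)
    return ir  # shape [1, 4, 129]; each impulse response sums to 1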
def __getitem__(self, idx):
    noisy_path = self.noisy_WAVs[idx]
    # get the filename of the clean WAV from the filename of the noisy WAV
    clean_path = self.clean_dir.joinpath(noisy_path.name.split('+')[0] + '.wav')

    while True:
        try:
            clean_waveform, _ = torchaudio.load(clean_path, normalization=2 ** 15)
            noisy_waveform, _ = torchaudio.load(noisy_path, normalization=2 ** 15)
        except (RuntimeError, OSError):
            continue
        break

    assert clean_waveform.shape[0] == 1 and noisy_waveform.shape[0] == 1, 'WAV file is not single channel!'

    window = torch.hamming_window(self.n_fft)
    x_stft = torch.stft(noisy_waveform.view(-1), n_fft=self.n_fft,
                        hop_length=self.n_fft // 4, win_length=self.n_fft, window=window)
    y_stft = torch.stft(clean_waveform.view(-1), n_fft=self.n_fft,
                        hop_length=self.n_fft // 4, win_length=self.n_fft, window=window)

    x_ps = x_stft.pow(2).sum(-1)
    x_lps = LogTransform()(x_ps)
    x_ms = x_ps.sqrt()
    y_ms = y_stft.pow(2).sum(-1).sqrt()
    noise_ms = (x_stft - y_stft).pow(2).sum(-1).sqrt()

    # VAD
    y_ms_filtered = y_ms[self.VAD_frequencies]
    y_energy_filtered = y_ms_filtered.pow(2).mean(dim=0)
    y_energy_filtered_averaged = self.__moving_average(y_energy_filtered)
    y_peak_energy = y_energy_filtered_averaged.max()
    VAD = torch.where(y_energy_filtered_averaged > y_peak_energy / 1000,
                      torch.ones_like(y_energy_filtered),
                      torch.zeros_like(y_energy_filtered))
    VAD = VAD.bool()

    # mean normalization
    frames = []
    x_lps = x_lps.transpose(0, 1)  # (time, frequency)
    n_init_frames = self.n_init_frames
    alpha_feat_init = self.alpha_feat_init
    alpha_feat = self.alpha_feat
    for frame_counter, frame_feature in enumerate(x_lps):
        if frame_counter < n_init_frames:
            alpha = alpha_feat_init
        else:
            alpha = alpha_feat
        if frame_counter == 0:
            mu = frame_feature
            sigmasquare = frame_feature.pow(2)
        mu = alpha * mu + (1 - alpha) * frame_feature
        sigmasquare = alpha * sigmasquare + (1 - alpha) * frame_feature.pow(2)
        sigma = torch.sqrt(torch.clamp(sigmasquare - mu.pow(2), min=1e-12))  # limit for sqrt
        norm_feature = (frame_feature - mu) / sigma
        frames.append(norm_feature)
    x_lps = torch.stack(frames, dim=0).transpose(0, 1)  # (frequency, time)

    if not self.test:
        return x_lps, x_ms, y_ms, noise_ms, VAD
    if self.test:
        return noisy_waveform.view(-1), clean_waveform.view(-1), x_stft, y_stft, x_lps, x_ms, y_ms, VAD
def __init__(self, win_length, hop_length=None, n_fft=None):
    super().__init__()
    self.window = torch.hamming_window(win_length)
    if hop_length is None:
        hop_length = win_length // 4
    if n_fft is None:
        n_fft = win_length
    self.hop_length = hop_length
    self.n_fft = n_fft
def _get_feature(self, signal: np.ndarray) -> np.ndarray:
    spectrogram = torch.stft(
        Tensor(signal), self.n_fft,
        hop_length=self.hop_length,
        win_length=self.n_fft, window=torch.hamming_window(self.n_fft),
        center=False, normalized=False, onesided=True
    )
    spectrogram = (spectrogram[:, :, 0].pow(2) + spectrogram[:, :, 1].pow(2)).pow(0.5)
    spectrogram = np.log1p(spectrogram.numpy())
    return spectrogram
def get_torch_spectrogram(filepath, sr=16000, window_size=20, stride=10):
    r"""
    Get a spectrogram with torch.

    Args:
        filepath (str): path of the audio file
        sr (int): sample rate
        window_size (int): window size (ms)
        stride (int): forwarding (hop) size (ms)

    Returns: spectrogram
        - **spectrogram** (torch.Tensor): spectrogram feature

    Examples::
        Generate a spectrogram from a time series

        >>> get_torch_spectrogram(filepath)
        Tensor([[ 2.891e-07, 2.548e-03, ..., 8.116e-09, 5.633e-09],
                [ 1.986e-07, 1.162e-02, ..., 9.332e-08, 6.716e-09],
                ...,
                [ 3.668e-09, 2.029e-08, ..., 3.208e-09, 2.864e-09],
                [ 2.561e-10, 2.096e-09, ..., 7.543e-10, 6.101e-10]])
    """
    if filepath.endswith('.pcm'):
        try:
            pcm = np.memmap(filepath, dtype='h', mode='r')
        except RuntimeError:
            logger.info('RuntimeError in {0}'.format(filepath))
            return None
        signal = np.array([float(x) for x in pcm])
    elif filepath.endswith('.wav'):
        signal, _ = librosa.core.load(filepath, sr=sr)
    else:
        raise ValueError("Unsupported format: {0}".format(filepath.split('.')[-1]))

    N_FFT = int(sr * 0.001 * window_size)
    STRIDE = int(sr * 0.001 * stride)

    spectrogram = torch.stft(torch.FloatTensor(signal), N_FFT,
                             hop_length=STRIDE, win_length=N_FFT,
                             window=torch.hamming_window(N_FFT),
                             center=False, normalized=False, onesided=True)
    spectrogram = (spectrogram[:, :, 0].pow(2) + spectrogram[:, :, 1].pow(2)).pow(0.5)  # (N_FFT / 2 + 1, T)
    spectrogram = np.log1p(spectrogram.numpy())
    spectrogram = torch.FloatTensor(spectrogram).transpose(0, 1)
    spectrogram -= spectrogram.mean()

    return spectrogram
def get_window(window_size, window_type, square_root_window=True):
    """Return the window."""
    window = {
        'hamming': torch.hamming_window(window_size),
        'hanning': torch.hann_window(window_size),
    }[window_type]
    if square_root_window:
        window = torch.sqrt(window)
    return window
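# Hedged round-trip sketch (parameter values assumed, not from the original project):
# with a square-root window used for both analysis and synthesis, the effective
# window applied across the STFT/iSTFT round trip is the original (squared) window.
def roundtrip_with_sqrt_window(signal, window_size=512, hop=128):
    window = get_window(window_size, 'hanning', square_root_window=True)
    spec = torch.stft(signal, n_fft=window_size, hop_length=hop,
                      win_length=window_size, window=window, return_complex=True)
    return torch.istft(spec, n_fft=window_size, hop_length=hop,
                       win_length=window_size, window=window, length=signal.shape[-1])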
def build_window(fft_size, window_fn='hann'):
    if window_fn == 'hann':
        window = torch.hann_window(fft_size, periodic=True)
    elif window_fn == 'hamming':
        window = torch.hamming_window(fft_size, periodic=True)
    else:
        raise ValueError("Unsupported window: {}.".format(window_fn))
    return window
def __call__(self, signal):
    with torch.no_grad():
        torch_signal = torch.Tensor(signal).unsqueeze(0)
        wave = torch_signal.to(self.device)

        stft_noisy_mag, stft_noisy_phase = torchaudio.functional.magphase(
            torch.stft(wave, n_fft=self.nfft, hop_length=self.hop_len,
                       win_length=self.window_len,
                       window=torch.hamming_window(self.window_len).to(self.device)))

        stft_input_mag = torch.transpose(stft_noisy_mag.clone(), 2, 1).cpu().numpy()
        stft_input_power = stft_input_mag[0] ** 2
        smallpower = stft_input_power < self.eps
        stft_input_power[smallpower] = np.log(self.eps)
        stft_input_power[~smallpower] = np.log(stft_input_power[~smallpower])

        stft_input_power = self.norm_function(stft_input_power)
        stft_input_power = torch.FloatTensor(stft_input_power).to(self.device)

        mask = self.model(stft_input_power.unsqueeze(0))
        enhanced_mag = torch.transpose(mask, 2, 1) * stft_noisy_mag

        complex_enhanced = torch.zeros((enhanced_mag.shape[0], enhanced_mag.shape[1],
                                        enhanced_mag.shape[2], 2))
        complex_enhanced[:, :, :, 0] = enhanced_mag * torch.cos(stft_noisy_phase)
        complex_enhanced[:, :, :, 1] = enhanced_mag * torch.sin(stft_noisy_phase)

        enhanced_sig_recon = torch.istft(complex_enhanced, n_fft=self.nfft,
                                         hop_length=self.hop_len,
                                         win_length=self.window_len,
                                         window=torch.hamming_window(self.window_len))

        return enhanced_sig_recon
def get_spectrogram_feature(cfg_data, filepath, train_mode=False):
    use_mel_scale = cfg_data["use_mel_scale"]
    cfg_spec_augment = cfg_data["spec_augment"]
    use_specaug = cfg_spec_augment["use"]
    cfg_trim = cfg_data["trim_silence"]
    use_trim = cfg_trim["use"]

    (rate, width, sig) = wavio.readwav(filepath)
    sig = sig.ravel()
    if use_trim:
        sig = trim.trim(sig, cfg_trim)

    stft = torch.stft(torch.FloatTensor(sig), N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=int(0.030 * SAMPLE_RATE),
                      window=torch.hamming_window(int(0.030 * SAMPLE_RATE)),
                      center=False, normalized=False, onesided=True)
    stft = (stft[:, :, 0].pow(2) + stft[:, :, 1].pow(2)).pow(0.5)

    if use_mel_scale:
        amag = stft.clone().detach()
        amag = amag.view(-1, amag.shape[0], amag.shape[1])  # reshape spectrogram to [batch_size, time, frequency]
        mel = melscale_pytorch.mel_scale(amag, sample_rate=SAMPLE_RATE,
                                         n_mels=N_FFT // 2 + 1)  # melspec with same shape

        if use_specaug and train_mode:
            specaug_prob = 1  # augment probability
            if numpy.random.uniform(0, 1) < specaug_prob:  # apply augment
                mel = spec_augment_pytorch.spec_augment(mel, time_warping_para=80,
                                                        frequency_masking_para=54,
                                                        time_masking_para=40,
                                                        frequency_mask_num=1,
                                                        time_mask_num=1)

        feat = mel.view(mel.shape[1], mel.shape[2])  # squeeze back to [frequency, time]
        feat = feat.transpose(0, 1).clone().detach()

        del sig, stft, amag, mel
    else:  # use baseline feature
        amag = stft.numpy()
        feat = torch.FloatTensor(amag)
        feat = torch.FloatTensor(feat).transpose(0, 1)

        del sig, stft, amag

    return feat
def periodogram(signal, window_fn=torch.hann_window, is_train=False):
    # not used
    if window_fn is not None:
        signal = signal * torch.hamming_window(window_length=signal.size(-1),
                                               periodic=False, device=signal.device)
    if not is_train:
        with torch.no_grad():
            dft = torch.rfft(signal, signal_ndim=1, onesided=True)
    else:
        dft = torch.rfft(signal, signal_ndim=1, onesided=True)
    return torch.pow(dft, 2).sum(-1)
def pre_process(self, data, data_length):
    # ToDo - write the code for generating pitch features
    fbank = self.fbank[data.get_device()]

    pre_emphasis = config.fbank['pre_emphasis']
    frame_size = config.fbank['frame_size']
    frame_stride = config.fbank['frame_stride']
    n_fft = config.fbank['n_fft']
    rate = config.fbank['rate']

    emphasized_data = torch.zeros_like(data).float()
    if config.use_cuda:
        emphasized_data = emphasized_data.to(data.device)
    emphasized_data[:, 1:] = data[:, 1:] - pre_emphasis * data[:, :-1]
    emphasized_data[:, 0] = data[:, 0]

    frame_length, frame_step = frame_size * rate, frame_stride * rate  # Convert from seconds to samples
    frame_length = int(frame_length)
    frame_step = int(frame_step)

    mag_frames = torch.norm(
        torch.stft(emphasized_data, n_fft=n_fft, hop_length=frame_step,
                   win_length=frame_length,
                   window=torch.hamming_window(frame_length).to(emphasized_data.device),
                   pad_mode='constant'),
        dim=3).transpose(2, 1)
    pow_frames = ((1.0 / n_fft) * (mag_frames ** 2))  # Power Spectrum

    filter_banks = torch.matmul(pow_frames, fbank.transpose(1, 0))
    filter_banks[filter_banks == 0] = 2.220446049250313e-16
    filter_banks = 20 * torch.log10(filter_banks)  # dB
    filter_banks -= (torch.mean(filter_banks, dim=(0, 1), keepdim=True) + 1e-8)

    if data_length is None:
        ilens = (torch.ones([filter_banks.shape[0]]) * filter_banks.shape[1]).long()
    else:
        ilens = torch.FloatTensor([
            data_length_i // frame_step + 1 for data_length_i in data_length
        ]).long()  # for filter_banks.shape[0]

    return filter_banks, ilens
def test_linearity_of_istft4(self):
    # hamming_window, not centered, not normalized, onesided
    kwargs4 = {
        'n_fft': 12,
        'window': torch.hamming_window(12),
        'center': False,
        'pad_mode': 'constant',
        'normalized': False,
        'onesided': True,
    }
    data_size = (2, 7, 3, 2)
    self._test_linearity_of_istft(data_size, kwargs4, atol=1e-5, rtol=1e-8)
def test_linearity_of_istft3(self):
    # hamming_window, centered, normalized, not onesided
    kwargs3 = {
        'n_fft': 12,
        'window': torch.hamming_window(12),
        'center': True,
        'pad_mode': 'constant',
        'normalized': True,
        'onesided': False,
    }
    data_size = (2, 12, 7, 2)
    self._test_linearity_of_istft(data_size, kwargs3)
def spectral_ops(self):
    a = torch.randn(10)
    b = torch.randn(10, 8, 4, 2)
    return (
        torch.stft(a, 8),
        torch.istft(b, 8),
        torch.bartlett_window(2, dtype=torch.float),
        torch.blackman_window(2, dtype=torch.float),
        torch.hamming_window(4, dtype=torch.float),
        torch.hann_window(4, dtype=torch.float),
        torch.kaiser_window(4, dtype=torch.float),
    )
def test_istft_is_inverse_of_stft3(self):
    # hamming_window, centered, normalized, not onesided
    kwargs3 = {
        'n_fft': 15,
        'hop_length': 3,
        'win_length': 11,
        'window': torch.hamming_window(11),
        'center': True,
        'pad_mode': 'constant',
        'normalized': True,
        'onesided': False,
    }
    _test_istft_is_inverse_of_stft(kwargs3)
def smooth_upsample2(x, size):
    batch, channels, frames = x.shape
    hop_size = size // frames
    window_size = hop_size * 2

    window = torch.hamming_window(window_size, periodic=True).to(x.device)

    amps = x.view(batch, channels * frames)
    scaled_windows = amps[..., None] * window[None, None, :]
    scaled_windows = scaled_windows.view(batch, channels, frames, window_size)

    output = overlap_add(scaled_windows, apply_window=False)
    output = output[:, :, :-hop_size]
    return output
def __init__(self, cutoffs: list, width: int = None):
    super().__init__()
    self.cutoffs = cutoffs
    if width is None:
        width = int(2 / min(cutoffs))
    self.width = width

    window = torch.hamming_window(2 * width + 1, periodic=False)
    t = np.arange(-width, width + 1, dtype=np.float32)
    filters = []
    for cutoff in cutoffs:
        sinc = torch.from_numpy(np.sinc(2 * cutoff * t))
        filters.append(2 * cutoff * sinc * window)
    self.register_buffer("filters", torch.stack(filters).unsqueeze(1))
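# Hedged sketch of how a filter bank registered this way might be applied; the actual
# forward of the original module is not shown, so this is an assumption. Each
# windowed-sinc kernel is convolved with the input, padded by `width` to preserve
# the time length.
import torch.nn.functional as F

def apply_lowpass_bank(module, x):
    # x: (batch, time) -> (batch, n_cutoffs, time)
    return F.conv1d(x.unsqueeze(1), module.filters, padding=module.width)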
def get_spectrogram_feature(filepath, train_mode=False):
    (rate, width, sig) = wavio.readwav(filepath)
    wavio.writewav24("test.wav", rate=rate, data=sig)
    sig = sig.ravel()
    sig = trim(sig)

    stft = torch.stft(torch.FloatTensor(sig), N_FFT,
                      hop_length=int(0.01 * SAMPLE_RATE),
                      win_length=int(0.030 * SAMPLE_RATE),
                      window=torch.hamming_window(int(0.030 * SAMPLE_RATE)),
                      center=False, normalized=False, onesided=True)
    stft = (stft[:, :, 0].pow(2) + stft[:, :, 1].pow(2)).pow(0.5)

    amag = stft.clone().detach()
    amag = amag.view(-1, amag.shape[0], amag.shape[1])  # reshape spectrogram to [batch_size, time, frequency]
    mel = melscale_pytorch.mel_scale(amag, sample_rate=SAMPLE_RATE,
                                     n_mels=N_FFT // 2 + 1)  # melspec with same shape

    plt.subplot(1, 2, 1)
    plt.imshow(mel.transpose(1, 2).squeeze(), cmap='jet')

    p = 1  # always augment
    randp = np.random.uniform(0, 1)
    do_aug = p > randp
    if do_aug and train_mode:  # apply augment
        print("augment image")
        mel = spec_augment_pytorch.spec_augment(mel, time_warping_para=80,
                                                frequency_masking_para=54,
                                                time_masking_para=50,
                                                frequency_mask_num=1,
                                                time_mask_num=1)

    feat = mel.view(mel.shape[1], mel.shape[2])  # squeeze back to [frequency, time]
    feat = feat.transpose(0, 1).clone().detach()

    plt.subplot(1, 2, 2)
    plt.imshow(feat, cmap='jet')
    plt.show()  # display it

    del stft, amag, mel
    return feat
def forward(self, x):
    """
    input:
    ------
      x: tensor(batch, length), where length is waveform length

    output:
    -------
      lfcc_output: tensor(batch, frame_num, dim_num)
    """
    # pre-emphasis
    if self.with_emphasis:
        x[:, 1:] = x[:, 1:] - 0.97 * x[:, 0:-1]

    # STFT
    x_stft = torch.stft(x, self.fn, self.fs, self.fl,
                        window=torch.hamming_window(self.fl).to(x.device),
                        onesided=True, pad_mode="constant")
    # amplitude
    sp_amp = torch.norm(x_stft, 2, -1).pow(2).permute(0, 2, 1).contiguous()

    # filter bank
    fb_feature = torch.log10(torch.matmul(sp_amp, self.lfcc_fb) +
                             torch.finfo(torch.float32).eps)

    # DCT (if necessary, remove DCT)
    lfcc = self.l_dct(fb_feature) if not self.flag_for_LFB else fb_feature

    # Add energy
    if self.with_energy:
        power_spec = sp_amp / self.fn
        energy = torch.log10(power_spec.sum(axis=2) +
                             torch.finfo(torch.float32).eps)
        lfcc[:, :, 0] = energy

    # Add delta coefficients
    if self.with_delta:
        lfcc_delta = delta(lfcc)
        lfcc_delta_delta = delta(lfcc_delta)
        lfcc_output = torch.cat((lfcc, lfcc_delta, lfcc_delta_delta), 2)
    else:
        lfcc_output = lfcc

    # done
    return lfcc_output
def test_istft_is_inverse_of_stft5(self):
    # hamming_window, not centered, not normalized, not onesided
    # window same size as n_fft
    kwargs5 = {
        'n_fft': 3,
        'hop_length': 2,
        'win_length': 3,
        'window': torch.hamming_window(3),
        'center': False,
        'pad_mode': 'reflect',
        'normalized': False,
        'onesided': False,
    }
    _test_istft_is_inverse_of_stft(kwargs5)
def __init__(self, scale, taps, samplerate):
    super(SincLayer, self).__init__()
    self.samplerate = int(samplerate)
    self.taps = taps
    self.scale = scale

    # each filter requires two parameters to define the filter bandwidth
    filter_parameters = torch.FloatTensor(len(scale), 2)

    self.linear = nn.Parameter(
        torch.linspace(-math.pi, math.pi, steps=taps), requires_grad=False)
    self.window = nn.Parameter(
        torch.hamming_window(self.taps), requires_grad=False)

    for i, band in enumerate(scale):
        start = self.samplerate / band.start_hz
        stop = self.samplerate / band.stop_hz
        filter_parameters[i, 0] = start
        filter_parameters[i, 1] = stop

    self.filter_parameters = nn.Parameter(filter_parameters)